In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
import joblib
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
# Load data
df = pd.read_csv("Bengaluru_House_Data.csv")

# Drop unnecessary columns
df.drop(["area_type", "society", "availability"], axis=1, inplace=True)
df.dropna(inplace=True)

# Extract BHK from "size"
df["BHK"] = df["size"].apply(lambda x: int(x.split(' ')[0]))

# Convert total_sqft to float
def convert_sqft(x):
    try:
        if '-' in x:
            a, b = map(float, x.split('-'))
            return (a + b) / 2
        return float(x)
    except:
        return np.nan

df["total_sqft"] = df["total_sqft"].apply(convert_sqft)
df.dropna(subset=["total_sqft"], inplace=True)

# Price per sqft
df["price_per_sqft"] = df["price"] * 100000 / df["total_sqft"]

# Remove location outliers
df["location"] = df["location"].str.strip()
location_stats = df["location"].value_counts()
minor_locations = location_stats[location_stats <= 10].index
df["location"] = df["location"].apply(lambda x: "other" if x in minor_locations else x)
df = df[df["location"] != "other"]

# Remove price per sqft outliers
def remove_pps_outliers(df):
    out_df = pd.DataFrame()
    for key, subdf in df.groupby("location"):
        m = subdf["price_per_sqft"].mean()
        st = subdf["price_per_sqft"].std()
        reduced_df = subdf[(subdf["price_per_sqft"] > (m - st)) & (subdf["price_per_sqft"] <= (m + st))]
        out_df = pd.concat([out_df, reduced_df], ignore_index=True)
    return out_df

df = remove_pps_outliers(df)

# Remove BHK outliers
def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('BHK'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('BHK'):
            stats = bhk_stats.get(bhk)
            if stats and stats['count'] > 5:
                bad_index = bhk_df[
                    (bhk_df.price_per_sqft < (stats['mean'] - stats['std'])) |
                    (bhk_df.price_per_sqft > (stats['mean'] + stats['std']))
                ].index
                exclude_indices = np.concatenate((exclude_indices, bad_index))
    return df.drop(exclude_indices, axis=0)

df = remove_bhk_outliers(df)

# Additional filters
df = df[(df["total_sqft"] / df["BHK"]) >= 300]
df = df[df["bath"] < df["BHK"] + 2]

# Drop unnecessary columns
df.drop(["size", "price_per_sqft"], axis=1, inplace=True)


In [2]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

def encode_location(location: str):
    inputs = tokenizer(location, return_tensors="pt")
    outputs = model(**inputs)
    # Dùng embedding của [CLS] token đại diện cho location
    cls_embedding = outputs.last_hidden_state[:, 0, :].detach().numpy()
    return cls_embedding.flatten()

location_embeddings = []
for loc in tqdm(df["location"], desc="Encoding locations with BERT"):
    vec = encode_location(loc)
    location_embeddings.append(vec)

location_embeddings = np.array(location_embeddings)
location_df = pd.DataFrame(location_embeddings, columns=[f"loc_emb_{i}" for i in range(location_embeddings.shape[1])])
# Fill NaNs in balcony (if any)
df["balcony"] = df["balcony"].fillna(0)

# Features and labels
X_tabular = df[["total_sqft", "bath", "balcony", "BHK"]].reset_index(drop=True)
X = pd.concat([X_tabular, location_df], axis=1)
y = df["price"].reset_index(drop=True)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train ElasticNet model
model = LinearRegression()
model.fit(X_train, y_train)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:   2%|2         | 10.5M/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:  14%|#4        | 62.9M/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:  31%|###       | 136M/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:  31%|###       | 136M/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:  31%|###       | 136M/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:  33%|###3      | 147M/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:  36%|###5      | 157M/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:  38%|###8      | 168M/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:  40%|####      | 178M/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:  48%|####7     | 210M/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:  48%|####7     | 210M/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:  48%|####7     | 210M/440M [00:00<?, ?B/s]

Error while downloading from https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdc036468d709f174338/44172b48ce7a9d951000a9d76ae331b155df32c9fd4e93239a233d720f471725?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250514%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250514T162012Z&X-Amz-Expires=3600&X-Amz-Signature=dc2a650c10401ce35b95027bc72fe67077ecc138d44c9e26b4e66ca1d159405e&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=66707afb938266072c9f6bfb&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&x-id=GetObject&Expires=1747243212&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzI0MzIxMn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2FzLWJyaWRnZS54ZXRodWIuaGYuY28veGV0LWJyaWRnZS11cy82MjFmZmRjMDM2NDY4ZDcwOWYxNzQzMzgvNDQxNzJiNDhjZTdhOWQ5NTEwMDBhOWQ3NmFlMzMxYjE1NWRmMzJjOWZkNGU5MzIzOWEyMzNkNzIwZjQ3MTcyNSoifV19&Signature=ok7KNQX6%7EJ5SE3ZlL%7E5biFvKh5-HPni

model.safetensors:  52%|#####2    | 231M/440M [00:00<?, ?B/s]

Encoding locations with BERT: 100%|██████████| 5694/5694 [03:15<00:00, 29.19it/s]


In [3]:
# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R^2:", r2_score(y_test, y_pred))


RMSE: 39.07194988607063
MAE: 13.590984081774033
R^2: 0.8162954788576631


In [4]:
# Lưu mô hình
joblib.dump(model, "house_price_model2.pkl")

# # Nếu bạn cần lưu cả one-hot column info để sau này dùng lại:
# joblib.dump(list(X.columns), "model_features.pkl")

['house_price_model2.pkl']

In [5]:
import pickle
# Save unique locations used for training
unique_locations = sorted(df["location"].unique().tolist())

# Save to file
with open("trained_locations.pkl", "wb") as f:
    pickle.dump(unique_locations, f)