In [1]:
import pandas as pd

# Load the raw house-price table from a local CSV file.
# NOTE(review): this is an absolute, machine-specific path; consider reading
# from a configurable data directory so the notebook runs elsewhere.
csv_path = r"C:\Users\palas\Downloads\house_price.csv"
df = pd.read_csv(csv_path)

# Report (rows, columns), then leave the first rows as the cell's rich output.
print("✅ Shape:", df.shape)
df.head()


✅ Shape: (1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
# Boolean mask of missing cells, built once and shared by both summaries below.
null_mask = df.isna()

# Missing-value count per column, largest first (top 15 shown).
missing = null_mask.sum().sort_values(ascending=False)
print(missing.head(15))

# Same ranking expressed as a percentage of all rows (top 15 shown).
missing_percent = null_mask.mean().mul(100).sort_values(ascending=False)
print("\nMissing % (Top 15):\n", missing_percent.head(15))


PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
dtype: int64

Missing % (Top 15):
 PoolQC          99.520548
MiscFeature     96.301370
Alley           93.767123
Fence           80.753425
MasVnrType      59.726027
FireplaceQu     47.260274
LotFrontage     17.739726
GarageYrBlt      5.547945
GarageCond       5.547945
GarageType       5.547945
GarageFinish     5.547945
GarageQual       5.547945
BsmtFinType2     2.602740
BsmtExposure     2.602740
BsmtQual         2.534247
dtype: float64


In [3]:
import numpy as np

# Build a fully imputed copy of `df`; the raw frame itself is left untouched.
# NOTE(review): every median/mode below is computed over the whole frame. If a
# train/test split is made later, these statistics also see the test rows --
# confirm that is acceptable, or move the imputation into the model pipeline.
df_clean = df.copy()

# 1) In these columns NaN actually means "no such feature" (no pool, no alley,
#    no garage, ...), so replace it with an explicit "None" category.
none_cols = [
    "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "MasVnrType"
]

for col in none_cols:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].fillna("None")

# 2) LotFrontage -> median of the same neighbourhood.
#    Only the gaps are filled; existing values are never rewritten. Rows whose
#    neighbourhood has no median stay NaN here and are handled by step 5.
if {"LotFrontage", "Neighborhood"} <= set(df_clean.columns):
    neighborhood_median = (
        df_clean.groupby("Neighborhood")["LotFrontage"].transform("median")
    )
    df_clean["LotFrontage"] = df_clean["LotFrontage"].fillna(neighborhood_median)

# 3) GarageYrBlt -> fall back to the year the house was built.
#    Both columns must exist; otherwise the gap is left for step 5.
if {"GarageYrBlt", "YearBuilt"} <= set(df_clean.columns):
    df_clean["GarageYrBlt"] = df_clean["GarageYrBlt"].fillna(df_clean["YearBuilt"])

# 4) MasVnrArea -> 0 (treated as "no veneer").
if "MasVnrArea" in df_clean.columns:
    df_clean["MasVnrArea"] = df_clean["MasVnrArea"].fillna(0)

# 5) Any remaining numeric NaN -> column median.
num_cols = df_clean.select_dtypes(include=["int64", "float64"]).columns
df_clean[num_cols] = df_clean[num_cols].fillna(df_clean[num_cols].median())

# 6) Any remaining categorical NaN -> most frequent value of the column.
cat_cols = df_clean.select_dtypes(include=["object"]).columns
for col in cat_cols:
    if not df_clean[col].isna().any():
        continue  # nothing to fill, skip the mode computation
    modes = df_clean[col].mode()
    if modes.empty:
        # Column is entirely missing: there is no mode, so leave it as-is
        # instead of failing on modes[0]; the count below will report it.
        continue
    df_clean[col] = df_clean[col].fillna(modes.iloc[0])

print("✅ Missing after cleaning:", df_clean.isnull().sum().sum())


✅ Missing after cleaning: 0


In [4]:
import numpy as np

# Outlier removal: keep only sales below 500000 (very high prices disturb the
# model, per the original note). The result is an independent copy of df_clean.
below_cap = df_clean["SalePrice"] < 500000
df2 = df_clean.loc[below_cap].copy()

# Log-transform the target with log1p (chosen because the price is skewed).
df2["SalePrice_log"] = np.log1p(df2["SalePrice"])

# Quick sanity report on what survived the filter.
sale_price = df2["SalePrice"]
print("✅ New Shape:", df2.shape)
print("✅ SalePrice min/max:", sale_price.min(), sale_price.max())


✅ New Shape: (1451, 82)
✅ SalePrice min/max: 34900 485000


In [6]:
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
import numpy as np

# Score the pipeline on the held-out rows.
# NOTE: `pipe`, `X_test` and `y_test` must already exist in the kernel; they
# are not defined in any cell visible here.
pred_log = pipe.predict(X_test)

# Errors measured directly on the model's output scale (treated as log1p of
# the price, since it is inverted with expm1 further down).
mae_log = mean_absolute_error(y_test, pred_log)
rmse_log = root_mean_squared_error(y_test, pred_log)
r2 = r2_score(y_test, pred_log)

print("✅ LOG Scale Metrics")
for label, value in (("MAE :", mae_log), ("RMSE:", rmse_log), ("R2  :", r2)):
    print(label, value)

# Undo the log1p transform so the errors read in price units.
pred_price, true_price = np.expm1(pred_log), np.expm1(y_test)

mae_price = mean_absolute_error(true_price, pred_price)
rmse_price = root_mean_squared_error(true_price, pred_price)

print("\n✅ PRICE Scale Metrics")
for label, value in (("MAE :", mae_price), ("RMSE:", rmse_price)):
    print(label, value)


✅ LOG Scale Metrics
MAE : 0.09170500497696274
RMSE: 0.13872560187354654
R2  : 0.8750900193165643

✅ PRICE Scale Metrics
MAE : 15096.506950788347
RMSE: 22249.489959123974


In [7]:
import joblib

# Persist the `pipe` object to a pickle file in the current working directory.
model_path = "house_price_pipeline.pkl"
joblib.dump(pipe, model_path)
print(f"✅ Saved: {model_path}")


✅ Saved: house_price_pipeline.pkl


In [8]:
import numpy as np
import pandas as pd
import joblib

# Reload the saved pipeline from the current working directory.
# joblib.load unpickles the file, so only point it at files you trust.
model = joblib.load("house_price_pipeline.pkl")

def predict_price(single_row_df, estimator=None):
    """Predict sale price(s) on the original price scale.

    The estimator's raw output is treated as ``log1p(price)`` and is mapped
    back to a price with ``np.expm1``.

    Parameters
    ----------
    single_row_df
        Feature rows in whatever form the estimator's ``predict`` accepts
        (the notebook passes a one-row selection of ``X_test``).
    estimator : optional
        Any object exposing ``predict``. When omitted, the module-level
        ``model`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    numpy.ndarray
        ``np.expm1`` applied element-wise to the estimator's predictions.
    """
    if estimator is None:
        # Fall back to the pipeline loaded at notebook level.
        estimator = model
    pred_log = estimator.predict(single_row_df)
    return np.expm1(pred_log)

# Demo: price the first test row. The list indexer in iloc[[0]] selects row 0
# while keeping the row dimension, which is what predict_price is given.
sample = X_test.iloc[[0]]
sample_price = predict_price(sample)[0]
print("✅ Predicted Price:", sample_price)


✅ Predicted Price: 371671.6598606587


In [9]:
# Compare the recorded value of the first test row with the model's estimate.
# y_test is inverted with expm1 here, i.e. it is treated as a log1p-scale target.
first_features = X_test.iloc[[0]]
actual = np.expm1(y_test.iloc[0])
pred = predict_price(first_features)[0]

for label, value in (("Actual :", actual), ("Pred   :", pred)):
    print(label, value)


Actual : 315000.0000000002
Pred   : 371671.6598606587
