In [1]:
# Task 8 : Improving House Price Prediction using Feature Engineering

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer


In [None]:
# 1: Load Dataset

In [3]:
df = pd.read_csv("train.csv")


Dataset Shape: (1460, 81)


In [4]:
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [5]:
 df.shape

(1460, 81)

In [None]:
# 2: Handle Missing Values

In [6]:
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

In [8]:
imputer_num = SimpleImputer(strategy="median")
df[num_cols] = imputer_num.fit_transform(df[num_cols])

In [9]:
imputer_cat = SimpleImputer(strategy="most_frequent")
df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

In [None]:
# 3: Log Transform Skewed Target Column

In [10]:
df["SalePrice"] = np.log1p(df["SalePrice"])


In [None]:
# 4: Encoding

In [11]:
ordinal_cols = [
    "ExterQual", "ExterCond",
    "BsmtQual", "BsmtCond",
    "KitchenQual", "HeatingQC"
]

label_encoder = LabelEncoder()
for col in ordinal_cols:
    if col in df.columns:
        df[col] = label_encoder.fit_transform(df[col])

df = pd.get_dummies(df, drop_first=True)

print("Encoding completed. Final shape:", df.shape)

Encoding completed. Final shape: (1460, 232)


In [None]:
#  5: Train-Test Split

In [12]:
X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# 6: Model Training

In [13]:
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [None]:
# 7: Evaluation (RMSE)

In [14]:
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE:", rmse)

RMSE: 0.1484839740715989


In [None]:
# 8: Feature Importance


In [15]:
importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": model.feature_importances_
}).sort_values(by="Importance", ascending=False)

print("Top 10 Important Features:")
print(importance_df.head(10))

Top 10 Important Features:
          Feature  Importance
4     OverallQual    0.542049
21      GrLivArea    0.114746
16    TotalBsmtSF    0.043443
32     GarageCars    0.038655
33     GarageArea    0.024869
13     BsmtFinSF1    0.021005
18       1stFlrSF    0.019997
178  CentralAir_Y    0.016896
3         LotArea    0.015275
6       YearBuilt    0.013252


# 4. Key EDA Insights (Short Summary)


-These insights match typical results from the House Price dataset:

-Higher Overall Quality strongly increases house price

-Above-ground living area (GrLivArea) is highly correlated with price

-Houses with more garage space tend to be more expensive

-Log-transform on SalePrice reduces skewness

-Missing values exist in many columns (basement, garage, masonry veneer)

-Many categorical variables require one-hot encoding