**Importing Required libraries**

In [52]:
# Importing necessary libraries for model building and preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


**Loading Dataset**

In [53]:
# File paths
train_path = r"D:\Repos\ML_Projects\House Price Predictor\Data\raw\train.csv"
test_path = r"D:\Repos\ML_Projects\House Price Predictor\Data\raw\test.csv"

# Load datasets
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Display dataset shape
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")


Train shape: (1460, 81)
Test shape: (1459, 80)


**Checking Missing Values**

In [54]:
# Check missing values in train and test datasets
missing_train = train.isnull().sum().sort_values(ascending=False)
missing_test = test.isnull().sum().sort_values(ascending=False)

print("Missing values in train dataset:")
display(missing_train[missing_train > 0])

print("\nMissing values in test dataset:")
display(missing_test[missing_test > 0])


Missing values in train dataset:


PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
dtype: int64


Missing values in test dataset:


PoolQC          1456
MiscFeature     1408
Alley           1352
Fence           1169
MasVnrType       894
FireplaceQu      730
LotFrontage      227
GarageYrBlt       78
GarageQual        78
GarageFinish      78
GarageCond        78
GarageType        76
BsmtCond          45
BsmtQual          44
BsmtExposure      44
BsmtFinType1      42
BsmtFinType2      42
MasVnrArea        15
MSZoning           4
BsmtHalfBath       2
Utilities          2
Functional         2
BsmtFullBath       2
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
KitchenQual        1
TotalBsmtSF        1
Exterior2nd        1
GarageCars         1
Exterior1st        1
GarageArea         1
SaleType           1
dtype: int64

**Handling Missing Values**

In [55]:
# Assuming train and test datasets are already loaded
# Create a list of columns with missing values
missing_data = train.isnull().sum()

# Step 1: Drop columns with more than 30% missing values
threshold = 0.3
columns_to_drop = missing_data[missing_data / len(train) > threshold].index
train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)

# Step 2: Impute missing numerical values with the mean
numerical_cols = train.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_cols:
    if train[col].isnull().sum() > 0:
        train[col].fillna(train[col].mean(), inplace=True)
        test[col].fillna(test[col].mean(), inplace=True)

# Step 3: Impute missing categorical values with the mode
categorical_cols = train.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if train[col].isnull().sum() > 0:
        train[col].fillna(train[col].mode()[0], inplace=True)
        test[col].fillna(test[col].mode()[0], inplace=True)

# Special handling for specific columns
train['GarageType'].fillna('No Garage', inplace=True)
test['GarageType'].fillna('No Garage', inplace=True)

train['GarageFinish'].fillna('No Garage', inplace=True)
test['GarageFinish'].fillna('No Garage', inplace=True)

train['BsmtQual'].fillna('No Basement', inplace=True)
test['BsmtQual'].fillna('No Basement', inplace=True)

train['BsmtCond'].fillna('No Basement', inplace=True)
test['BsmtCond'].fillna('No Basement', inplace=True)

train['BsmtExposure'].fillna('No Basement', inplace=True)
test['BsmtExposure'].fillna('No Basement', inplace=True)

train['BsmtFinType1'].fillna('No Basement', inplace=True)
test['BsmtFinType1'].fillna('No Basement', inplace=True)

train['BsmtFinType2'].fillna('No Basement', inplace=True)
test['BsmtFinType2'].fillna('No Basement', inplace=True)




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

In [56]:
# Verify no missing values remain
print("Missing values in train set after imputation:\n", train.isnull().sum())
print("Missing values in test set after imputation:\n", test.isnull().sum())

Missing values in train set after imputation:
 Id               0
MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 75, dtype: int64
Missing values in test set after imputation:
 Id               0
MSSubClass       0
MSZoning         4
LotFrontage      0
LotArea          0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         1
SaleCondition    0
Length: 74, dtype: int64


**Selecting Targeted Values**

In [58]:
# Target variable (SalePrice)
X = train.drop(columns=['SalePrice'])  
y = train['SalePrice']  


In [60]:
# Display dataset sh
X.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,61,0,0,0,0,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,42,0,0,0,0,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,35,272,0,0,0,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,84,0,0,0,0,0,12,2008,WD,Normal


In [61]:
y.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

**Splitting Dataset**

In [62]:
# Step 2: Split the data into train and validation sets (80-20 split)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

**Identifying Numerical & Categorical COlumns**

In [63]:
# Step 3: Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns  # Identifying categorical columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns  # Identifying numerical columns

# Printing the columns to verify
print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)


Categorical Columns: Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'SaleType', 'SaleCondition'],
      dtype='object')
Numerical Columns: Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       