## Import necessary libraries

In [97]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

## Load and explore the data

In [60]:
test_data = pd.read_csv('../data/test.csv')

test_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [62]:
test_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [64]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [66]:
missing_values = test_data.isnull().sum()

print(missing_values[missing_values>0])

MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType       894
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish      78
GarageCars         1
GarageArea         1
GarageQual        78
GarageCond        78
PoolQC          1456
Fence           1169
MiscFeature     1408
SaleType           1
dtype: int64


## Data Preprocessing

### Handling missing values

In [68]:
# Drop columns with too many missing values

test_data = test_data.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'])

In [70]:
test_data['LotFrontage'] = test_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
test_data['MasVnrType'].fillna("None", inplace=True)
test_data['MasVnrArea'].fillna(0, inplace=True)
test_data['BsmtQual'].fillna("None", inplace=True)
test_data['BsmtCond'].fillna("None", inplace=True)
test_data['BsmtExposure'].fillna("None", inplace=True)
test_data['BsmtFinType1'].fillna("None", inplace=True)
test_data['BsmtFinType2'].fillna("None", inplace=True)
test_data['Electrical'].fillna(test_data['Electrical'].mode()[0], inplace=True)
test_data['FireplaceQu'].fillna("None", inplace=True)
test_data['GarageType'].fillna("None", inplace=True)
test_data['GarageYrBlt'].fillna(0, inplace=True)
test_data['GarageFinish'].fillna("None", inplace=True)
test_data['GarageQual'].fillna("None", inplace=True)
test_data['GarageCond'].fillna("None", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['MasVnrType'].fillna("None", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['MasVnrArea'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

In [74]:
missing_values = test_data.isnull().sum()

print(missing_values[missing_values>0])

MSZoning        4
Utilities       2
Exterior1st     1
Exterior2nd     1
BsmtFinSF1      1
BsmtFinSF2      1
BsmtUnfSF       1
TotalBsmtSF     1
BsmtFullBath    2
BsmtHalfBath    2
KitchenQual     1
Functional      2
GarageCars      1
GarageArea      1
SaleType        1
dtype: int64


In [76]:
# Filling categorical variables with mode (most frequent value)
test_data['MSZoning'].fillna(test_data['MSZoning'].mode()[0], inplace=True)
test_data['Utilities'].fillna(test_data['Utilities'].mode()[0], inplace=True)
test_data['Exterior1st'].fillna(test_data['Exterior1st'].mode()[0], inplace=True)
test_data['Exterior2nd'].fillna(test_data['Exterior2nd'].mode()[0], inplace=True)
test_data['KitchenQual'].fillna(test_data['KitchenQual'].mode()[0], inplace=True)
test_data['Functional'].fillna(test_data['Functional'].mode()[0], inplace=True)
test_data['SaleType'].fillna(test_data['SaleType'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['MSZoning'].fillna(test_data['MSZoning'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Utilities'].fillna(test_data['Utilities'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work be

In [78]:
# Filling basement-related and garage-related numerical features with 0
test_data['BsmtFinSF1'].fillna(0, inplace=True)
test_data['BsmtFinSF2'].fillna(0, inplace=True)
test_data['BsmtUnfSF'].fillna(0, inplace=True)
test_data['TotalBsmtSF'].fillna(0, inplace=True)
test_data['BsmtFullBath'].fillna(0, inplace=True)
test_data['BsmtHalfBath'].fillna(0, inplace=True)
test_data['GarageCars'].fillna(0, inplace=True)
test_data['GarageArea'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['BsmtFinSF1'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['BsmtFinSF2'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting valu

In [80]:
missing_values = test_data.isnull().sum()

print(missing_values[missing_values>0])

Series([], dtype: int64)


In [84]:
print(test_data.isnull().sum().sum()) 

0


In [86]:
# save the cleaned test data
test_data.to_csv('../data/cleaned_test_data.csv', index=False)

### Encoding categorical data

In [89]:
# Create a dictionary to map ordinal features
quality_mapping = {
    'Ex': 5,  # Excellent
    'Gd': 4,  # Good
    'TA': 3,  # Typical/Average
    'Fa': 2,  # Fair
    'Po': 1   # Poor
}

# Apply the mapping manually
test_data['ExterQual'] = test_data['ExterQual'].map(quality_mapping)
test_data['BsmtQual'] = test_data['BsmtQual'].map(quality_mapping)
test_data['KitchenQual'] = test_data['KitchenQual'].map(quality_mapping)
test_data['FireplaceQu'] = test_data['FireplaceQu'].map(quality_mapping)
test_data['GarageQual'] = test_data['GarageQual'].map(quality_mapping)
test_data['GarageCond'] = test_data['GarageCond'].map(quality_mapping)
test_data['BsmtCond'] = test_data['BsmtCond'].map(quality_mapping)

In [91]:
# Apply one-hot encoding to categorical variables
categorical_columns = test_data.select_dtypes(include=['object']).columns
test_data = pd.get_dummies(test_data, columns=categorical_columns, drop_first=True)

### Create new features as per the training set

In [94]:
test_data['TotalSF'] = test_data['1stFlrSF'] + test_data['2ndFlrSF'] + test_data['TotalBsmtSF']
test_data['TotalBath'] = test_data['FullBath'] + (0.5 * test_data['HalfBath']) + test_data['BsmtFullBath'] + (0.5 * test_data['BsmtHalfBath'])
test_data['HouseAge'] = test_data['YrSold'] - test_data['YearBuilt']
test_data['Remodeled'] = (test_data['YearRemodAdd'] != test_data['YearBuilt']).astype(int)

### Feature scaling

In [99]:
# Apply standard scaling to numeric features
scaler = StandardScaler()
numeric_features = ['TotalSF', 'GrLivArea', 'LotArea', 'TotalBsmtSF', 'OverallQual', 'HouseAge', 'TotalBath']
test_data[numeric_features] = scaler.fit_transform(test_data[numeric_features])

In [101]:
# save the cleaned, encoded, scaled
test_data.to_csv('../data/cleaned_encoded_scaled_test_data.csv', index=False)