In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Getting to know the data and processing it

In [39]:
housing_data = pd.read_csv("home-data-for-ml-course/train.csv")
test_data = pd.read_csv("home-data-for-ml-course/test.csv")
sample_sub = pd.read_csv("home-data-for-ml-course/sample_submission.csv")
print(housing_data.shape)
print(test_data.shape)
print(sample_sub.shape)

(1460, 81)
(1459, 80)
(1459, 2)


In [40]:
# training data columns & data
print("Training data columns:", housing_data.columns.tolist())
housing_data.head()

Training data columns: ['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [41]:
# sample submission data columns & data
print("Sample submission columns:", sample_sub.columns.tolist())
sample_sub.head()

Sample submission columns: ['Id', 'SalePrice']


Unnamed: 0,Id,SalePrice
0,1461,169277.052498
1,1462,187758.393989
2,1463,183583.68357
3,1464,179317.477511
4,1465,150730.079977


### We can conclude that our submission should be a csv of 2 columns (id + saleprice) the saleprice is also a float .6f

In [42]:
# data types + missing values
print(housing_data.dtypes)
housing_data.head(4)

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000


In [43]:
na_counts = housing_data.isna().sum()
columnsWithNa = na_counts[na_counts > 0]
print(columnsWithNa.sort_values(ascending=False))

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtFinType2      38
BsmtExposure      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
Electrical         1
dtype: int64


In [44]:
# dropping columns with too many missing values
housing_data = housing_data.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu'], axis=1)

In [45]:
#checking for values before imputing
housing_data.head(4)
housing_data['GarageType']

0       Attchd
1       Attchd
2       Attchd
3       Detchd
4       Attchd
         ...  
1455    Attchd
1456    Attchd
1457    Attchd
1458    Attchd
1459    Attchd
Name: GarageType, Length: 1460, dtype: object

In [46]:
# imputing missing values
housing_data['LotFrontage'] = housing_data['LotFrontage'].fillna(housing_data['LotFrontage'].median())


In [47]:
# GarageType
housing_data['GarageType'].nunique() # 6
housing_data['GarageType'].unique() # Attchd, Detchd, BuiltIn, Basment, CarPort, 2Types, NA
housing_data['GarageType'].value_counts()
mode_value = housing_data['GarageType'].mode()[0]
housing_data['GarageType'] = housing_data['GarageType'].fillna(mode_value)
housing_data['GarageType'] = housing_data['GarageType'].map({'Attchd':1, 'Detchd':2, 'BuiltIn':3, 'Basment':4, 'CarPort':5, '2Types':6})


In [48]:
# GarageYrBlt
housing_data['GarageYrBlt'].nunique() # 97
housing_data['GarageYrBlt'].unique()
housing_data['GarageYrBlt'].value_counts()
housing_data['GarageYrBlt'] = housing_data['GarageYrBlt'].fillna(housing_data['GarageYrBlt'].median())

In [49]:
# GarageFinish
housing_data['GarageFinish'].nunique() # 3
housing_data['GarageFinish'].unique()
housing_data['GarageFinish'].value_counts()
housing_data['GarageFinish'] = housing_data['GarageFinish'].fillna(housing_data['GarageFinish'].mode()[0])
housing_data['GarageFinish'] = housing_data['GarageFinish'].map({'Unf':1, 'RFn':2, 'Fin':3})


In [50]:
# GarageQual and GarageCond
quality_map = {'NA': 0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
housing_data['GarageScore'] = (0.6*housing_data['GarageQual'].map(quality_map) + 0.4*housing_data['GarageCond'].map(quality_map))
housing_data['GarageScore'] = housing_data['GarageScore'].fillna(0)
housing_data = housing_data.drop(['GarageQual', 'GarageCond'], axis=1)

In [51]:
na_counts = housing_data.isna().sum()
columnsWithNa = na_counts[na_counts > 0]
print(columnsWithNa.sort_values(ascending=False))

BsmtExposure    38
BsmtFinType2    38
BsmtQual        37
BsmtCond        37
BsmtFinType1    37
MasVnrArea       8
Electrical       1
dtype: int64


In [52]:
housing_data['Electrical'].value_counts()
housing_data['Electrical'] = housing_data['Electrical'].fillna(housing_data['Electrical'].mode()[0])
mapping = {'SBrkr':1, 'FuseA':2, 'FuseF':3, 'FuseP':4, 'Mix':5}
housing_data['Electrical'] = housing_data['Electrical'].map(mapping)
housing_data["Electrical"]

0       1
1       1
2       1
3       1
4       1
       ..
1455    1
1456    1
1457    1
1458    2
1459    1
Name: Electrical, Length: 1460, dtype: int64

In [53]:
na_counts = housing_data.isna().sum()
columnsWithNa = na_counts[na_counts > 0]
print(columnsWithNa.sort_values(ascending=False))

BsmtExposure    38
BsmtFinType2    38
BsmtQual        37
BsmtCond        37
BsmtFinType1    37
MasVnrArea       8
dtype: int64


In [54]:
housing_data['MasVnrArea'] = housing_data['MasVnrArea'].fillna(housing_data['MasVnrArea'].median())


In [55]:
# mapping of basement features
bsmt_qual_map = {
    'Ex': 5,  
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1
}
bsmt_cond_map = {
    'Ex': 5,  
    'Gd': 4,  
    'TA': 3,  
    'Fa': 2,  
    'Po': 1   
}
bsmt_exposure_map = {
    'Gd': 4,
    'Av': 3,
    'Mn': 2,
    'No': 1
}
bsmt_fintype_map = {
    'GLQ': 6,  
    'ALQ': 5,
    'BLQ': 4,
    'Rec': 3,
    'LwQ': 2,
    'Unf': 1
}
housing_data["BsmtQual"] = housing_data["BsmtQual"].map(bsmt_qual_map)
housing_data["BsmtCond"] = housing_data["BsmtCond"].map(bsmt_cond_map)
housing_data["BsmtExposure"] = housing_data["BsmtExposure"].map(bsmt_exposure_map)
housing_data["BsmtFinType1"] = housing_data["BsmtFinType1"].map(bsmt_fintype_map)
housing_data["BsmtFinType2"] = housing_data["BsmtFinType2"].map(bsmt_fintype_map)
housing_data['BsmtScore'] = (
    housing_data['BsmtQual']*0.3 + housing_data['BsmtCond']*0.25 + housing_data['BsmtExposure']*0.1 +
    housing_data['BsmtFinType1']*0.3 + housing_data['BsmtFinType2']*0.05
)

In [56]:
na_counts = housing_data.isna().sum()
columnsWithNa = na_counts[na_counts > 0]
print(columnsWithNa.sort_values(ascending=False))

BsmtScore       39
BsmtExposure    38
BsmtFinType2    38
BsmtQual        37
BsmtCond        37
BsmtFinType1    37
dtype: int64


In [57]:
housing_data['BsmtScore'] = housing_data['BsmtScore'].fillna(0)
housing_data = housing_data.drop(['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'], axis=1)

In [58]:
na_counts = housing_data.isna().sum()
columnsWithNa = na_counts[na_counts > 0]
print(columnsWithNa.sort_values(ascending=False))

Series([], dtype: int64)


In [59]:
housing_data.head(4)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,GarageScore,BsmtScore
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,2,2008,WD,Normal,208500,3.0,3.9
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,5,2007,WD,Normal,181500,3.0,3.9
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,9,2008,WD,Normal,223500,3.0,4.0
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,2,2006,WD,Abnorml,140000,3.0,3.55


In [60]:
housing_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,GarageScore,BsmtScore
0,1,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,2,2008,WD,Normal,208500,3.0,3.9
1,2,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,5,2007,WD,Normal,181500,3.0,3.9
2,3,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,9,2008,WD,Normal,223500,3.0,4.0
3,4,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,0,0,0,2,2006,WD,Abnorml,140000,3.0,3.55
4,5,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,12,2008,WD,Normal,250000,3.0,4.1


In [61]:
# Get columns that are not int or float types
categorical_columns = housing_data.select_dtypes(exclude=['int64', 'float64']).dtypes
print(categorical_columns)

MSZoning         object
Street           object
LotShape         object
LandContour      object
Utilities        object
LotConfig        object
LandSlope        object
Neighborhood     object
Condition1       object
Condition2       object
BldgType         object
HouseStyle       object
RoofStyle        object
RoofMatl         object
Exterior1st      object
Exterior2nd      object
ExterQual        object
ExterCond        object
Foundation       object
Heating          object
HeatingQC        object
CentralAir       object
KitchenQual      object
Functional       object
PavedDrive       object
SaleType         object
SaleCondition    object
dtype: object


In [62]:
# ordinal encoding for quality/ordered features
ordinal_mappings = {
    'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
        'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},

        'Street': {'Grvl': 0, 'Pave': 1},
        'CentralAir': {'N': 0, 'Y': 1},
        'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
        
        'Utilities': {'ELO': 1, 'NoSeWa': 2, 'NoSewr': 3, 'AllPub': 4},
        
        'LotShape': {'IR3': 1, 'IR2': 2, 'IR1': 3, 'Reg': 4},

        'LandSlope': {'Sev': 1, 'Mod': 2, 'Gtl': 3},
        
        'Functional': {
            'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4,
            'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8
        }
}
for col, mapping in ordinal_mappings.items():
    if col in housing_data.columns:
        housing_data[col] = housing_data[col].map(mapping)
        

In [63]:
# frequency encoding for Neighborhood
frequency_encode = housing_data['Neighborhood']
for col in frequency_encode:
    if col in housing_data.columns:
        freq_map = housing_data[col].value_counts(normalize=True).to_dict()
        housing_data[col + '_FreqEnc'] = housing_data[col].map(freq_map)
        
housing_data.drop('Neighborhood', axis=1, inplace=True)


In [64]:
# one hot encoding for other nominal categorical features
nominal_features = [
    'MSZoning',
    'LandContour',
    'LotConfig',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'Foundation',
    'Heating',
    'SaleType',
    'SaleCondition'
]
nominal_features = [col for col in nominal_features if col in housing_data.columns]
housing_data = pd.get_dummies(housing_data, columns=nominal_features, drop_first=True)

In [65]:
categorical_columns = housing_data.select_dtypes(exclude=['int64', 'float64']).dtypes
print(categorical_columns)

MSZoning_FV              bool
MSZoning_RH              bool
MSZoning_RL              bool
MSZoning_RM              bool
LandContour_HLS          bool
                         ... 
SaleCondition_AdjLand    bool
SaleCondition_Alloca     bool
SaleCondition_Family     bool
SaleCondition_Normal     bool
SaleCondition_Partial    bool
Length: 101, dtype: object


In [66]:
housing_data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,LotShape,Utilities,LandSlope,OverallQual,OverallCond,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,1,4,4,3,7,5,...,False,False,False,False,True,False,False,False,True,False
1,2,20,80.0,9600,1,4,4,3,6,8,...,False,False,False,False,True,False,False,False,True,False
2,3,60,68.0,11250,1,3,4,3,7,5,...,False,False,False,False,True,False,False,False,True,False
3,4,70,60.0,9550,1,3,4,3,7,5,...,False,False,False,False,True,False,False,False,False,False
4,5,60,84.0,14260,1,3,4,3,8,5,...,False,False,False,False,True,False,False,False,True,False


In [67]:
housing_data = housing_data.astype({col: int for col in housing_data.select_dtypes(include=['bool']).columns})

In [68]:
housing_data.dtypes


Id                         int64
MSSubClass                 int64
LotFrontage              float64
LotArea                    int64
Street                     int64
                          ...   
SaleCondition_AdjLand      int64
SaleCondition_Alloca       int64
SaleCondition_Family       int64
SaleCondition_Normal       int64
SaleCondition_Partial      int64
Length: 155, dtype: object

In [69]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt


In [70]:
X = housing_data.drop(['SalePrice'], axis=1)
Y = housing_data['SalePrice']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [71]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [72]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, Y_train)
Y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, Y_pred)
mae = mean_absolute_error(Y_test, Y_pred)
print(f"RMSE: {rmse}")
print(f"R^2: {r2}")
print(f"MAE: {mae}")

RMSE: 28810.44955935171
R^2: 0.8917853059451114
MAE: 17474.593253424657


In [80]:
# Load test data FRESH (important!)
test_data = pd.read_csv("home-data-for-ml-course/test.csv")
test_ids = test_data['Id'].copy()

# Drop same columns as training
test_data = test_data.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu'], axis=1, errors='ignore')

# Impute LotFrontage
test_data['LotFrontage'] = test_data['LotFrontage'].fillna(test_data['LotFrontage'].median())

# Impute MasVnrArea
test_data['MasVnrArea'] = test_data['MasVnrArea'].fillna(test_data['MasVnrArea'].median())

# GarageType
mode_value = test_data['GarageType'].mode()[0]
test_data['GarageType'] = test_data['GarageType'].fillna(mode_value)
test_data['GarageType'] = test_data['GarageType'].map({'Attchd':1, 'Detchd':2, 'BuiltIn':3, 'Basment':4, 'CarPort':5, '2Types':6})
test_data['GarageType'] = test_data['GarageType'].fillna(1)

# GarageYrBlt
test_data['GarageYrBlt'] = test_data['GarageYrBlt'].fillna(test_data['GarageYrBlt'].median())

# GarageFinish
mode_value = test_data['GarageFinish'].mode()[0]
test_data['GarageFinish'] = test_data['GarageFinish'].fillna(mode_value)
test_data['GarageFinish'] = test_data['GarageFinish'].map({'Unf':1, 'RFn':2, 'Fin':3})
test_data['GarageFinish'] = test_data['GarageFinish'].fillna(1)

# GarageScore (using EXACT same mapping as training)
quality_map = {'NA': 0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}
test_data['GarageScore'] = (0.6*test_data['GarageQual'].map(quality_map) + 0.4*test_data['GarageCond'].map(quality_map))
test_data['GarageScore'] = test_data['GarageScore'].fillna(0)
test_data = test_data.drop(['GarageQual', 'GarageCond'], axis=1)

# Electrical
test_data['Electrical'] = test_data['Electrical'].fillna(test_data['Electrical'].mode()[0])
mapping = {'SBrkr':1, 'FuseA':2, 'FuseF':3, 'FuseP':4, 'Mix':5}
test_data['Electrical'] = test_data['Electrical'].map(mapping)
test_data['Electrical'] = test_data['Electrical'].fillna(1)

# Basement features
bsmt_qual_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
bsmt_cond_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
bsmt_exposure_map = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1}
bsmt_fintype_map = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1}

test_data["BsmtQual"] = test_data["BsmtQual"].map(bsmt_qual_map)
test_data["BsmtCond"] = test_data["BsmtCond"].map(bsmt_cond_map)
test_data["BsmtExposure"] = test_data["BsmtExposure"].map(bsmt_exposure_map)
test_data["BsmtFinType1"] = test_data["BsmtFinType1"].map(bsmt_fintype_map)
test_data["BsmtFinType2"] = test_data["BsmtFinType2"].map(bsmt_fintype_map)

test_data['BsmtScore'] = (
    test_data['BsmtQual']*0.3 + test_data['BsmtCond']*0.25 + test_data['BsmtExposure']*0.1 +
    test_data['BsmtFinType1']*0.3 + test_data['BsmtFinType2']*0.05
)
test_data['BsmtScore'] = test_data['BsmtScore'].fillna(0)
test_data = test_data.drop(['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'], axis=1)

# Ordinal encoding for quality/ordered features
ordinal_mappings = {
    'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'Street': {'Grvl': 0, 'Pave': 1},
    'CentralAir': {'N': 0, 'Y': 1},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'Utilities': {'ELO': 1, 'NoSeWa': 2, 'NoSewr': 3, 'AllPub': 4},
    'LotShape': {'IR3': 1, 'IR2': 2, 'IR1': 3, 'Reg': 4},
    'LandSlope': {'Sev': 1, 'Mod': 2, 'Gtl': 3},
    'Functional': {
        'Sal': 1, 'Sev': 2, 'Maj2': 3, 'Maj1': 4,
        'Mod': 5, 'Min2': 6, 'Min1': 7, 'Typ': 8
    }
}

for col, mapping in ordinal_mappings.items():
    if col in test_data.columns:
        test_data[col] = test_data[col].map(mapping)
        # Fill NaNs that result from mapping
        if test_data[col].isna().sum() > 0:
            test_data[col] = test_data[col].fillna(test_data[col].median())

# Fill missing values for categorical columns that will be one-hot encoded
for col in ['MSZoning', 'Exterior1st', 'Exterior2nd', 'SaleType']:
    if col in test_data.columns and test_data[col].isna().sum() > 0:
        test_data[col] = test_data[col].fillna(test_data[col].mode()[0])

# Fill any remaining numeric NaNs
numeric_cols = test_data.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if test_data[col].isna().sum() > 0:
        test_data[col] = test_data[col].fillna(test_data[col].median())

# Frequency encoding for Neighborhood
frequency_encode = test_data['Neighborhood']
for col in frequency_encode:
    if col in test_data.columns:
        freq_map = test_data[col].value_counts(normalize=True).to_dict()
        test_data[col + '_FreqEnc'] = test_data[col].map(freq_map)
        
test_data.drop('Neighborhood', axis=1, inplace=True)

# One hot encoding for other nominal categorical features
nominal_features = [
    'MSZoning',
    'LandContour',
    'LotConfig',
    'Condition1',
    'Condition2',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'RoofMatl',
    'Exterior1st',
    'Exterior2nd',
    'Foundation',
    'Heating',
    'SaleType',
    'SaleCondition'
]

nominal_features = [col for col in nominal_features if col in test_data.columns]
test_data = pd.get_dummies(test_data, columns=nominal_features, drop_first=True)

# Convert bool to int
test_data = test_data.astype({col: int for col in test_data.select_dtypes(include=['bool']).columns})

# CRITICAL: Align test_data columns with training data columns
# Get the training columns (excluding SalePrice and Id)
train_cols = [col for col in housing_data.columns if col not in ['SalePrice', 'Id']]

# Add missing columns to test_data (fill with 0)
for col in train_cols:
    if col not in test_data.columns:
        test_data[col] = 0

# Remove extra columns from test_data
test_data = test_data[train_cols]

# Now scale the test data using the same scaler
test_data_scaled = scaler.transform(test_data)

# Make predictions
predictions = model.predict(test_data_scaled)

# Create submission
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': predictions
})

submission.to_csv('submission.csv', index=False)
print("Submission file created!")
print(f"Predicted prices range: ${predictions.min():.2f} to ${predictions.max():.2f}")

  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0
  test_data[col] = 0


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Id
