# Ames Final Modeling

Load the dataset, begin modeling.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
from itertools import combinations

In [2]:
#housing_nick = pd.read_csv('data/NH_DF.csv')
#housing_cats = pd.read_csv('data/housing_with_cats.csv')
housing = pd.read_csv('data/housing_numerical.csv')

housing.drop('latitude',axis=1,inplace=True)
housing.drop('longitude',axis=1,inplace=True)
housing.drop('DateSold',axis=1,inplace=True)


In [3]:
#housing_nick.columns

In [4]:
# Fill NaN with 'None' for categorical columns, and fill NaN with 0 for numerical features.
housing_na = housing.copy()
for i, feature in enumerate(housing.columns):
    if housing[feature].dtype=='object':
        housing[feature]=housing[feature].fillna('None')
    else:
        h_mean = housing[feature].mean()
        housing[feature] = housing[feature].fillna(h_mean)


# RandomForestRegressor

In [5]:
X = housing.drop('SalePrice',axis=1) # Features
y = housing['SalePrice'] # Target variable

# Run the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the model
print(f'R^2 Score: {r2_score(y_test, y_pred)}\n')

# Print R^2 score for each fold
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')

for i, score in enumerate(scores, 1):
    print(f"   R^2 Score for fold {i}: {score:.4f}")
print(f"Average R^2 Score across the 5 folds: {scores.mean():.4f}")

R^2 Score: 0.9099304382867012

   R^2 Score for fold 1: 0.8919
   R^2 Score for fold 2: 0.9188
   R^2 Score for fold 3: 0.8961
   R^2 Score for fold 4: 0.9074
   R^2 Score for fold 5: 0.9134
Average R^2 Score across the 5 folds: 0.9055


# XGBoost Regressor

## Default tuning

In [6]:
X = housing_na.drop('SalePrice',axis=1) # Features
y = housing_na['SalePrice'] # Target variable

# Initialize the KFold cross-validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Iterate over the folds
r2_scores = []
fold = 1
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2_scores.append(r2_score(y_test, y_pred))
    
    print(f"Fold {fold}: R2 = {r2_score(y_test, y_pred):.4f}")
    fold += 1

mean_r2 = np.mean(r2_scores)
print(f"Mean R2: {mean_r2:.4f}")


Fold 1: R2 = 0.9102
Fold 2: R2 = 0.9323
Fold 3: R2 = 0.9054
Fold 4: R2 = 0.9119
Fold 5: R2 = 0.9031
Mean R2: 0.9126


## Hyperparameter tuning

In [7]:
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
    'gamma': [0.0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'lambda': [0, 0.5, 1],
    'alpha': [0, 0.5, 1]
}

xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42)
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X, y)
print(f"Best hyperparameters: {grid_search.best_params_}")

Fitting 3 folds for each of 2916 candidates, totalling 8748 fits
Best hyperparameters: {'alpha': 1, 'colsample_bytree': 0.6, 'gamma': 0.0, 'lambda': 0, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.8}


In [8]:
#best_params = {
#    'alpha': 1, 
#    'colsample_bytree': 0.8, 
#    'gamma': 0.0, 
#    'lambda': 0.5, 
#    'max_depth': 3, 
#    'min_child_weight': 1, 
#    'subsample': 0.8,
#    'objective': 'reg:squarederror'  # This is the default for regression tasks in XGBoost
#}

#grid_search.best_params_
model = xgb.XGBRegressor(**grid_search.best_params_, objective="reg:squarederror", n_estimators=100, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=0)
mse_scores = []
r2_scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse_scores.append(mean_squared_error(y_test, y_pred))
    r2_scores.append(r2_score(y_test, y_pred))
    

print("MSE for each fold:", mse_scores)
print("R^2 for each fold:", r2_scores)
print("Average MSE:", sum(mse_scores) / len(mse_scores))
print("Average R^2:", sum(r2_scores) / len(r2_scores))



MSE for each fold: [410427637.86549234, 300817950.69152856, 462809105.575149, 416694091.3130442, 520440375.4904194]
R^2 for each fold: [0.9333208258067247, 0.9307865326950111, 0.9255768549402106, 0.9247190661243022, 0.8999696645837336]
Average MSE: 422237832.18712676
Average R^2: 0.9228745888299963


In [9]:
# Create an XGBoost regressor model (if you haven't already)
model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42)
model.fit(X, y)
feature_importances = model.feature_importances_

# Create a DataFrame to display feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort the features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df.head(25))


          Feature  Importance
16    OverallQual    0.433660
77        TotalSF    0.224078
53     GarageCars    0.039195
46    KitchenQual    0.027594
28       BsmtQual    0.025722
39     CentralAir    0.024285
76            Age    0.021036
56     GarageCond    0.020705
25      ExterQual    0.016866
48     Fireplaces    0.012157
78       BsmtBath    0.009970
11   Neighborhood    0.009317
17    OverallCond    0.007800
18   YearRemodAdd    0.006844
14       BldgType    0.006704
79           Bath    0.006412
62    ScreenPorch    0.006141
32     BsmtFinSF1    0.005327
10      LandSlope    0.004447
4         LotArea    0.004343
54     GarageArea    0.004023
2        MSZoning    0.003758
50     GarageType    0.003640
49    FireplaceQu    0.003368
71  SaleCondition    0.003068


# Gradient Boosting Regressor

## First run with no tuning

In [10]:
X = housing.drop('SalePrice',axis=1) # Features
y = housing['SalePrice'] # Target variable

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
# Print average R^2 score
print(f"\nAverage R^2 Score across the 5 folds: {scores.mean():.4f}")


Average R^2 Score across the 5 folds: 0.9241


## Hyperparameter tuning

In [13]:
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01, 0.001],
    'n_estimators': [100, 200, 300, 400, 500, 600],
    'subsample': [0.3, 0.7, 0.8, 0.9],   
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'min_weight_fraction_leaf': [0, 0.25, 0.5],    
    'max_depth': [3, 5, 6, 7],
    'max_leaf_nodes': [5, 10, 50]
}

X = housing.drop('SalePrice',axis=1) # Features
y = housing['SalePrice'] # Target variable

model = GradientBoostingRegressor(random_state=42, max_features=0.4)
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='r2', verbose=1, n_jobs=-1)
grid_search.fit(X, y)
print(f"Best hyperparameters: {grid_search.best_params_}")



Fitting 3 folds for each of 31104 candidates, totalling 93312 fits


KeyboardInterrupt: 

In [11]:
#My best hyperparameters: {'alpha': 0.5, 'max_depth': 5, 'min_weight_fraction_leaf': 0, 'subsample': 0.8}
#0.9219231402135216
#Best hyperparameters: {'alpha': 0.5, 'learning_rate': 0.05, 'max_depth': 3, 'min_samples_leaf': 1, 
#'min_samples_split': 10, 'min_weight_fraction_leaf': 0, 'n_estimators': 600, 'subsample': 0.7}
#0.9279361418087255
test_param = {
    'learning_rate': 0.008,
    'n_estimators': 7000,
    'subsample': 0.3,   
    'min_samples_split': 4,
    'min_samples_leaf': 1,   
    'max_depth': 5,
    'max_leaf_nodes': 5,
    'max_features': 0.9,
    'random_state': 42
}

best_paramGRIDresult = {
    'learning_rate': 0.011,
    'n_estimators': 4000,
    'subsample': 0.3,
    'min_samples_split': 5,
    'max_depth': 3,
    'max_features': 0.2,
    'random_state': 42
}

X = housing.drop('SalePrice',axis=1) # Features
y = housing['SalePrice'] # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = GradientBoostingRegressor(**best_paramGRIDresult)
model.fit(X_train, y_train)
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
print(f"\nAverage R^2 Score across the 5 folds: {scores.mean():.4f}")


Average R^2 Score across the 5 folds: 0.9407


In [12]:
feature_importances = model.feature_importances_

# Create a DataFrame to display feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort the features by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df.head(25))

          Feature  Importance
77        TotalSF    0.172690
16    OverallQual    0.118732
0       GrLivArea    0.071330
53     GarageCars    0.061269
54     GarageArea    0.056724
28       BsmtQual    0.053432
25      ExterQual    0.045868
36    TotalBsmtSF    0.039132
41       1stFlrSF    0.034352
76            Age    0.034191
46    KitchenQual    0.026633
32     BsmtFinSF1    0.026365
24     MasVnrArea    0.020024
79           Bath    0.018192
4         LotArea    0.016399
18   YearRemodAdd    0.015493
48     Fireplaces    0.015469
80  MiscRmsAbvGrd    0.014483
51    GarageYrBlt    0.013150
11   Neighborhood    0.012266
42       2ndFlrSF    0.011249
17    OverallCond    0.009239
3     LotFrontage    0.008148
71  SaleCondition    0.008013
35      BsmtUnfSF    0.006707


In [None]:
# stratified kfold