## Kaggle Housing Prices Competition

### Assess Machine Learning Algorithms
* Train and evaluate multiple models on the training set
* Comparative analysis of the models and their errors
* Evaluation Using Cross-Validation

In [62]:
# Load Data
import pickle
import os

# Define the folder path
folder_path = 'housing_price_data_processed'

# Load the data
with open(os.path.join(folder_path, 'housing_X_prepared.pkl'), 'rb') as f:
    housing_X_prepared = pickle.load(f)

with open(os.path.join(folder_path, 'housing_y_prepared.pkl'), 'rb') as f:
    housing_y_prepared = pickle.load(f)

In [63]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(housing_X_prepared, housing_y_prepared,
                                                    test_size = 0.2, random_state = 0)

In [64]:
# Start with linear regression model
from sklearn.linear_model import LinearRegression

# Train model on training data
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate model on test data
print("Accuracy:", model.score(X_test, y_test) * 100, "%")

Accuracy: 92.67912471348792 %


In [65]:
# Invert the log-transformed target variable
def inv_y(transformed_y):
    return np.exp(transformed_y)

import pandas as pd

# Initialize series to collect RMSE values for model comparison
rmse_compare = pd.Series()
rmse_compare.index.name = 'Model'

# Initialize series to collect accuracy scores for model comparison
accuracy_compare = pd.Series()
accuracy_compare.index.name = 'Model'

#### Linear Regression Model

In [66]:
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression
import numpy as np

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

linear_predictions = linear_model.predict(X_test)

linear_rmse = root_mean_squared_error(inv_y(linear_predictions), inv_y(y_test))
rmse_compare['LinearRegression'] = linear_rmse
print("RMSE:", linear_rmse)

linear_accuracy = linear_model.score(X_test, y_test) * 100
accuracy_compare['LinearRegression'] = linear_accuracy
print("Accuracy:", linear_accuracy)

RMSE: 20289.523965880635
Accuracy: 92.67912471348792


#### Decision Tree Model

In [67]:
from sklearn.tree import DecisionTreeRegressor

decision_tree = DecisionTreeRegressor(random_state = 0)
decision_tree.fit(X_train, y_train)

decision_tree_predictions = decision_tree.predict(X_test)

decision_tree_rmse = root_mean_squared_error(inv_y(decision_tree_predictions), inv_y(y_test))
rmse_compare['DecisionTree'] = decision_tree_rmse
print("RMSE:", decision_tree_rmse)

decision_tree_accuracy = decision_tree.score(X_test, y_test) * 100
accuracy_compare['DecisionTree'] = decision_tree_accuracy
print("Accuracy:", decision_tree_accuracy)

RMSE: 35336.062578786696
Accuracy: 76.26657073636493


#### Random Forest

In [68]:
from sklearn.ensemble import RandomForestRegressor

random_forest = RandomForestRegressor(random_state = 0)
random_forest.fit(X_train, y_train)

random_forest_predictions = random_forest.predict(X_test)

random_forest_rmse = root_mean_squared_error(inv_y(random_forest_predictions), inv_y(y_test))
rmse_compare['RandomForest'] = random_forest_rmse
print("RMSE:", random_forest_rmse)

random_forest_accuracy = random_forest.score(X_test, y_test) * 100
accuracy_compare['RandomForest'] = random_forest_accuracy
print("Accuracy:", random_forest_accuracy)

RMSE: 27318.888892787993
Accuracy: 89.44309219207133


#### Gradient Boosting Regression

In [69]:
from sklearn.ensemble import GradientBoostingRegressor

gradient_boosting = GradientBoostingRegressor(n_estimators = 300, learning_rate = 0.05,
                                              max_depth = 4, random_state = 0)
gradient_boosting.fit(X_train, y_train)

gradient_boosting_predictions = gradient_boosting.predict(X_test)

gradient_boosting_rmse = root_mean_squared_error(inv_y(gradient_boosting_predictions), inv_y(y_test))
rmse_compare['GradientBoosting'] = gradient_boosting_rmse
print("RMSE:", gradient_boosting_rmse)

gradient_boosting_accuracy = gradient_boosting.score(X_test, y_test) * 100
accuracy_compare['GradientBoosting'] = gradient_boosting_accuracy
print("Accuracy:", gradient_boosting_accuracy)

RMSE: 23650.011356649014
Accuracy: 92.42449141041659


#### Compare Model Results

In [70]:
# The smaller the RMSE value, the better (RMSE is the value by which our model's predictions deviate from the true prices)
print("RMSE per Model:")
rmse_compare.sort_values(ascending = True).round()

RMSE per Model:


Model
LinearRegression    20290.0
GradientBoosting    23650.0
RandomForest        27319.0
DecisionTree        35336.0
dtype: float64

In [71]:
# The higher the accuracy, the better
print("Accuracy per Model:")
accuracy_compare.sort_values(ascending = False).round(2)

Accuracy per Model:


Model
LinearRegression    92.68
GradientBoosting    92.42
RandomForest        89.44
DecisionTree        76.27
dtype: float64

#### Cross-Validation - Linear Regression

In [72]:
from sklearn.model_selection import cross_val_score

# Perform K fold cross-validation, where K = 10, on best model 
linear_model = LinearRegression()

scores = cross_val_score(linear_model, X_train, y_train,
                         scoring = "neg_mean_squared_error", cv = 10)
linear_rmse_scores = np.sqrt(-scores)

# Evaluate results via RMSE (Smaller, the better)
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
display_scores(linear_rmse_scores)

Scores: [0.14244617 0.16244261 0.12236011 0.13203988 0.12853196 0.12142648
 0.11156017 0.10115619 0.12061519 0.0988819 ]
Mean: 0.12414606641946264
Standard deviation: 0.017975921198532602


In [128]:
from sklearn.model_selection import cross_val_score

# Perform K-fold cross-validation, where K = 10, on best model
linear_model = LinearRegression()

r2_scores = cross_val_score(linear_model, X_train, y_train, scoring = "r2", cv = 10)

# Evaluate results via R-Squared (Closer to 1, the better)
def display_r2_scores(scores):
    print("R-squared Scores:", scores)
    print("Mean R-squared:", scores.mean())
    print("Standard Deviation:", scores.std())

# Display the R-squared scores
display_r2_scores(r2_scores)

R-squared Scores: [0.88368174 0.83636119 0.88509478 0.89616134 0.85974127 0.90551255
 0.88983487 0.93323584 0.8910029  0.94773103]
Mean R-squared: 0.892835749580841
Standard Deviation: 0.03042716550342749


#### Grid Search, Feature Selection, & Regularization - Linear Regression

In [120]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

# Define the parameters to explore via GridSearch
parameters = {
    'polynomialfeatures__degree': [1, 2, 3],
    'ridge__alpha': [0.1, 1.0, 10.0]
}

# Define Pipeline with Polynomial Features and Ridge Rgression
pipeline = Pipeline([
    ('polynomialfeatures', PolynomialFeatures(include_bias=False)),
    ('ridge', Ridge())
])

# Define model
ridge_model_grid_search = GridSearchCV(pipeline, parameters, cv = 5, 
                                       scoring = 'neg_root_mean_squared_error')

# Fit the model
ridge_model_grid_search.fit(X_train, y_train)

In [121]:
# Best parameters
print("Best parameters:", ridge_model_grid_search.best_params_)

Best parameters: {'polynomialfeatures__degree': 1, 'ridge__alpha': 10.0}


In [123]:
# Evaluate using Fine-Tuned Model Parameters
final_regression_model = Ridge(alpha = 10.0)
poly = PolynomialFeatures(degree = 1)

# Transform via PolynomialFeatures
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Fit the model
final_regression_model.fit(X_train_poly, y_train)

ridge_predictions = final_regression_model.predict(X_test_poly)

# Calculate RMSE
linear_final_rmse = root_mean_squared_error(inv_y(ridge_predictions), inv_y(y_test))
print("RMSE:", linear_final_rmse)

# Calculate R² score (accuracy)
linear_final_accuracy = final_regression_model.score(X_test_poly, y_test) * 100
print("Accuracy:", linear_final_accuracy)

RMSE: 20291.689362419223
Accuracy: 93.10406522796562


By fine-tuning the model through the use of grid search, feature selection, and regularization we were able to improve the accuracy from 92.68% to 93.10% and maintained the same RMSE value, to improve the model. 

#### Grid Search - Decision Tree

In [111]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameters to explore via GridSearch
parameters = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 5],
    'max_features': [None, 'sqrt', 'log2']
}

# Define the model
dt_model = DecisionTreeRegressor(random_state = 0)

dt_model_grid_search = GridSearchCV(dt_model, parameters, cv = 5, 
                           scoring = 'neg_mean_squared_error', 
                           return_train_score = True)

# Fit the model
dt_model_grid_search.fit(X_train, y_train)

In [112]:
# Best parameters
print("Best parameters:", dt_model_grid_search.best_params_)

Best parameters: {'max_depth': 15, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10}


In [113]:
# Evaluate using Fine-Tuned Model Parameters
dt_model_final = DecisionTreeRegressor(max_depth = 15, max_features = None, 
                                       min_samples_leaf = 2, min_samples_split = 10,
                                       random_state = 0)

dt_model_final.fit(X_train, y_train)
dt_final_predictions = dt_model_final.predict(X_test)

dt_final_rmse = root_mean_squared_error(inv_y(dt_final_predictions), inv_y(y_test))
print("RMSE:", dt_final_rmse)

dt_final_accuracy = dt_model_final.score(X_test, y_test) * 100
print("Accuracy:", dt_final_accuracy)

RMSE: 34606.706171585174
Accuracy: 80.49425988390664


By fine-tuning the model through the use of grid search we were able to improve the accuracy from 76.27% to 80.49% and decreased RMSE by about 700, improving the overall model performance. 

#### Grid Search - Random Forest

In [74]:
from sklearn.model_selection import GridSearchCV

# Define parameters to explore via GridSearch 
parameters = [
    {'n_estimators': [10, 50, 100, 150], 'max_features': [10, 20, 30, 40 ,50, 100, 150]},
    {'bootstrap': [False], 'n_estimators': [10, 50, 100, 150], 'max_features': [10, 20, 30, 40, 50, 100, 150]}
]

# Define the model
rf_model = RandomForestRegressor()

rf_model_grid_search = GridSearchCV(rf_model, parameters, cv = 5, 
                                    scoring = 'neg_mean_squared_error',
                                    return_train_score = True)

rf_model_grid_search.fit(X_train, y_train)

In [75]:
# Best parameters
print("Best parameters:", rf_model_grid_search.best_params_)

Best parameters: {'bootstrap': False, 'max_features': 50, 'n_estimators': 100}


In [114]:
# Evaluate using Fine-Tuned Model Parameters
rf_model_final = RandomForestRegressor(bootstrap = False, max_features = 50,
                                       n_estimators = 150, random_state = 0)

rf_model_final.fit(X_train, y_train)
rf_final_predictions = rf_model_final.predict(X_test)

rf_final_rmse = root_mean_squared_error(inv_y(rf_final_predictions), inv_y(y_test))
print("RMSE:", rf_final_rmse)

rf_final_accuracy = rf_model_final.score(X_test, y_test) * 100
print("Accuracy:", rf_final_accuracy)

RMSE: 24815.18549634231
Accuracy: 91.72458252269293


By fine-tuning the model through the use of grid search we were able to improve the accuracy from 89.44% to 91.72% and reduce the RMSE value by about 2500, resulting in an improved model. 

#### Grid Search - Gradient Boosting Regressor

In [83]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameters to explore via GridSearch
parameters = [
    {'n_estimators': [100, 200, 300], 
     'learning_rate': [0.01, 0.05, 0.1],
     'max_depth': [3, 4, 5],
     'min_samples_split': [2, 5, 10],
     'min_samples_leaf': [1, 2, 4]}
]

# Define the model
gb_model = GradientBoostingRegressor()

gb_model_grid_search = GridSearchCV(gb_model, parameters, cv = 5,
                           scoring = 'neg_mean_squared_error',
                           return_train_score = True)

gb_model_grid_search.fit(X_train, y_train)

In [84]:
# To see the best parameters
print("Best parameters:", gb_model_grid_search.best_params_)

Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}


In [116]:
from sklearn.ensemble import GradientBoostingRegressor

gb_model_final = GradientBoostingRegressor(n_estimators = 300, learning_rate = 0.1,
                                              max_depth = 3, min_samples_leaf = 5, 
                                              min_samples_split = 10, random_state = 0)
gb_model_final.fit(X_train, y_train)

gb_final_predictions = gb_model_final.predict(X_test)

gb_final_rmse = root_mean_squared_error(inv_y(gb_final_predictions), inv_y(y_test))
print("RMSE:", gb_final_rmse)

gb_final_accuracy = gb_model_final.score(X_test, y_test) * 100
print("Accuracy:", gb_final_accuracy)

RMSE: 22827.80643409716
Accuracy: 92.31349616257367


Unfortunately the accuracy for the gradient boosting regressor model did decrease by 0.1%, but we did reduce the RMSE value by about 800.