In [1]:
#Standard modules for datasets, plotting, and math used in almost all python ML projects.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#Functions required for the model selection process.
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

#The models themselves.
#linear
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
#nonlinear
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

In [2]:
#Define any functions we will need

#K-fold cross validation using root mean squared error. It can handle data sets that are
#log scaled without reporting an artificially low error relative to the non-scaled model.

#Useful only when comparing a model trained on log-transformed data with a model trained without it.
#No longer used in this project at this time.
def Kfold_RMSE(X,y,log_data,n,model):
    """Score ``model`` with n-fold cross validation and report the per-fold RMSE.

    Parameters
    ----------
    X : array-like
        Feature matrix; converted with ``np.array`` before splitting.
    y : array-like
        Target values; converted with ``np.array`` before splitting.
    log_data : bool
        When it compares equal to ``True``, the held-out targets and the
        predictions are passed through ``np.exp`` before the error is
        computed, so the RMSE is measured on the back-transformed scale.
    n : int
        Number of folds handed to ``KFold(n_splits=n)``.
    model : estimator
        Object exposing ``fit`` and ``predict``; it is refit in place on
        every fold.

    Returns
    -------
    tuple
        ``(err, mean_err, std)``: the array of per-fold RMSE values, their
        mean, and their standard deviation (``ndarray.std``).
    """
    features = np.array(X)
    targets = np.array(y)

    fold_rmse = []
    for fit_rows, holdout_rows in KFold(n_splits=n).split(features):
        model.fit(features[fit_rows], targets[fit_rows])
        actual = targets[holdout_rows]
        predicted = model.predict(features[holdout_rows])

        # Undo the log transform so errors are comparable with a model
        # trained on the untransformed target.
        if log_data == True:
            actual, predicted = np.exp(actual), np.exp(predicted)

        fold_rmse.append(np.sqrt(mean_squared_error(actual, predicted)))

    err = np.array(fold_rmse)
    mean_err = err.sum() / err.size
    std = err.std()
    print(f"mean error is: {mean_err:.2f} with a standard deviation of {std:.2f}")
    return err, mean_err, std

# Load the preprocessed datasets

In [3]:
#Load the train and test datasets and split off the target attribute.
#The paths use forward slashes so they resolve on Windows, Linux and macOS alike;
#backslash separators only work on Windows.
TARGET_COLUMN = "median_house_value"

train_set = pd.read_csv("datasets/processed/train.csv", index_col=[0])
test_set = pd.read_csv("datasets/processed/test.csv", index_col=[0])


#split the target off from the training set
labels_train = train_set[TARGET_COLUMN].copy()
train = train_set.drop(TARGET_COLUMN, axis = 1)


#split the target off from the testing set
labels_test = test_set[TARGET_COLUMN].copy()
test = test_set.drop(TARGET_COLUMN, axis = 1)

# MODEL SELECTION:
Now that the data is processed and ready for use, we need to explore models that we may use for the regression task.

# kfold cross validation and grid search:
We will evaluate each candidate model with a grid search, scoring every combination of hyperparameters with k-fold cross validation. This is time consuming but very effective at determining the best model and hyperparameters for your data. We will be evaluating support vector regression, random forest regression, and Gaussian process regression as our nonlinear models, and linear regression, lasso, and elastic net as our linear models.

In [None]:
#Determine best hyperparameters for SVR
#Candidate gamma values are 1/i**2 for i = 1..8.
gamma = 1.0 / np.arange(1, 9) ** 2

param_grid_SVR = [{
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "degree": [1,2,3,4,5],
    "gamma": gamma,
}]

#Exhaustive 5-fold grid search scored on negated mean squared error
SVR_m = SVR()
grid_search_SVR = GridSearchCV(
    SVR_m,
    param_grid_SVR,
    cv = 5,
    scoring = 'neg_mean_squared_error',
    return_train_score = True,
)
grid_search_SVR.fit(train, labels_train)


#Convert each mean negated-MSE score to an RMSE and report the smallest one
#alongside the hyperparameters the search selected
results = grid_search_SVR.cv_results_
score_param = [np.sqrt(-mean_score) for mean_score in results["mean_test_score"]]
score = np.array(score_param)
best_score_SVR = score.min()
best_param = grid_search_SVR.best_params_

print(f"Best mean kfold cross validation error for SVR was {best_score_SVR} using {best_param}")

In [None]:
#Determine best hyperparameters for Random Forest
#NOTE(review): newer scikit-learn releases name these criteria "squared_error" and
#"absolute_error" -- confirm the spelling against the installed version.
#NOTE(review): no random_state is passed to the forest, so results can vary between runs.
param_grid_RF = [
    {"n_estimators": [100, 200, 300, 400], "max_features": [13,14,15,16],
    "criterion": ["mse", "mae"]}]


#find best set of hyperparameters and perform cross validation
RF_m = RandomForestRegressor()
grid_search_RF = GridSearchCV(RF_m,param_grid_RF,cv = 5,scoring = 'neg_mean_squared_error', return_train_score = True)
grid_search_RF.fit(train, labels_train)


#print mean cross validation score for model using best hyper parameters
#Scores are negated MSE values, so negate and take the square root to obtain RMSE.
results = grid_search_RF.cv_results_
score_param = [np.sqrt(-mean_score) for mean_score in results["mean_test_score"]]
score = np.array(score_param)
#Stored under the name the print statement reads; the previous name (best_score)
#left best_score_RF undefined and raised a NameError.
best_score_RF = score.min()
best_param = grid_search_RF.best_params_

print(f"Best mean kfold cross validation error for RF was {best_score_RF} using {best_param}")

In [None]:
#Determine best hyperparameters for Gaussian Process Regression
#Only one optimizer is listed, so the search amounts to a 5-fold cross validation
#of a single configuration.
param_grid_GP = [
    {"optimizer": ["fmin_l_bfgs_b"]}]


#find best set of hyperparameters and perform cross validation
GP_m = GaussianProcessRegressor()
grid_search_GP = GridSearchCV(GP_m,param_grid_GP,cv = 5,scoring = 'neg_mean_squared_error', return_train_score = True)
grid_search_GP.fit(train, labels_train)


#print mean cross validation score for model using best hyper parameters
#Scores are negated MSE values, so negate and take the square root to obtain RMSE.
results = grid_search_GP.cv_results_
score_param = [np.sqrt(-mean_score) for mean_score in results["mean_test_score"]]
score = np.array(score_param)
#Stored under the name the print statement reads; the previous name (best_score)
#left best_score_GP undefined and raised a NameError.
best_score_GP = score.min()
best_param = grid_search_GP.best_params_

#The message now names the GP model (it previously said "RF").
print(f"Best mean kfold cross validation error for GP was {best_score_GP} using {best_param}")

In [None]:
#Determine best hyperparameters for Linear Regression
#The only setting searched is whether an intercept term is fitted.
param_grid_Linear = [{"fit_intercept": [True,False]}]

#5-fold grid search scored on negated mean squared error
Linear_m = LinearRegression()
grid_search_linear = GridSearchCV(
    Linear_m,
    param_grid_Linear,
    cv = 5,
    scoring = 'neg_mean_squared_error',
    return_train_score = True,
)
grid_search_linear.fit(train, labels_train)


#Convert each mean negated-MSE score to an RMSE and report the smallest one
#alongside the hyperparameters the search selected
results = grid_search_linear.cv_results_
score_param = [np.sqrt(-mean_score) for mean_score in results["mean_test_score"]]
score = np.array(score_param)
best_score_L = score.min()
best_param = grid_search_linear.best_params_

print(f"Best mean kfold cross validation error was {best_score_L} using {best_param}")

In [None]:
#Determine best hyperparameters for ElasticNet
#Both alpha and l1_ratio are searched over ten evenly spaced values in [0.01, 1].
param_grid_Elastic = [{
    "alpha": np.linspace(0.01,1,10),
    "l1_ratio": np.linspace(0.01,1,10),
}]

#5-fold grid search scored on negated mean squared error
Elastic_m = ElasticNet()
grid_search_Elastic = GridSearchCV(
    Elastic_m,
    param_grid_Elastic,
    cv = 5,
    scoring = 'neg_mean_squared_error',
    return_train_score = True,
)
grid_search_Elastic.fit(train, labels_train)

#Convert each mean negated-MSE score to an RMSE and report the smallest one
#alongside the hyperparameters the search selected
results = grid_search_Elastic.cv_results_
score_param = [np.sqrt(-mean_score) for mean_score in results["mean_test_score"]]
score = np.array(score_param)
best_score_Elastic = score.min()
best_param = grid_search_Elastic.best_params_

print(f"Best mean kfold cross validation error was {best_score_Elastic} using {best_param}")

In [None]:
#Determine best hyperparameters for LASSO
#param_grid is a list of two dicts, so GridSearchCV explores them as two separate
#grids: alpha is varied with the default fit_intercept, and fit_intercept is varied
#with the default alpha.
#NOTE(review): if every alpha/fit_intercept combination was intended, merge the two
#dicts into one -- confirm the intent.
param_grid_lasso = [
    {"alpha": np.linspace(0.01,1,10)},
    {"fit_intercept": [True,False]}
]

#find best set of hyperparameters and perform cross validation
Lasso_m = Lasso()
grid_search_lasso = GridSearchCV(Lasso_m,param_grid_lasso,cv = 5,scoring = 'neg_mean_squared_error', return_train_score = True)
grid_search_lasso.fit(train, labels_train)

#print mean cross validation score for model using best hyper parameters
#Scores are negated MSE values, so negate and take the square root to obtain RMSE.
results = grid_search_lasso.cv_results_
score_param = [np.sqrt(-mean_score) for mean_score in results["mean_test_score"]]
score = np.array(score_param)
#Stored under the name the print statement reads; the previous name (best_score)
#left best_score_lasso undefined and raised a NameError.
best_score_lasso = score.min()
best_param_lasso = grid_search_lasso.best_params_

#Report the lasso parameters; the message previously read best_param, which this
#cell never assigns.
print(f"Best mean kfold cross validation error was {best_score_lasso} using {best_param_lasso}")

# Generalization error using best hyperparameters
During the cross validation step, it became clear that linear models do not perform well on this problem. As a result, we will only be considering random forest regression, support vector regression, and Gaussian process regression for our final model. All of the linear models performed poorly, which is not surprising considering the complex nature of the problem. What may be surprising is that basic linear regression outperformed lasso and even elastic net. This could be explained by the fact that our data is low in both objects and features. Since both lasso and elastic net perform feature reduction, they might be performing poorly because there is so little data that any reduction constitutes a notable reduction in information.

# final model evaluation


#Random Forest model generalization error on the held-out test set
#NOTE(review): np.exp is applied to both the labels and the predictions, which
#presumes the target column is log-scaled -- confirm against the preprocessing step.
predictions = grid_search_RF.best_estimator_.predict(test)
error = mean_squared_error(
    np.exp(labels_test),
    np.exp(predictions),
)
rmse_RF = np.sqrt(error)
print(f"Random Forest RMSE generaliziation error: {rmse_RF:.2f}")