In [1]:
# Necessary Imports
import pickle
import pandas as pd
import numpy as np
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20, so importing from it fails on current releases.
from sklearn.model_selection import train_test_split
from sklearn import linear_model,ensemble, tree, model_selection, datasets
from sklearn.metrics import r2_score, mean_squared_error

# Load in dataset
# NOTE(security): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
# The context manager closes the file handle as soon as the frame is loaded.
with open("electric_guitar_dataframe_dummy.pkl", "rb") as pickle_file:
    df = pickle.load(pickle_file)



In [2]:
# Features are every column except 'Final'; 'Final' itself is the target
X = df.drop(labels=['Final'], axis='columns')
y = df['Final']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Alias for the holdout targets, used when scoring predictions
y_true = y_test

In [3]:
# Empty results table: one column for the model name plus one per metric
summary_columns = [
    'Model Name',
    'R-Squared 10-Fold C.V. Train',
    'R-Squared Holdout',
    'R-Squared (adj) Holdout',
]
df_models = pd.DataFrame(columns=summary_columns)

In [4]:
# Running multiple models
# Each regressor below is scored with 10-fold cross validation on the training
# split, refit on the whole training split, and then scored on the holdout
# split. One summary row per model ends up in df_models.
models = {}
parameters = {}  # not used in this cell

# Initializing models dict with different regressors
# NOTE: none of the estimators is given a random_state, so scores of the
# randomized ones may differ slightly from run to run.
models['Linear Regression'] = linear_model.LinearRegression()
models['Ridge Regression'] = linear_model.Ridge()
models['Lasso Regression'] = linear_model.Lasso(alpha=.5)
models['Huber Regression'] = linear_model.SGDRegressor(loss='huber',max_iter=2000)
models['Decision Tree Regression'] = tree.DecisionTreeRegressor(max_depth=7)
models['Extra Trees Regression'] = tree.ExtraTreeRegressor(max_depth=7)
models['Random Forest Regression'] = ensemble.RandomForestRegressor()
models['AdaBoost Regression'] = ensemble.AdaBoostRegressor()
models['Gradient Boosting Regression'] = ensemble.GradientBoostingRegressor()

# Adjusted R^2 is computed for the holdout set, so the sample count must be
# the number of holdout rows (not training rows); p is the number of features.
n = len(y_true)
p = len(X_train.columns)

# One dict per model; the summary frame is built once after the loop instead
# of growing a DataFrame row by row (DataFrame.append was removed in pandas 2.0).
model_records = []

for key, value in models.items():
    
    # Running 10-fold cross validation and calculating mean R2 of the 10 trials
    scores = model_selection.cross_val_score(value, X_train, y_train, cv = 10, scoring = 'r2')
    R2_CV = np.mean(scores)
    
    # Training the regression on the entire train set, and testing it on the holdout set
    regression = value
    regression.fit(X_train, y_train)
    y_pred = regression.predict(X_test)
    R2_holdout = r2_score(y_true, y_pred)
    
    # Calculating adjusted R^2 on holdout set
    # The correction is undefined when there are not more samples than
    # features + 1; report NaN in that case instead of dividing by <= 0.
    if n - p - 1 > 0:
        adj_R2_holdout = 1 - (1 - R2_holdout) * (n - 1) / (n - p - 1)
    else:
        adj_R2_holdout = float('nan')
    
    # Round numbers to 3 decimals
    R2_CV = round(R2_CV, 3)
    R2_holdout = round(R2_holdout, 3)
    adj_R2_holdout = round(adj_R2_holdout, 3)
    
    # Printing output
    print('Model: ' + key)
    print('C.V. R^2: ' + str(R2_CV))
    print('Holdout R^2:' + str(R2_holdout))
    print('Holdout adj R^2:' + str(adj_R2_holdout))
    print()
    
    # Creating model dictionary that becomes one row of the summary dataframe
    model_dict = {'Model Name':key,
                  'R-Squared 10-Fold C.V. Train': R2_CV,
                  'R-Squared Holdout':R2_holdout,
                  'R-Squared (adj) Holdout':adj_R2_holdout
                 }
    model_records.append(model_dict)

# Building the overall dataframe in one step; re-running this cell now
# replaces the table instead of appending duplicate rows to it
df_models = pd.DataFrame(model_records,
                         columns=['Model Name',
                                  'R-Squared 10-Fold C.V. Train',
                                  'R-Squared Holdout',
                                  'R-Squared (adj) Holdout'])

Model: Linear Regression
C.V. R^2: 0.974
Holdout R^2:0.979
Holdout adj R^2:0.979

Model: Ridge Regression
C.V. R^2: 0.974
Holdout R^2:0.979
Holdout adj R^2:0.979

Model: Lasso Regression
C.V. R^2: 0.974
Holdout R^2:0.979
Holdout adj R^2:0.979

Model: Huber Regression
C.V. R^2: 0.965
Holdout R^2:0.956
Holdout adj R^2:0.956

Model: Decision Tree Regression
C.V. R^2: 0.966
Holdout R^2:0.973
Holdout adj R^2:0.973

Model: Extra Trees Regression
C.V. R^2: 0.931
Holdout R^2:0.971
Holdout adj R^2:0.971

Model: Random Forest Regression
C.V. R^2: 0.97
Holdout R^2:0.976
Holdout adj R^2:0.976

Model: AdaBoost Regression
C.V. R^2: 0.942
Holdout R^2:0.956
Holdout adj R^2:0.956

Model: Gradient Boosting Regression
C.V. R^2: 0.973
Holdout R^2:0.978
Holdout adj R^2:0.978



In [6]:
# Put the columns in display order, rank the models by holdout adjusted
# R-squared (best first) and reset the indices to 0..n-1.
# Done as one chain that returns a new frame: sorting in place on a frame
# that was just produced by column selection can trigger pandas'
# SettingWithCopyWarning.
df_models = (
    df_models[['Model Name',
               'R-Squared 10-Fold C.V. Train',
               'R-Squared Holdout',
               'R-Squared (adj) Holdout']]
    .sort_values(by = ['R-Squared (adj) Holdout'], ascending = False)
    .reset_index(drop=True)
)

# Print the dataframe
df_models

Unnamed: 0,Model Name,R-Squared 10-Fold C.V. Train,R-Squared Holdout,R-Squared (adj) Holdout
0,Linear Regression,0.974,0.979,0.979
1,Ridge Regression,0.974,0.979,0.979
2,Lasso Regression,0.974,0.979,0.979
3,Gradient Boosting Regression,0.973,0.978,0.978
4,Random Forest Regression,0.97,0.976,0.976
5,Decision Tree Regression,0.966,0.973,0.973
6,Extra Trees Regression,0.931,0.971,0.971
7,Huber Regression,0.965,0.956,0.956
8,AdaBoost Regression,0.942,0.956,0.956
