In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns',None)
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error

import sys
sys.path.append('../../../../DS_Mentorship_Projects/')
from utils import linear_model_selection

from csv import DictWriter

In [2]:
# Read csv into a pandas dataframe
df = pd.read_csv('../data/Concrete_Data_Yeh.csv')
df.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
# Split data into predictor features (X) and target feature (y)
X = df.drop('csMPa', axis = 1)
y = df['csMPa']

In [4]:
# Split the data into a training and a test set (random state set for reproducability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [5]:
dumb_reg = DummyRegressor(strategy='mean')
dumb_cv_scores = cross_val_score(dumb_reg, X_train, y_train, cv = 5, scoring = 'neg_mean_squared_error')
dumb_cv_score_mean = dumb_cv_scores.mean()
dumb_cv_score_std = dumb_cv_scores.std()

dumb_reg.fit(X_train, y_train)
dumb_reg_constant = dumb_reg.constant_[0][0]

y_predict_baseline = dumb_reg.predict(X_test)
dumb_reg_test_mse = mean_squared_error(y_test, y_predict_baseline)

In [6]:
best_models_backward_select = pd.DataFrame({'model': dumb_reg, 'is_ols': False, 'scaled_data': False, 'n_params': [0], 'parameters': None, 
            'coefficients': None, 'intercept': dumb_reg.constant, 
            'cv_score': dumb_cv_score_mean, 'cv_score_std': dumb_cv_score_std, 'test_mse': dumb_reg_test_mse})

predictors = X_train.columns

for i in range(1, 8):
    new_model = linear_model_selection.backwardSelection(X_train, y_train, X_test, y_test, predictors, regressor='any')
    best_models_backward_select = pd.concat([best_models_backward_select, new_model], axis=0, ignore_index=True)
    predictors = best_models_backward_select.loc[i]['parameters']
    
all_param_model = linear_model_selection.getBestKModel(X_train, y_train, X_test, y_test, k=8, regressor = 'any')
best_models_backward_select = pd.concat([best_models_backward_select, all_param_model], axis=0, ignore_index=True)

Processed 16 model(s) on 7 predictor(s).
Processed 14 model(s) on 6 predictor(s).
Processed 12 model(s) on 5 predictor(s).
Processed 10 model(s) on 4 predictor(s).
Processed 8 model(s) on 3 predictor(s).
Processed 6 model(s) on 2 predictor(s).
Processed 4 model(s) on 1 predictor(s).
Processed 2 model(s) on 8 predictor(s)


In [7]:
best_models_backward_select.sort_values('n_params', inplace=True)
best_models_backward_select.reset_index(drop=True, inplace=True)
best_models_backward_select

Unnamed: 0,model,is_ols,scaled_data,n_params,parameters,coefficients,intercept,cv_score,cv_score_std,test_mse
0,DummyRegressor(),False,False,0,,,,-278.676149,26.268613,283.728777
1,Ridge(alpha=1.7555958671075638),False,True,1,[cement],[8.002292486091873],35.9221,-214.700275,7.414997,195.596219
2,Ridge(alpha=70873.70814634024),False,False,2,"[cement, age]","[0.0741102967614365, 0.08851746911713738]",10.945152,-184.307744,8.204196,203.841357
3,Ridge(alpha=5748.784976988678),False,False,3,"[cement, water, age]","[0.06939559179344855, -0.26584723498880974, 0....",59.397682,-153.085886,10.217746,163.894645
4,Ridge(alpha=3289.6661232878414),False,False,4,"[cement, slag, water, age]","[0.08419458706311836, 0.06803401914743222, -0....",55.17505,-121.746396,6.594036,139.889049
5,Ridge(alpha=4348.745013088917),False,False,5,"[cement, slag, flyash, water, age]","[0.11311823771455791, 0.09706602247209471, 0.0...",28.730841,-104.597275,6.563461,135.429931
6,Ridge(alpha=1.7555958671075638),False,True,6,"[cement, slag, flyash, water, superplasticizer...","[10.844454018535085, 7.483813268180326, 4.4230...",35.9221,-102.894382,7.183763,138.869663
7,Lasso(alpha=0.0466301673441609),False,True,7,"[cement, slag, flyash, water, superplasticizer...","[10.751217400022844, 7.38301776146781, 4.30654...",35.9221,-103.101109,7.275616,138.528439
8,Lasso(alpha=0.0466301673441609),False,True,8,"[cement, slag, flyash, water, superplasticizer...","[10.751431457234142, 7.383213594871842, 4.3067...",35.9221,-103.155453,7.267941,138.528236


In [8]:
ridge_model = best_models_backward_select.iloc[5]
ridge_model

model                              Ridge(alpha=4348.745013088917)
is_ols                                                      False
scaled_data                                                 False
n_params                                                        5
parameters                     [cement, slag, flyash, water, age]
coefficients    [0.11311823771455791, 0.09706602247209471, 0.0...
intercept                                               28.730841
cv_score                                              -104.597275
cv_score_std                                             6.563461
test_mse                                               135.429931
Name: 5, dtype: object

In [9]:
ridge_model_params = {'intercept': ridge_model.intercept, 'parameters': ridge_model.parameters, 'coefficients': ridge_model.coefficients}
ridge_model_cv_score = -1*ridge_model.cv_score
ridge_model_test_mse = ridge_model.test_mse

In [10]:
final_ridge_performance = {'model': 'ridge_regression', 
                          'params': ridge_model_params, 
                          'train_score_cv_mse': ridge_model_cv_score, 
                          'test_score_mse': ridge_model_test_mse}

field_names = final_ridge_performance.keys()

with open('../data/models.csv', 'a') as file:
    writer = DictWriter(file, fieldnames=field_names)
    writer.writerow(final_ridge_performance)