In [12]:
# Data prep
import os
import csv

# Import libraries
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import scale

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [13]:
# Grab the data
dataset_url= 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
red_wine_data = pd.read_csv(dataset_url, sep=';')
red_wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [19]:
# Scale data
red_wine_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(red_wine_data), columns=red_wine_data.columns)
red_wine_scaled.head()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,-0.787823
1,-0.298547,1.967442,-1.391472,0.043416,0.223875,0.872638,0.624363,0.028261,-0.719933,0.12895,-0.584777,-0.787823
2,-0.298547,1.297065,-1.18607,-0.169427,0.096353,-0.083669,0.229047,0.134264,-0.331177,-0.048089,-0.584777,-0.787823
3,1.654856,-1.384443,1.484154,-0.453218,-0.26496,0.107592,0.4115,0.664277,-0.979104,-0.46118,-0.584777,0.450848
4,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,-0.787823


In [20]:
# Define input and output variables
X = red_wine_scaled[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']]
y = red_wine_scaled['quality'].values.reshape(-1, 1)
print(X.shape, y.shape)

(1599, 11) (1599, 1)


In [21]:
# Separate data into training set and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
582,1.942121,-0.211283,1.1247,-0.240375,-0.094931,-1.039977,-0.956903,1.724305,-0.784726,-1.346377,-1.147981
626,0.275983,0.403229,0.097691,-0.240375,0.223875,-1.039977,-0.956903,1.088288,0.316751,-0.992298,-1.241848
1030,-0.700719,0.347364,-1.391472,-0.311323,0.075099,-0.657454,-0.987312,-0.989366,0.705508,-0.63822,1.010966
620,-0.011282,0.06804,-0.159061,0.610998,-0.243707,0.011961,1.992767,0.452272,-0.266384,-0.284141,-0.960246
490,0.563248,1.380862,-0.00501,0.185312,-0.201199,0.777007,0.289865,0.876283,-0.007213,0.069937,0.166161


In [22]:
print(y)

[[-0.78782264]
 [-0.78782264]
 [-0.78782264]
 ...
 [ 0.45084835]
 [-0.78782264]
 [ 0.45084835]]


Linear Regression

In [23]:
# Linear regression: fit the data then calculate MSE for prediction
regression=LinearRegression().fit(X,y)
first_model=(mean_squared_error(y_true=y,y_pred=regression.predict(X)))
print(first_model)

0.6394482969613119


In [25]:
# regression coefficients 
print('Coefficients: \n', regression.coef_) 

# variance score: 1 means perfect prediction 
print('Variance score: {}'.format(regression.score(X_test, y_test))) 

Coefficients: 
 [[ 0.05387891 -0.24026089 -0.0440379   0.02851261 -0.1092302   0.05649078
  -0.13297854 -0.04178926 -0.07907982  0.1923365   0.36447012]]
Variance score: 0.3821367476163077


Lasso Regression

In [26]:
# Preform Lasso Regression, fit the data then preform GridSearch to find best parameters
lasso=Lasso(normalize=True)
lasso.fit(X,y)
search=GridSearchCV(estimator=lasso,param_grid={'alpha':np.logspace(-5,2,8)},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)
search.fit(X,y)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'alpha': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [27]:
search.best_params_

{'alpha': 0.0001}

In [28]:
abs(search.best_score_)

0.6672501012181844

In [29]:
# Re-fit the data then calculate MSE on prediction
lasso=Lasso(normalize=True,alpha=1e-05)
lasso.fit(X,y)
second_model=(mean_squared_error(y_true=y,y_pred=lasso.predict(X)))
print(second_model)

0.639452909554914


In [30]:
# regression coefficients 
print('Coefficients: \n', lasso.coef_) 
  
# variance score: 1 means perfect prediction 
print('Variance score: {}'.format(lasso.score(X_test, y_test)))  

Coefficients: 
 [ 0.05047198 -0.23970536 -0.0420058   0.02712838 -0.10899451  0.05564889
 -0.13221562 -0.0389504  -0.07940794  0.19141644  0.36552019]
Variance score: 0.38199625949255067


Ridge Regression

In [31]:
# Run Ridge Regression, fit the model, then preform GridSearch to find best parameters
ridge=Ridge(normalize=True)
ridge.fit(X,y)
search=GridSearchCV(estimator=ridge,param_grid={'alpha':np.logspace(-5,2,8)},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)

In [32]:
search.fit(X,y)
search.best_params_

{'alpha': 0.1}

In [33]:
abs(search.best_score_)

0.6660002856922569

In [34]:
# Fit the data then preform MSE on prediction
ridge=Ridge(normalize=True,alpha=0.01)
ridge.fit(X,y)
third_model=(mean_squared_error(y_true=y,y_pred=ridge.predict(X)))
print(third_model)

0.6394883313746165


In [35]:
# regression coefficients 
print('Coefficients: \n', ridge.coef_) 
  
# variance score: 1 means perfect prediction 
print('Variance score: {}'.format(ridge.score(X_test, y_test)))  

Coefficients: 
 [[ 0.05855275 -0.23687213 -0.03875801  0.03027236 -0.10843037  0.05455428
  -0.13130774 -0.04909282 -0.07361195  0.19138594  0.35694686]]
Variance score: 0.3818518117566433


In [36]:
# regression coefficients 
coef_dict_baseline = {}
for coef, feat in zip(ridge.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'fixed acidity': array([ 0.05855275, -0.23687213, -0.03875801,  0.03027236, -0.10843037,
         0.05455428, -0.13130774, -0.04909282, -0.07361195,  0.19138594,
         0.35694686])}

Elastic Net Regression

In [37]:
# Elastic Net Regression and GridSearch
elastic=ElasticNet(normalize=True)
search=GridSearchCV(estimator=elastic,param_grid={'alpha':np.logspace(-5,2,8),'l1_ratio':[.2,.4,.6,.8]},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)

In [38]:
# Find best parameters for our data
search.fit(X,y)
search.best_params_

{'alpha': 0.0001, 'l1_ratio': 0.4}

In [39]:
abs(search.best_score_)

0.6659890184356346

In [40]:
# Re-run elastic net using new parameters
elastic=ElasticNet(normalize=True,alpha=0.0001,l1_ratio=0.4)
elastic.fit(X,y)
fourth_model=(mean_squared_error(y_true=y,y_pred=elastic.predict(X)))
print(fourth_model)

0.6421579166479627


In [41]:
# regression coefficients 
print('Coefficients: \n', elastic.coef_) 
  
# variance score: 1 means perfect prediction 
print('Variance score: {}'.format(elastic.score(X_test, y_test)))   

Coefficients: 
 [ 0.06019393 -0.21384222 -0.          0.03047625 -0.10091576  0.03882517
 -0.11681798 -0.06834165 -0.04888394  0.17697639  0.31906065]
Variance score: 0.37705090695471577


In [42]:
# Find coefficients for elastic net
coef_dict_baseline = {}
for coef, feat in zip(elastic.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'fixed acidity': 0.0601939304086483,
 'volatile acidity': -0.21384222184255508,
 'citric acid': -0.0,
 'residual sugar': 0.030476246473087224,
 'chlorides': -0.10091575656237428,
 'free sulfur dioxide': 0.03882517411312841,
 'total sulfur dioxide': -0.11681797861651294,
 'density': -0.0683416451739608,
 'pH': -0.04888393526513757,
 'sulphates': 0.1769763887237258,
 'alcohol': 0.319060649875555}

In [46]:
# Data frame containing coefficients and MSE for each regression model
red_wine_coefs = "red_wine_coefficients.csv"

red_wine_coefs = pd.read_csv(red_wine_coefs, encoding="utf-8")
red_wine_coefs

Unnamed: 0,feature,linear,ridge,lasso,elastic net
0,fixed acidity,0.053879,0.058553,0.050472,0.060194
1,volitale acidity,-0.240261,-0.236872,-0.239705,-0.213842
2,citric acid,-0.044038,-0.038758,-0.042006,0.0
3,residual sugar,0.028513,0.030272,0.027128,0.030476
4,chlorides,-0.10923,-0.10843,-0.108995,-0.100916
5,free sulfur dioxide,0.056491,0.054554,0.055649,0.038825
6,total sulfur dioxide,-0.132979,-0.131308,-0.132216,-0.116818
7,density,-0.041789,-0.049093,-0.03895,-0.068342
8,pH,-0.07908,-0.073612,-0.079408,-0.048884
9,sulphates,0.192336,0.191386,0.191416,0.176976
