In [1]:
# Data prep
import os
import csv

# Import libraries
import numpy as np
import pandas as pd


from sklearn import preprocessing
from sklearn.preprocessing import scale

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [2]:
# Grab data
dataset_url= 'http://mlr.cs.umass.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
white_wine_data = pd.read_csv(dataset_url, sep=';')
white_wine_data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
# Scale data
white_wine_scaled = pd.DataFrame(preprocessing.StandardScaler().fit_transform(white_wine_data), columns=white_wine_data.columns)
white_wine_scaled.head()

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0.172097,-0.08177,0.21328,2.821349,-0.035355,0.569932,0.744565,2.331512,-1.246921,-0.349184,-1.393152,0.13787
1,-0.657501,0.215896,0.048001,-0.944765,0.147747,-1.253019,-0.149685,-0.009154,0.740029,0.001342,-0.824276,0.13787
2,1.475751,0.017452,0.543838,0.100282,0.193523,-0.312141,-0.973336,0.358665,0.475102,-0.436816,-0.336667,0.13787
3,0.409125,-0.478657,-0.117278,0.415768,0.559727,0.687541,1.121091,0.525855,0.01148,-0.787342,-0.499203,0.13787
4,0.409125,-0.478657,-0.117278,0.415768,0.559727,0.687541,1.121091,0.525855,0.01148,-0.787342,-0.499203,0.13787


In [4]:
# Define input and output variables
X = white_wine_scaled[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']]
y = white_wine_scaled['quality'].values.reshape(-1, 1)
print(X.shape, y.shape)

(4898, 11) (4898, 1)


In [5]:
# Separate data into training set and test set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
2835,-0.657501,-0.280214,-0.943673,-0.609561,0.101972,0.334712,0.532769,-0.490663,-0.187215,0.088973,-0.011595
1157,1.120209,0.215896,-0.365197,2.062211,0.376625,-0.723775,-0.079086,1.930255,-0.187215,-0.962605,-1.230616
744,0.646153,1.00967,-0.530476,0.218589,-0.218457,-0.66497,0.509236,0.44226,-0.120983,-0.524447,-0.417935
1448,0.646153,-1.17321,1.287594,-1.023637,0.4224,-1.017799,0.273907,-0.77823,0.276407,-0.173921,0.557282
3338,0.409125,-0.08177,-0.447836,1.736866,0.010421,-1.723457,-2.291177,0.876955,-0.120983,-0.874973,0.313478


In [6]:
print(y)

[[0.13787014]
 [0.13787014]
 [0.13787014]
 ...
 [0.13787014]
 [1.2671142 ]
 [0.13787014]]


Linear Regression

In [7]:
# Linear regression: fit the data then calculate MSE for prediction
regression=LinearRegression().fit(X,y)
first_model=(mean_squared_error(y_true=y,y_pred=regression.predict(X)))
print(first_model)

0.7181296358667146


In [8]:
# regression coefficients 
print('Coefficients: \n', regression.coef_) 
  
# variance score: 1 means perfect prediction 
print('Variance score: {}'.format(regression.score(X_test, y_test))) 

Coefficients: 
 [[ 0.06242977 -0.21204823  0.00301856  0.46665253 -0.00610011  0.07168122
  -0.01371181 -0.50752757  0.11702101  0.0813738   0.26884011]]
Variance score: 0.28161387455498454


Lasso Regression

In [9]:
# Preform Lasso Regression, fit the data then preform GridSearch to find best parameters
lasso=Lasso(normalize=True)
lasso.fit(X,y)
search=GridSearchCV(estimator=lasso,param_grid={'alpha':np.logspace(-5,2,8)},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)
search.fit(X,y)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=True, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid='warn', n_jobs=1,
       param_grid={'alpha': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [10]:
search.best_params_

{'alpha': 0.0001}

In [11]:
abs(search.best_score_)

0.7330163891762923

In [12]:
# Re-fit the data then calculate MSE on prediction
lasso=Lasso(normalize=True,alpha=1e-05)
lasso.fit(X,y)
second_model=(mean_squared_error(y_true=y,y_pred=lasso.predict(X)))
print(second_model)

0.7181685334677781


In [13]:
# regression coefficients 
print('Coefficients: \n', lasso.coef_) 
  
# variance score: 1 means perfect prediction 
print('Variance score: {}'.format(lasso.score(X_test, y_test)))  

Coefficients: 
 [ 0.05453013 -0.2123947   0.00213752  0.44637282 -0.00690575  0.07100362
 -0.01301344 -0.47795743  0.1103563   0.07906079  0.28169501]
Variance score: 0.2811656011597178


Ridge Regression

In [14]:
# Run Ridge Regression, fit the model, then preform GridSearch to find best parameters
ridge=Ridge(normalize=True)
ridge.fit(X,y)
search=GridSearchCV(estimator=ridge,param_grid={'alpha':np.logspace(-5,2,8)},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)

In [15]:
search.fit(X,y)
search.best_params_

{'alpha': 0.01}

In [16]:
abs(search.best_score_)

0.732640495492876

In [17]:
# Fit the data then preform MSE on prediction
ridge=Ridge(normalize=True,alpha=0.01)
ridge.fit(X,y)
third_model=(mean_squared_error(y_true=y,y_pred=ridge.predict(X)))
print(third_model)

0.7188333224880965


In [18]:
# regression coefficients 
print('Coefficients: \n', ridge.coef_) 
  
# variance score: 1 means perfect prediction 
print('Variance score: {}'.format(ridge.score(X_test, y_test)))  

Coefficients: 
 [[ 0.03155145 -0.21163341  0.00222641  0.37430367 -0.01314957  0.07683601
  -0.02090625 -0.37296743  0.0914774   0.07340801  0.32362683]]
Variance score: 0.2791813840466516


In [370]:
# regression coefficients 
coef_dict_baseline = {}
for coef, feat in zip(ridge.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'fixed acidity': array([ 0.03155145, -0.21163341,  0.00222641,  0.37430367, -0.01314957,
         0.07683601, -0.02090625, -0.37296743,  0.0914774 ,  0.07340801,
         0.32362683])}

Elastic Net Regression

In [371]:
# Elastic Net Regression and GridSearch
elastic=ElasticNet(normalize=True)
search=GridSearchCV(estimator=elastic,param_grid={'alpha':np.logspace(-5,2,8),'l1_ratio':[.2,.4,.6,.8]},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)

In [372]:
# Find best parameters for our data
search.fit(X,y)
search.best_params_

{'alpha': 1e-05, 'l1_ratio': 0.8}

In [373]:
abs(search.best_score_)

0.732694638178483

In [374]:
# Re-run elastic net using new parameters
elastic=ElasticNet(normalize=True,alpha=1e-05,l1_ratio=0.8)
elastic.fit(X,y)
fourth_model=(mean_squared_error(y_true=y,y_pred=elastic.predict(X)))
print(fourth_model)

0.7189976075895201


In [375]:
# regression coefficients 
print('Coefficients: \n', elastic.coef_) 
  
# variance score: 1 means perfect prediction 
print('Variance score: {}'.format(elastic.score(X_test, y_test)))   

Coefficients: 
 [ 0.02742201 -0.21175902  0.00162681  0.36438061 -0.01341763  0.07595504
 -0.01993467 -0.35887371  0.08795487  0.07210275  0.32958616]
Variance score: 0.2788710241926744


In [376]:
# Find coefficients for elastic net
coef_dict_baseline = {}
for coef, feat in zip(elastic.coef_,X.columns):
    coef_dict_baseline[feat] = coef
coef_dict_baseline

{'fixed acidity': 0.027422008174289674,
 'volatile acidity': -0.21175901732514635,
 'citric acid': 0.0016268083275489196,
 'residual sugar': 0.3643806111053035,
 'chlorides': -0.013417632270837363,
 'free sulfur dioxide': 0.07595504043534268,
 'total sulfur dioxide': -0.019934665497964695,
 'density': -0.35887370802269714,
 'pH': 0.08795486622029436,
 'sulphates': 0.07210275103591919,
 'alcohol': 0.3295861553836825}

In [9]:
# Data frame containing coefficients and MSE for each regression model
white_wine_coefs = "white_wine_coefficients.csv"

white_wine_coefs = pd.read_csv(white_wine_coefs, encoding="utf-8")
white_wine_coefs

Unnamed: 0,feature,linear,ridge,lasso,elastic net
0,fixed acidity,0.06243,0.031551,0.05453,0.027422
1,volitale acidity,-0.212048,-0.211633,-0.212395,-0.211759
2,citric acid,0.003019,0.002226,0.002138,0.001627
3,residual sugar,0.466653,0.374304,0.446373,0.364381
4,chlorides,-0.0061,-0.01315,-0.006906,-0.013418
5,free sulfur dioxide,0.071681,0.076836,0.071004,0.075955
6,total sulfur dioxide,-0.013712,-0.020906,-0.013013,-0.019935
7,density,-0.507528,-0.372967,-0.477957,-0.358874
8,pH,0.117021,0.091477,0.110356,0.087955
9,sulphates,0.081374,0.073408,0.079061,0.072103
