In [1]:
# Import Dependencies
import pandas as pd
import numpy as np

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# cd drive/My\ Drive/Project\ 3\ -\ Wine\ Quality

In [4]:
# Import White Wine Data
# Reads the white-wine CSV (path is relative to the notebook's working directory)
# into a DataFrame and previews the first rows.
# NOTE(review): the loaded frame carries an 'Unnamed: 0' column that looks like a
# row index saved into the CSV; it is never selected as a feature below.
# Consider pd.read_csv(..., index_col=0) -- confirm nothing relies on that column.
white_wine_data = pd.read_csv('data/wineQualityWhites.csv')
white_wine_data.head()

Unnamed: 0.1,Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,1,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,2,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,3,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,5,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
# Number of rows (wine samples) in the loaded dataset.
len(white_wine_data)

4898

In [6]:
# Column names available in the frame; the feature lists below are chosen from these.
white_wine_data.columns

Index(['Unnamed: 0', 'fixed.acidity', 'volatile.acidity', 'citric.acid',
       'residual.sugar', 'chlorides', 'free.sulfur.dioxide',
       'total.sulfur.dioxide', 'density', 'pH', 'sulphates', 'alcohol',
       'quality'],
      dtype='object')

In [7]:
# Baseline inputs: every measured attribute is used as a feature and the
# wine's quality score is the regression target.
all_feature_columns = [
    'fixed.acidity', 'volatile.acidity', 'citric.acid',
    'residual.sugar', 'chlorides', 'free.sulfur.dioxide',
    'total.sulfur.dioxide', 'density', 'pH', 'sulphates', 'alcohol',
]

y = white_wine_data["quality"]
X = white_wine_data[all_feature_columns]

# Sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)

(4898, 11) (4898,)


In [8]:
# Use train_test_split to create training and testing data
# (imported here; the later split cells in this notebook reuse this import)
from sklearn.model_selection import train_test_split

# No test_size is passed, so scikit-learn's default split proportion applies.
# random_state=42 fixes the shuffle so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
# Create the model using LinearRegression
# (imported here; the later model cells in this notebook reuse this import)
from sklearn.linear_model import LinearRegression

# Default-configured estimator; it is fitted in the next cell.
model = LinearRegression()

In [10]:
# Train on the training split only, then score both splits.
# For a scikit-learn regressor, .score() returns R^2 (coefficient of determination);
# a large train/test gap would indicate over-fitting.
model.fit(X_train, y_train)

training_score, testing_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

print(
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

Training Score: 0.2828919982151695
Testing Score: 0.2727562934675569


# Using all attribute variables, the model is weak: R² is only about 0.28 on the training data and 0.27 on the test data. Next approach: look at the p-values and remove variables that have no significant effect on quality.

In [11]:
# Build better model using Backward Elimination
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Design matrix for sm.OLS, which does not add an intercept on its own:
# column 0 is a constant 1 and columns 1-11 are the features in the order listed
# here. The elimination cells below select columns of X by these positions:
#   0 const            4 residual.sugar         8 density
#   1 fixed.acidity    5 chlorides              9 pH
#   2 volatile.acidity 6 free.sulfur.dioxide   10 sulphates
#   3 citric.acid      7 total.sulfur.dioxide  11 alcohol
ols_feature_columns = [
    'fixed.acidity', 'volatile.acidity', 'citric.acid',
    'residual.sugar', 'chlorides', 'free.sulfur.dioxide',
    'total.sulfur.dioxide', 'density', 'pH', 'sulphates', 'alcohol',
]

# Build from white_wine_data rather than from the current X. Appending to X itself
# made this cell non-idempotent (each re-run prepended another ones column and
# shifted every index above) and tied it to whatever X last held.
# NOTE(review): the matrix covers all rows, so the p-value based selection also
# sees the rows later held out as the test split.
X = np.append(
    arr=np.ones((len(white_wine_data), 1)),
    values=white_wine_data[ols_feature_columns],
    axis=1,
)

In [12]:
# Backward elimination, round 1: regress quality on the intercept (column 0)
# plus all eleven feature columns (1-11) of X.
kept_columns = list(range(12))
X_opt = X[:, kept_columns]
regressor_OLS = sm.OLS(exog=X_opt, endog=y).fit()
regressor_OLS.summary()
# Need to remove 3 (citric.acid): it has the largest p-value in this summary.

0,1,2,3
Dep. Variable:,quality,R-squared:,0.282
Model:,OLS,Adj. R-squared:,0.28
Method:,Least Squares,F-statistic:,174.3
Date:,"Mon, 30 Sep 2019",Prob (F-statistic):,0.0
Time:,21:12:56,Log-Likelihood:,-5543.7
No. Observations:,4898,AIC:,11110.0
Df Residuals:,4886,BIC:,11190.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,150.1928,18.804,7.987,0.000,113.328,187.057
x1,0.0655,0.021,3.139,0.002,0.025,0.106
x2,-1.8632,0.114,-16.373,0.000,-2.086,-1.640
x3,0.0221,0.096,0.231,0.818,-0.166,0.210
x4,0.0815,0.008,10.825,0.000,0.067,0.096
x5,-0.2473,0.547,-0.452,0.651,-1.319,0.824
x6,0.0037,0.001,4.422,0.000,0.002,0.005
x7,-0.0003,0.000,-0.756,0.450,-0.001,0.000
x8,-150.2842,19.075,-7.879,0.000,-187.679,-112.890

0,1,2,3
Omnibus:,114.161,Durbin-Watson:,1.621
Prob(Omnibus):,0.0,Jarque-Bera (JB):,251.637
Skew:,0.073,Prob(JB):,2.28e-55
Kurtosis:,4.101,Cond. No.,374000.0


In [13]:
# Backward elimination, round 2: same fit with column 3 (citric.acid) dropped.
# The summary relabels the remaining columns x1, x2, ... so its labels no longer
# match the original column positions in X.
kept_columns = [idx for idx in range(12) if idx != 3]
X_opt = X[:, kept_columns]
regressor_OLS = sm.OLS(exog=X_opt, endog=y).fit()
regressor_OLS.summary()
# Need to remove 5 (chlorides): it has the largest p-value in this summary.

0,1,2,3
Dep. Variable:,quality,R-squared:,0.282
Model:,OLS,Adj. R-squared:,0.28
Method:,Least Squares,F-statistic:,191.8
Date:,"Mon, 30 Sep 2019",Prob (F-statistic):,0.0
Time:,21:12:56,Log-Likelihood:,-5543.8
No. Observations:,4898,AIC:,11110.0
Df Residuals:,4887,BIC:,11180.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,149.9012,18.760,7.991,0.000,113.124,186.679
x1,0.0661,0.021,3.192,0.001,0.026,0.107
x2,-1.8677,0.112,-16.668,0.000,-2.087,-1.648
x3,0.0814,0.008,10.827,0.000,0.067,0.096
x4,-0.2338,0.543,-0.430,0.667,-1.299,0.831
x5,0.0037,0.001,4.434,0.000,0.002,0.005
x6,-0.0003,0.000,-0.747,0.455,-0.001,0.000
x7,-149.9866,19.029,-7.882,0.000,-187.292,-112.681
x8,0.6843,0.105,6.517,0.000,0.478,0.890

0,1,2,3
Omnibus:,114.042,Durbin-Watson:,1.621
Prob(Omnibus):,0.0,Jarque-Bera (JB):,251.382
Skew:,0.072,Prob(JB):,2.59e-55
Kurtosis:,4.1,Cond. No.,373000.0


In [14]:
# Backward elimination, round 3: columns 3 (citric.acid) and 5 (chlorides) dropped.
kept_columns = [idx for idx in range(12) if idx not in (3, 5)]
X_opt = X[:, kept_columns]
regressor_OLS = sm.OLS(exog=X_opt, endog=y).fit()
regressor_OLS.summary()
# Need to remove 7 (total.sulfur.dioxide): it has the largest p-value in this summary.

0,1,2,3
Dep. Variable:,quality,R-squared:,0.282
Model:,OLS,Adj. R-squared:,0.281
Method:,Least Squares,F-statistic:,213.1
Date:,"Mon, 30 Sep 2019",Prob (F-statistic):,0.0
Time:,21:12:56,Log-Likelihood:,-5543.9
No. Observations:,4898,AIC:,11110.0
Df Residuals:,4888,BIC:,11170.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,151.2557,18.492,8.179,0.000,115.003,187.509
x1,0.0675,0.020,3.303,0.001,0.027,0.108
x2,-1.8717,0.112,-16.761,0.000,-2.091,-1.653
x3,0.0821,0.007,11.148,0.000,0.068,0.096
x4,0.0037,0.001,4.422,0.000,0.002,0.005
x5,-0.0003,0.000,-0.753,0.451,-0.001,0.000
x6,-151.3981,18.743,-8.078,0.000,-188.142,-114.654
x7,0.6922,0.103,6.695,0.000,0.490,0.895
x8,0.6339,0.100,6.324,0.000,0.437,0.830

0,1,2,3
Omnibus:,114.166,Durbin-Watson:,1.621
Prob(Omnibus):,0.0,Jarque-Bera (JB):,251.721
Skew:,0.073,Prob(JB):,2.19e-55
Kurtosis:,4.101,Cond. No.,368000.0


In [15]:
# Backward elimination, round 4: columns 3 (citric.acid), 5 (chlorides) and
# 7 (total.sulfur.dioxide) dropped; the intercept and eight features remain.
kept_columns = [idx for idx in range(12) if idx not in (3, 5, 7)]
X_opt = X[:, kept_columns]
regressor_OLS = sm.OLS(exog=X_opt, endog=y).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,quality,R-squared:,0.282
Model:,OLS,Adj. R-squared:,0.281
Method:,Least Squares,F-statistic:,239.7
Date:,"Mon, 30 Sep 2019",Prob (F-statistic):,0.0
Time:,21:12:57,Log-Likelihood:,-5544.1
No. Observations:,4898,AIC:,11110.0
Df Residuals:,4889,BIC:,11160.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,154.1062,18.100,8.514,0.000,118.622,189.591
x1,0.0681,0.020,3.333,0.001,0.028,0.108
x2,-1.8881,0.110,-17.242,0.000,-2.103,-1.673
x3,0.0828,0.007,11.370,0.000,0.069,0.097
x4,0.0033,0.001,4.950,0.000,0.002,0.005
x5,-154.2913,18.344,-8.411,0.000,-190.254,-118.329
x6,0.6942,0.103,6.717,0.000,0.492,0.897
x7,0.6285,0.100,6.287,0.000,0.433,0.824
x8,0.1932,0.024,8.021,0.000,0.146,0.240

0,1,2,3
Omnibus:,114.194,Durbin-Watson:,1.621
Prob(Omnibus):,0.0,Jarque-Bera (JB):,251.255
Skew:,0.075,Prob(JB):,2.76e-55
Kurtosis:,4.099,Cond. No.,99500.0


# Create New Model from only predictive columns

In [16]:
# Reduced inputs: only the eight features kept by the p-value based elimination;
# the target is still the quality score.
selected_feature_columns = [
    'fixed.acidity', 'volatile.acidity',
    'residual.sugar', 'free.sulfur.dioxide',
    'density', 'pH', 'sulphates', 'alcohol',
]

y = white_wine_data["quality"]
X = white_wine_data[selected_feature_columns]

# Sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)

(4898, 8) (4898,)


In [17]:
# Use train_test_split to create training and testing data
# Same seed (random_state=42) as the other splits in this notebook;
# train_test_split itself was imported in an earlier cell.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [18]:
# Create the model using LinearRegression
# Rebinds `model` to a fresh, unfitted estimator for the reduced feature set.
model = LinearRegression()

In [19]:
# Train on the training split only, then score both splits.
# For a scikit-learn regressor, .score() returns R^2 (coefficient of determination).
model.fit(X_train, y_train)

training_score, testing_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

print(
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

Training Score: 0.28266373294992864
Testing Score: 0.2741089926312341


# Check strength of model based on density and alcohol

In [20]:
# Two-feature experiment: predict the quality score from density and alcohol only.
density_alcohol_columns = ['density', 'alcohol']

y = white_wine_data["quality"]
X = white_wine_data[density_alcohol_columns]

# Sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)

(4898, 2) (4898,)


In [21]:
# Use train_test_split to create training and testing data
# Same seed (random_state=42) as the other splits in this notebook;
# train_test_split itself was imported in an earlier cell.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [22]:
# Create the model using LinearRegression
# Rebinds `model` to a fresh, unfitted estimator for the density/alcohol features.
model = LinearRegression()

In [23]:
# Train on the training split only, then score both splits.
# For a scikit-learn regressor, .score() returns R^2 (coefficient of determination).
model.fit(X_train, y_train)

training_score, testing_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

print(
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

Training Score: 0.1885391656456661
Testing Score: 0.20285576685485185


# Check model relating residual.sugar and density

In [24]:
# Different question from the cells above: here the target is density (not quality)
# and the single predictor is residual.sugar.
# The one-element list keeps X two-dimensional, as the shape printed below shows.
sugar_columns = ['residual.sugar']

y = white_wine_data["density"]
X = white_wine_data[sugar_columns]

# Sanity check: X and y must have the same number of rows.
print(X.shape, y.shape)

(4898, 1) (4898,)


In [25]:
# Use train_test_split to create training and testing data
# Same seed (random_state=42) as the other splits in this notebook;
# train_test_split itself was imported in an earlier cell.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [26]:
# Create the model using LinearRegression
# Rebinds `model` to a fresh, unfitted estimator for the residual.sugar -> density fit.
model = LinearRegression()

In [27]:
# Train on the training split only, then score both splits.
# For a scikit-learn regressor, .score() returns R^2 (coefficient of determination).
model.fit(X_train, y_train)

training_score, testing_score = (
    model.score(X_train, y_train),
    model.score(X_test, y_test),
)

print(
    f"Training Score: {training_score}",
    f"Testing Score: {testing_score}",
    sep="\n",
)

Training Score: 0.7104476424322768
Testing Score: 0.6812611044515996
