In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
df = pd.read_csv('data/winequality-white.csv',sep=';')
# df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv',sep=';')
df.head()

Unnamed: 0,"fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality"
0,"7.4,0.7,0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5"
1,"7.8,0.88,0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5"
2,"7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,..."
3,"11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58..."
4,"7.4,0.7,0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5"


In [3]:
df.describe()

Unnamed: 0,"fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality"
count,1599
unique,1359
top,"7.2,0.36,0.46,2.1,0.074,24,44,0.99534,3.4,0.85..."
freq,4


In [4]:
correlations = df.corr()['quality'].drop('quality')
print(correlations)

KeyError: 'quality'

In [None]:
sns.heatmap(df.corr())
plt.show()

In [None]:
def get_features(correlation_threshold):
    abs_corrs = correlations.abs()
    high_correlations = abs_corrs[abs_corrs > correlation_threshold].index.values.tolist()
    return high_correlations

In [None]:
# taking features with correlation more than 0.05 as input x and quality as target variable y 
features = get_features(0.05) 
print(features) 
X = df[features] 
y = df['quality']

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=99)

Simple Linear Regression Approach

In [None]:
# Fit linear regression to training data
lr = LinearRegression()
lr.fit(X_train,y_train)

# Predict
train_pred=lr.predict(X_train)
lr_pred = lr.predict(X_test)

print('Mean Absolute Error:', mean_absolute_error(y_test, lr_pred))
print('Mean Squared Error:', mean_squared_error(y_test, lr_pred))

In [None]:
# displaying coefficients of each feature
coeffecients = pd.DataFrame(lr.coef_,features)
coeffecients.columns = ['Coeffecient'] 
print(coeffecients)

Ridge Regression Approach

We'll generate an array of alpha values ranging from very big to very small, essentially covering the full range of scenarios from the null model containing only the intercept, to the least squares fit:

In [None]:
alphas = 10**np.linspace(10,-2,100)*0.5
alphas

In [None]:
ridgecv = RidgeCV(alphas = alphas, scoring = 'neg_mean_squared_error', normalize = True)
ridgecv.fit(X_train, y_train)
ridgecv.alpha_

In [None]:
ridge = Ridge(alpha = ridgecv.alpha_, normalize = True)
ridge.fit(X_train, y_train)

ridge_pred = ridge.predict(X_test)

print('Mean Absolute Error:', mean_absolute_error(y_test, ridge_pred))
print('Mean Squared Error:', mean_squared_error(y_test, ridge_pred))

In [None]:
# displaying coefficients of each feature
coeffecients = pd.DataFrame(ridge.coef_,features)
coeffecients.columns = ['Coeffecient'] 
print(coeffecients)

LASSO Regression Approach

In [None]:
lassocv = LassoCV(alphas=None, cv=10, max_iter=100000, normalize = True)
lassocv.fit(X_train, y_train)
lassocv.alpha_

In [None]:
lasso = Lasso(alpha = lassocv.alpha_, normalize = True)
lasso.fit(X_train, y_train)

lasso_pred = lasso.predict(X_test)

print('Mean Absolute Error:', mean_absolute_error(y_test, lasso_pred))
print('Mean Squared Error:', mean_squared_error(y_test, lasso_pred))

In [None]:
# displaying coefficients of each feature
coeffecients = pd.DataFrame(lasso.coef_,features)
coeffecients.columns = ['Coeffecient'] 
print(coeffecients)