In [9]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

In [53]:
# Location of the wine-quality CSV. The original machine-specific path is kept as the
# default so existing runs behave exactly as before; set the WINE_QUALITY_CSV
# environment variable to point at the file on any other machine.
DATA_PATH = os.environ.get("WINE_QUALITY_CSV", 'C:/Users/Ilsaf/Desktop/winequality-white.csv')

# The file is parsed with ';' as the field separator.
df = pd.read_csv(DATA_PATH, sep=";")

In [54]:
# Display the column labels of the loaded frame (features plus the `quality` column).
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [55]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [56]:
# Distinct values taken by the target column, in order of first appearance.
df["quality"].unique()

array([6, 5, 7, 8, 4, 3, 9], dtype=int64)

In [133]:
# Target vector: the `quality` column.
y = df["quality"]
# Feature matrix: every remaining column.
X = df.drop(columns="quality")

In [134]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [135]:
# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits so no test-set information enters the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [136]:
# Ordinary least squares on the standardized training features (`fit` returns the
# estimator itself, so construction and fitting can be chained).
lr = LinearRegression(fit_intercept=True).fit(X_train_scaled, y_train)
lr

In [137]:
# Training-set MSE of the linear regression.
mean_squared_error(y_true=y_train, y_pred=lr.predict(X_train_scaled))

0.5580606489803572

In [138]:
# Held-out MSE of the linear regression.
mean_squared_error(y_true=y_test, y_pred=lr.predict(X_test_scaled))

0.5842473102404546

In [139]:
# Rank features by the absolute value of their linear-regression coefficient.
# Labels come from X_train (the frame the scaled training matrix was derived from)
# instead of the positional slice df.columns[:-1], which would silently mislabel the
# coefficients if `quality` were not the last column of df.
pd.DataFrame(
    {"columns": X_train.columns, "coef": np.abs(lr.coef_)}
).sort_values("coef", ascending=False)

Unnamed: 0,columns,coef
7,density,0.66572
3,residual sugar,0.538164
1,volatile acidity,0.19226
8,pH,0.150036
10,alcohol,0.129533
0,fixed acidity,0.097822
9,sulphates,0.062053
5,free sulfur dioxide,0.04218
6,total sulfur dioxide,0.014304
4,chlorides,0.008127


In [140]:
# L1-regularized regression with a fixed penalty strength, fitted on the
# standardized training features.
lasso = Lasso(alpha=0.01, random_state=17).fit(X_train_scaled, y_train)
lasso

In [141]:
# Rank features by the absolute value of their Lasso coefficient; a value of 0 means
# the penalty removed that feature from the model.
# Labels come from X_train (the frame the scaled training matrix was derived from)
# instead of the positional slice df.columns[:-1], which would silently mislabel the
# coefficients if `quality` were not the last column of df.
pd.DataFrame(
    {"columns": X_train.columns, "coef": np.abs(lasso.coef_)}
).sort_values("coef", ascending=False)

Unnamed: 0,columns,coef
10,alcohol,0.322425
3,residual sugar,0.256363
7,density,0.235492
1,volatile acidity,0.188479
8,pH,0.067277
5,free sulfur dioxide,0.043088
9,sulphates,0.029722
4,chlorides,0.002747
0,fixed acidity,0.0
2,citric acid,0.0


In [145]:
# Candidate penalty strengths: 200 log-spaced values from 1e-6 to 1e2.
alphas = np.logspace(start=-6, stop=2, num=200)

In [146]:
# Lasso whose penalty strength is chosen from `alphas` by 5-fold cross-validation.
lassoCV = LassoCV(
    alphas=alphas,
    cv=5,
    random_state=17,
)

In [147]:
# Run the cross-validated search and fit on the standardized training features.
lassoCV.fit(X=X_train_scaled, y=y_train)

In [148]:
# Rank features by the absolute value of their coefficient in the cross-validated Lasso.
# Labels come from X_train (the frame the scaled training matrix was derived from)
# instead of the positional slice df.columns[:-1], which would silently mislabel the
# coefficients if `quality` were not the last column of df.
pd.DataFrame(
    {"columns": X_train.columns, "coef": np.abs(lassoCV.coef_)}
).sort_values("coef", ascending=False)

Unnamed: 0,columns,coef
7,density,0.648161
3,residual sugar,0.526883
1,volatile acidity,0.192049
8,pH,0.146549
10,alcohol,0.137115
0,fixed acidity,0.093295
9,sulphates,0.060939
5,free sulfur dioxide,0.042698
6,total sulfur dioxide,0.012969
4,chlorides,0.006933


In [149]:
# Training-set MSE of the cross-validated Lasso.
mean_squared_error(y_true=y_train, y_pred=lassoCV.predict(X_train_scaled))

0.558070014187378

In [150]:
# Held-out MSE of the cross-validated Lasso.
mean_squared_error(y_true=y_test, y_pred=lassoCV.predict(X_test_scaled))

0.5832976077860635

In [183]:
# Random forest with library-default hyperparameters and a fixed seed, fitted on the
# standardized training features (`fit` returns the estimator itself).
forest = RandomForestRegressor(random_state=17).fit(X_train_scaled, y_train)
forest

In [184]:
# Training-set MSE of the default forest.
mean_squared_error(y_true=y_train, y_pred=forest.predict(X_train_scaled))

0.05261155192532089

In [185]:
# Mean cross-validated score of the default forest on the training split.
# The scorer is *negative* MSE, so the value is <= 0 and closer to 0 is better.
# NOTE(review): this uses the unscaled X_train while `forest` above was fitted on
# X_train_scaled — confirm that the mix is intentional.
np.mean(
    cross_val_score(forest, X_train, y_train, scoring='neg_mean_squared_error')
)

-0.41433128807644015

In [187]:
# Held-out MSE of the default forest.
mean_squared_error(y_true=y_test, y_pred=forest.predict(X_test_scaled))

0.37163775510204083

In [165]:
# Hyperparameter grid for the forest search: every combination of the values below
# is a candidate (15 depths x 7 leaf sizes x 6 feature counts).
forest_params = dict(
    max_depth=[*range(10, 25)],
    min_samples_leaf=[*range(1, 8)],
    max_features=[*range(6, 12)],
)

In [166]:
# Exhaustive cross-validated search over `forest_params`, scored by negative MSE and
# parallelized across all available cores. Fitted on the unscaled training features.
locally_best_forest = GridSearchCV(
    estimator=forest,
    param_grid=forest_params,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
locally_best_forest.fit(X_train, y_train)

In [172]:
# Hyperparameter combination selected by the grid search.
locally_best_forest.best_params_

{'max_depth': 21, 'max_features': 6, 'min_samples_leaf': 1}

In [173]:
# Mean cross-validated score of the selected forest on the training split.
# The scorer is *negative* MSE, so the value is <= 0 and closer to 0 is better.
np.mean(
    cross_val_score(
        locally_best_forest.best_estimator_,
        X_train,
        y_train,
        scoring='neg_mean_squared_error',
    )
)

-0.3978585793392684

In [176]:
# Short alias for the estimator chosen by the grid search.
best_forest = locally_best_forest.best_estimator_

In [177]:
# Held-out MSE of the tuned forest; the grid search was fitted on unscaled features,
# so the unscaled test features are used here as well.
mean_squared_error(y_true=y_test, y_pred=best_forest.predict(X_test))

0.36575122387834635

In [171]:
# Preview the standardized test features with readable column labels.
# Labels come from X_test (the frame X_test_scaled was computed from) instead of the
# positional slice df.columns[:-1], which would silently mislabel the columns if
# `quality` were not the last column of df.
pd.DataFrame(X_test_scaled, columns=X_test.columns).head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.402427,-0.299522,-0.438812,1.622733,0.389351,1.132334,1.554483,1.580995,-0.453747,-0.965416,-1.24867
1,-0.295152,-0.299522,-0.114603,-0.146263,-0.303727,-1.180352,-1.63579,-0.801434,-1.51571,0.28108,0.45788
2,0.169901,-1.575365,-0.438812,-0.005547,0.475986,-1.064717,-0.820756,0.58974,2.068416,-0.431204,-0.761084
3,-1.341519,-0.005096,0.209605,-0.910148,-0.217093,0.149442,-1.123483,-1.351752,0.541844,4.376708,1.107994
4,-0.178888,0.191188,0.128553,-0.990556,5.804029,0.033808,0.506583,-0.093884,-0.52012,0.45915,-0.923613


In [181]:
# Rank features by the tuned forest's feature importances.
# - Labels come from X_train (the frame the grid search was fitted on) instead of the
#   positional slice df.columns[:-1], which would silently mislabel the rows if
#   `quality` were not the last column of df.
# - No np.abs(): scikit-learn's feature_importances_ are documented as non-negative,
#   so taking the absolute value was a no-op.
# - The "coef" key is kept for parity with the tables above, but these values are
#   importances, not regression coefficients.
pd.DataFrame(
    {"columns": X_train.columns, "coef": best_forest.feature_importances_}
).sort_values("coef", ascending=False)

Unnamed: 0,columns,coef
10,alcohol,0.206056
1,volatile acidity,0.117578
5,free sulfur dioxide,0.111556
7,density,0.088549
8,pH,0.073659
6,total sulfur dioxide,0.07364
4,chlorides,0.073366
3,residual sugar,0.072072
2,citric acid,0.062601
0,fixed acidity,0.061813
