## 1. Imports

In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LassoCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

data = pd.read_csv("data/WineQT.csv")
data.columns = data.columns.str.replace(" ", "_")

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         1143 non-null   float64
 1   volatile_acidity      1143 non-null   float64
 2   citric_acid           1143 non-null   float64
 3   residual_sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free_sulfur_dioxide   1143 non-null   float64
 6   total_sulfur_dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64  
 12  Id                    1143 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 116.2 KB


## 2. Data preparation

In [6]:
target = data["quality"]

features = data.drop("quality", axis=1).drop("Id", axis=1)



X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    test_size = 0.2,
                                                    random_state = 23)

## 3. Modelling

In [14]:
models = [LinearRegression(), Lasso(), LassoCV(cv=5), RandomForestRegressor(), GradientBoostingRegressor()]

results = []
for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    rmse = np.sqrt(np.mean((y_test - preds)**2))
    results.append({"model": model, "rmse": rmse})

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,model,rmse
0,LinearRegression(),0.684036
1,Lasso(),0.869115
2,LassoCV(cv=5),0.695059
3,"(DecisionTreeRegressor(max_features='auto', ra...",0.609509
4,([DecisionTreeRegressor(criterion='friedman_ms...,0.659462


In [9]:
# the random forest seems the best performing model

# hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

random_grid = {"n_estimators": [5, 20, 50, 100],
               "max_features": ["auto", "sqrt"],
               "max_depth": [int(i) for i in np.linspace(10, 120, num=12)],
               "min_samples_split": [2, 6, 10],
               "min_samples_leaf": [1, 3, 4],
               "bootstrap": [True, False]}

rf_random = RandomizedSearchCV(estimator = RandomForestRegressor(), param_distributions=random_grid,
                               n_iter=100, cv=5, verbose=2, random_state=23, n_jobs=-1)

rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      120],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 3, 4],
                                        'min_samples_split': [2, 6, 10],
                                        'n_estimators': [5, 20, 50, 100]},
                   random_state=23, verbose=2)

In [11]:
# get best parameters
best_params = rf_random.best_params_
best_params

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 90,
 'bootstrap': True}

In [22]:
model = RandomForestRegressor(n_estimators=best_params.get("n_estimators"),
                              min_samples_split=best_params.get("min_samples_split"),
                              min_samples_leaf=best_params.get("min_samples_leaf"),
                              max_features=best_params.get("max_features"),
                              max_depth=best_params.get("max_depth"),
                              bootstrap=best_params.get("bootstrap"))

model.fit(X_train, y_train)

preds = model.predict(X_test)

rmse = np.sqrt(np.mean((y_test - preds)**2))

rmse

0.6171918374315555

In [26]:
# Visualise feature importance
import plotly.express as px

forest_importances = pd.Series(model.feature_importances_, index=X_train.columns)
px.bar(x=forest_importances.index, y=forest_importances)

In [27]:
data.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,Id
count,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0,1143.0
mean,8.311111,0.531339,0.268364,2.532152,0.086933,15.615486,45.914698,0.99673,3.311015,0.657708,10.442111,5.657043,804.969379
std,1.747595,0.179633,0.196686,1.355917,0.047267,10.250486,32.78213,0.001925,0.156664,0.170399,1.082196,0.805824,463.997116
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0,0.0
25%,7.1,0.3925,0.09,1.9,0.07,7.0,21.0,0.99557,3.205,0.55,9.5,5.0,411.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,37.0,0.99668,3.31,0.62,10.2,6.0,794.0
75%,9.1,0.64,0.42,2.6,0.09,21.0,61.0,0.997845,3.4,0.73,11.1,6.0,1209.5
max,15.9,1.58,1.0,15.5,0.611,68.0,289.0,1.00369,4.01,2.0,14.9,8.0,1597.0


In [28]:
new_input = np.array([[4.3, 1, 0.01, 1, 0.05, 5, 25, 0.9, 3, 0.6, 13]])

model.predict(new_input)[0]

5.74

In [78]:
# # Final model for predictions
# model = Lasso()
# model.fit(X_train, y_train)
# preds = model.predict(X_test)

# rmse = np.sqrt(np.mean((y_test - preds)**2))
# rmse

0.8691145628384873

In [79]:
# # selected columns by Lasso
# selected_coeffs = [False if i == 0 else True for i in model.coef_.tolist()]

# selected_columns = features.loc[:,selected_coeffs]
# selected_columns

Unnamed: 0,total_sulfur_dioxide
0,34.0
1,67.0
2,54.0
3,60.0
4,34.0
...,...
1138,40.0
1139,38.0
1140,44.0
1141,51.0


## 4. Predictions

In [80]:
# only input needed for predictions
SULFUR_DIOXIDE = 200

new_input = np.array([[0, 0, 0, 0, 0, 0, SULFUR_DIOXIDE, 0, 0, 0, 0]])

model.predict(new_input)[0]

5.079575167554518

## 5. Saving the model

In [81]:
import pickle

pickle.dump(model, open('LassoModel.pkl', 'wb'))

In [82]:
loaded_model = pickle.load(open('LassoModel.pkl', 'rb'))

In [83]:
# only input needed for predictions
SULFUR_DIOXIDE = 200

new_input = np.array([[0, 0, 0, 0, 0, 0, SULFUR_DIOXIDE, 0, 0, 0, 0]])

loaded_model.predict(new_input)[0]

5.079575167554518