## 1. Imports and data loading

In [74]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Read data/WineQT.csv and replace the spaces in its column headers with
# underscores so every column has an identifier-like name.
data = pd.read_csv("data/WineQT.csv").rename(
    columns=lambda name: name.replace(" ", "_")
)

In [75]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed_acidity         1143 non-null   float64
 1   volatile_acidity      1143 non-null   float64
 2   citric_acid           1143 non-null   float64
 3   residual_sugar        1143 non-null   float64
 4   chlorides             1143 non-null   float64
 5   free_sulfur_dioxide   1143 non-null   float64
 6   total_sulfur_dioxide  1143 non-null   float64
 7   density               1143 non-null   float64
 8   pH                    1143 non-null   float64
 9   sulphates             1143 non-null   float64
 10  alcohol               1143 non-null   float64
 11  quality               1143 non-null   int64  
 12  Id                    1143 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 116.2 KB


## 2. Data preparation

In [76]:
# The model inputs are every column except the target (`quality`) and `Id`.
features = data.drop(columns=["quality", "Id"])
target = data["quality"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=23
)

## 3. Modelling

In [77]:
# Candidate regressors to compare. The random forest is seeded (same seed as
# the train/test split) so its score is reproducible on Restart & Run All;
# an unseeded forest gives a slightly different RMSE on every run.
models = [LinearRegression(), Lasso(), RandomForestRegressor(random_state=23)]

results = []
for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Root-mean-squared error on the held-out split.
    rmse = np.sqrt(np.mean((y_test - preds)**2))
    results.append({"model": model, "rmse": rmse})

# One row per candidate: the fitted estimator and its test RMSE.
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,model,rmse
0,LinearRegression(),0.684036
1,Lasso(),0.869115
2,"(DecisionTreeRegressor(max_features='auto', ra...",0.620958


In [78]:
# Final model for predictions: a Lasso with default settings, fitted on the
# training split (fit() returns the estimator itself).
model = Lasso().fit(X_train, y_train)
preds = model.predict(X_test)

# Root-mean-squared error of the predictions on the held-out split.
squared_errors = (y_test - preds) ** 2
rmse = np.sqrt(np.mean(squared_errors))
rmse

0.8691145628384873

In [79]:
# Columns selected by the Lasso: a feature is kept when its fitted
# coefficient is non-zero.
selected_coeffs = (model.coef_ != 0).tolist()

selected_columns = features.loc[:, selected_coeffs]
selected_columns

Unnamed: 0,total_sulfur_dioxide
0,34.0
1,67.0
2,54.0
3,60.0
4,34.0
...,...
1138,40.0
1139,38.0
1140,44.0
1141,51.0


## 4. Predictions

In [80]:
# only input needed for predictions
SULFUR_DIOXIDE = 200

# Build a single-row frame with the training columns (all zero) and set the
# sulfur-dioxide value by name, so the input does not depend on the column's
# position and carries the same feature names the model was fitted with.
new_input = pd.DataFrame(0.0, index=[0], columns=features.columns)
new_input.loc[0, "total_sulfur_dioxide"] = SULFUR_DIOXIDE

model.predict(new_input)[0]

5.079575167554518

## 5. Saving the model

In [81]:
import pickle

# Persist the fitted model. The context manager guarantees the file handle
# is flushed and closed once the dump finishes.
with open('LassoModel.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [82]:
# Reload the pickled model, closing the file handle when done.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files that
# come from a trusted source (here: the file written by this notebook).
with open('LassoModel.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [83]:
# only input needed for predictions
SULFUR_DIOXIDE = 200

# Single-row frame with the training columns (all zero); the sulfur-dioxide
# value is set by name rather than by position in a hand-written array.
new_input = pd.DataFrame(0.0, index=[0], columns=features.columns)
new_input.loc[0, "total_sulfur_dioxide"] = SULFUR_DIOXIDE

# The reloaded model should reproduce the prediction of the in-memory one.
loaded_model.predict(new_input)[0]

5.079575167554518