# Model training

This notebook contains only the training process; no data exploration is done here.

The target variable is `quality`. Each quality score is treated as a separate category, so the problem is framed as multi-class classification rather than regression.

In [1]:
import joblib
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the wine-quality dataset from the CSV file next to this notebook.
df = pd.read_csv(filepath_or_buffer="winequality.csv")

In [3]:
# Preview the first five rows to check that the file was parsed as expected.
df.head(n=5)

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [24]:
# Sanity check: display the rows whose quality score equals 7.
df.loc[df["quality"] == 7]

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
13,white,6.6,0.16,0.40,1.5,0.044,48.0,143.0,0.99120,3.54,0.52,12.40,7
15,white,6.6,0.17,0.38,1.5,0.032,28.0,112.0,0.99140,3.25,0.55,11.40,7
21,white,6.4,0.31,0.38,2.9,0.038,19.0,102.0,0.99120,3.17,0.35,11.00,7
29,white,7.2,0.32,0.36,2.0,0.033,37.0,114.0,0.99060,3.10,0.71,12.30,7
45,white,7.4,0.18,0.31,1.4,0.058,38.0,167.0,0.99310,3.16,0.53,10.00,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6432,red,6.6,0.56,0.14,2.4,0.064,13.0,29.0,0.99397,3.42,0.62,11.70,7
6439,red,7.4,0.25,0.29,2.2,0.054,19.0,49.0,0.99666,3.40,0.76,10.90,7
6442,red,8.4,0.37,0.43,2.3,0.063,12.0,19.0,0.99550,3.17,0.81,11.20,7
6453,red,7.0,0.56,0.17,1.7,0.065,15.0,24.0,0.99514,3.44,0.68,10.55,7


In [4]:
# Target: the wine quality score, used as a class label.
y = df["quality"]

# Features: every remaining column except the target and the wine type.
# "Unnamed: 0" is also removed: as the df.head() preview shows, it is only the
# row index that was saved into the CSV, not a property of the wine, so keeping
# it would let the model learn from row position.
# errors="ignore" keeps this cell working if the CSV is loaded without that column.
X = (
    df.drop(columns=["quality", "type"])
      .drop(columns=["Unnamed: 0"], errors="ignore")
)

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Training pipeline, applied in order:
#   1. imputer    - replaces missing values with the column mean,
#   2. scaler     - standardizes every feature,
#   3. classifier - random forest predicting the quality class.
# The forest is seeded with the same value as the train/test split so that the
# fitted model and the reported accuracy are reproducible across runs.
model = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)

In [7]:
# Fit the whole pipeline (imputer, scaler, classifier) on the training split.
model.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split and report it with two decimals.
test_accuracy = model.score(X_test, y_test)
print("Accuracy: {:.2f}".format(test_accuracy))

Accuracy: 0.70


In [8]:
# Persist the fitted pipeline to "model.joblib" (path relative to the working directory).
joblib.dump(model, filename="model.joblib")

['model.joblib']