In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pickle

# Load Data

In [2]:
df = pd.read_csv('../data/clean/clean.csv')

In [3]:
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [None]:
df.head

# Feature Exploration

## Overall Statistic

Let's calculate basic statistical features for all columns

In [4]:
for column in df.columns:
    print('Statistic for ', column)
    print(df[column].describe())
    print('---------------------')

Statistic for  fixed_acidity
count    1599.000000
mean        8.319637
std         1.741096
min         4.600000
25%         7.100000
50%         7.900000
75%         9.200000
max        15.900000
Name: fixed_acidity, dtype: float64
---------------------
Statistic for  volatile_acidity
count    1599.000000
mean        0.527821
std         0.179060
min         0.120000
25%         0.390000
50%         0.520000
75%         0.640000
max         1.580000
Name: volatile_acidity, dtype: float64
---------------------
Statistic for  citric_acid
count    1599.000000
mean        0.270976
std         0.194801
min         0.000000
25%         0.090000
50%         0.260000
75%         0.420000
max         1.000000
Name: citric_acid, dtype: float64
---------------------
Statistic for  residual_sugar
count    1599.000000
mean        2.538806
std         1.409928
min         0.900000
25%         1.900000
50%         2.200000
75%         2.600000
max        15.500000
Name: residual_sugar, dtype: float6

## Correlation between Features

In [5]:
correlation_matrix = df.corr()

In [6]:
correlation_matrix

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
fixed_acidity,1.0,-0.256131,0.671703,0.114777,0.093705,-0.153794,-0.113181,0.668047,-0.682978,0.183006,-0.061668,0.124052
volatile_acidity,-0.256131,1.0,-0.552496,0.001918,0.061298,-0.010504,0.07647,0.022026,0.234937,-0.260987,-0.202288,-0.390558
citric_acid,0.671703,-0.552496,1.0,0.143577,0.203823,-0.060978,0.035533,0.364947,-0.541904,0.31277,0.109903,0.226373
residual_sugar,0.114777,0.001918,0.143577,1.0,0.05561,0.187049,0.203028,0.355283,-0.085652,0.005527,0.042075,0.013732
chlorides,0.093705,0.061298,0.203823,0.05561,1.0,0.005562,0.0474,0.200632,-0.265026,0.37126,-0.221141,-0.128907
free_sulfur_dioxide,-0.153794,-0.010504,-0.060978,0.187049,0.005562,1.0,0.667666,-0.021946,0.070377,0.051658,-0.069408,-0.050656
total_sulfur_dioxide,-0.113181,0.07647,0.035533,0.203028,0.0474,0.667666,1.0,0.071269,-0.066495,0.042947,-0.205654,-0.1851
density,0.668047,0.022026,0.364947,0.355283,0.200632,-0.021946,0.071269,1.0,-0.341699,0.148506,-0.49618,-0.174919
pH,-0.682978,0.234937,-0.541904,-0.085652,-0.265026,0.070377,-0.066495,-0.341699,1.0,-0.196648,0.205633,-0.057731
sulphates,0.183006,-0.260987,0.31277,0.005527,0.37126,0.051658,0.042947,0.148506,-0.196648,1.0,0.093595,0.251397


In [8]:
fig = go.Figure(data=go.Heatmap(x=correlation_matrix.columns, y=correlation_matrix.columns,z=correlation_matrix))
fig.show()

## Relation between 'density' and 'fixed_acidity'

From the correlation matrix, we can see that there is a relatively high correlation between 'density' and 'fixed acidity'

In [20]:
fig = px.scatter(x=df.density, y=df.fixed_acidity)
fig.show()

## Relation between 'pH' and 'fixed_acidity'

From the correlation matrix, we can see that there is a high negativ correlation between 'density' and 'fixed acidity'

In [21]:
fig = px.scatter(x=df.pH, y=df.fixed_acidity)
fig.show()

## Distribution of 'Quality' values

In [23]:
fig = px.histogram(df, x="quality")
fig.show()

# Train Model for Predicting Quality of Wine

In [26]:
X = df.drop('quality', axis=1)
y = df.quality

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [31]:
regr = RandomForestRegressor()
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

## Evaluation

In [32]:
y_pred = regr.predict(X_test)

In [4]:
y_pred

NameError: name 'y_pred' is not defined

In [34]:
mean_squared_error(y_test, y_pred)

0.3737215625

# Save and Load Model

## Save Model

In [36]:
pickle.dump(regr, open('RandomForestRegressor.model', 'wb'))

## Load Model

In [37]:
loaded_regr = pickle.load(open('RandomForestRegressor.model', 'rb'))

## Use Loaded Model

In [42]:
sample = [ 9.6,  0.38,  0.4,  1.9,  0.07,  5, 13,  0.99659,  3.15,  0.7, 9.5]

In [43]:
loaded_regr.predict([sample])

array([5.47])