In [1]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import shap

In [2]:
# Read the red-wine quality CSV from the local archive folder, report its
# (rows, columns) dimensions, and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='archive/winequality-red.csv')
print(df.shape)
df.head(n=5)

(1599, 12)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Per-column count of missing values (column-wise sum of the null mask).
df.isnull().sum(axis=0)

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [4]:
# Hold out 20% of the rows for testing. Every column except 'quality' is a
# feature; 'quality' is the label. The fixed seed keeps the split repeatable.
wine_features = df.drop('quality', axis=1)
wine_quality = df['quality']
train_x, test_x, train_y, test_y = train_test_split(
    wine_features,
    wine_quality,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Sanity-check the split: shapes of the training and test feature frames.
(train_x.shape, test_x.shape)

((1279, 11), (320, 11))

In [6]:
# Baseline network: two hidden layers (10 and 5 units), L-BFGS solver,
# alpha = 0.01, fixed seed. fit() returns the estimator itself, so ending the
# cell with `model` displays the fitted estimator.
model = MLPClassifier(
    hidden_layer_sizes=(10, 5),
    alpha=1e-2,
    solver='lbfgs',
    random_state=1,
).fit(train_x, train_y)
model

MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(10, 5), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [7]:
# Predicted quality label for every row of the held-out features.
model.predict(X=test_x)

array([5, 5, 6, 5, 6, 5, 5, 5, 6, 6, 6, 5, 5, 5, 5, 6, 5, 5, 6, 5, 6, 5,
       6, 6, 5, 5, 6, 5, 5, 6, 5, 6, 6, 5, 5, 5, 6, 6, 6, 6, 6, 5, 6, 5,
       6, 6, 6, 6, 5, 6, 5, 5, 6, 6, 5, 5, 6, 6, 6, 5, 5, 6, 5, 5, 6, 5,
       6, 5, 6, 5, 6, 5, 6, 6, 6, 5, 6, 6, 6, 6, 5, 6, 5, 6, 6, 6, 5, 6,
       6, 5, 6, 5, 6, 6, 5, 6, 5, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 5, 6, 5,
       6, 5, 6, 5, 6, 6, 6, 5, 5, 6, 6, 5, 6, 5, 5, 5, 6, 6, 6, 6, 6, 5,
       5, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 5, 6, 6, 5, 6,
       6, 6, 5, 6, 5, 6, 6, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 5, 6, 5, 5, 6,
       6, 5, 5, 6, 5, 6, 5, 6, 5, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 5, 5, 5,
       5, 6, 5, 5, 5, 5, 6, 6, 5, 5, 5, 6, 6, 5, 6, 6, 6, 6, 5, 5, 6, 5,
       5, 6, 6, 6, 5, 5, 5, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 5, 5, 6, 5,
       6, 6, 6, 6, 6, 5, 6, 5, 6, 6, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 5, 6,
       6, 6, 5, 5, 6, 6, 5, 6, 5, 5, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5, 5, 6,
       5, 6, 5, 6, 6, 6, 6, 5, 5, 6, 5, 6, 6, 6, 6,

In [8]:
# Accuracy of the current model on the held-out split.
model.score(X=test_x, y=test_y)

0.559375

In [9]:
# Coarse sweep of the regularisation strength for the (10, 5) network:
# alpha = step * 1e-5 for step in 1..99. A line is printed each time the
# accuracy on test_x/test_y beats the best value seen so far.
# NOTE(review): the winner is picked by test-set accuracy, so test scores
# reported later in the notebook are optimistically biased; a validation split
# or cross-validation on the training data would avoid that.
previous_best = 0
for step in range(1, 100):
    model = MLPClassifier(
        hidden_layer_sizes=(10, 5),
        alpha=step*10**-5,
        solver='lbfgs',
        random_state=1,
    ).fit(train_x, train_y)
    accuracy = model.score(test_x, test_y)
    if accuracy <= previous_best:
        continue
    previous_best = accuracy
    print(step, accuracy)

1 0.515625
2 0.55
15 0.553125
23 0.55625
30 0.565625


In [10]:
# Exhaustive search over three hidden-layer widths with alpha fixed at 3e-4:
# first layer 10..30, second layer 5..15, third layer 3..10. Each time the
# accuracy on test_x/test_y improves on the best seen in this cell, the widths
# and the score are printed as one tuple.
# NOTE(review): selection is again driven by test-set accuracy (see the alpha
# sweep), so the printed scores are not an unbiased estimate.
previous_best = 0
for first in range(10, 31):
    for second in range(5, 16):
        for third in range(3, 11):
            layers = (first, second, third)
            model = MLPClassifier(
                hidden_layer_sizes=layers,
                alpha=3e-4,
                solver='lbfgs',
                random_state=1,
            ).fit(train_x, train_y)
            accuracy = model.score(test_x, test_y)
            if accuracy > previous_best:
                previous_best = accuracy
                print(layers + (accuracy,))

(10, 5, 3, 0.546875)
(10, 5, 9, 0.5625)
(10, 6, 7, 0.571875)
(10, 13, 7, 0.575)
(10, 13, 10, 0.58125)
(12, 8, 8, 0.5875)
(13, 12, 3, 0.59375)
(17, 10, 10, 0.596875)


In [11]:
# Second alpha sweep, now on the (17, 10, 10) network: alpha = step * 1e-5 for
# step in 1..500. previous_best is not reset here, so it carries over from the
# earlier search cell (which must have been run first) and only alphas that
# beat that score are printed.
for step in range(1, 501):
    model = MLPClassifier(
        hidden_layer_sizes=(17, 10, 10),
        alpha=step*10**-5,
        solver='lbfgs',
        random_state=1,
    ).fit(train_x, train_y)
    accuracy = model.score(test_x, test_y)
    if accuracy <= previous_best:
        continue
    previous_best = accuracy
    print(step, accuracy)

5 0.6
29 0.6125
485 0.615625


In [12]:
# Second pass over hidden-layer widths with alpha fixed at 4.85e-3: first
# layer 15..25, second and third layers 7..13. previous_best is not reset, so
# a tuple is printed only if a layout beats the best score carried over from
# the earlier search cells.
for first in range(15, 26):
    for second in range(7, 14):
        for third in range(7, 14):
            layers = (first, second, third)
            model = MLPClassifier(
                hidden_layer_sizes=layers,
                alpha=4.85e-3,
                solver='lbfgs',
                random_state=1,
            ).fit(train_x, train_y)
            accuracy = model.score(test_x, test_y)
            if accuracy > previous_best:
                previous_best = accuracy
                print(layers + (accuracy,))

In [16]:
# Final model: the (17, 10, 10) network with alpha = 4.85e-3, i.e. the best
# combination printed by the searches above. The cell's value is its accuracy
# on the held-out split.
model1 = MLPClassifier(
    hidden_layer_sizes=(17, 10, 10),
    alpha=4.85e-3,
    solver='lbfgs',
    random_state=1,
).fit(train_x, train_y)
model1.score(test_x, test_y)

0.615625

In [17]:
# Comparison to a RFC model
# n_estimators is pinned to 10, the default in the scikit-learn release this
# notebook was run on. Leaving it implicit raises a FutureWarning and would
# silently switch the forest to 100 trees (and a different score) from
# scikit-learn 0.22 onwards. Raise it deliberately if a stronger baseline is
# wanted.
model2 = RandomForestClassifier(n_estimators=10, random_state=1)
model2.fit(train_x, train_y)
model2.score(test_x, test_y)

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.


0.671875

In [15]:
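# SHAP plotting is disabled: shap.TreeExplainer only handles tree-based models,
# so it cannot explain the MLPClassifier stored in model1.
# (The training features are named train_x in this notebook.)
#shap_values = shap.TreeExplainer(model1).shap_values(train_x)
#shap.summary_plot(shap_values, train_x, plot_type='bar')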