In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
import seaborn as sns

In [3]:
#  Read the CSV and Perform Basic Data Cleaning
df1 = pd.read_csv('data/winequality-red.csv', sep=';')
df2 = pd.read_csv('data/winequality-white.csv', sep=';')
# Drop the null columns where all values are null
df1 = df1.dropna(axis='columns', how='all')
df2 = df2.dropna(axis='columns', how='all')
# Drop the null rows
df1 = df1.dropna()
df2 = df2.dropna()
# Merge two dataframes
df = df1.merge(df2, how='outer')

# pick white/red
# df = df2 # pick white
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6


In [4]:
#Dividing wine as good and bad by giving the limit for the quality
bins = (2, 6.5, 9)
group_names = ['bad', 'good']
df['quality'] = pd.cut(df['quality'], bins = bins, labels = group_names)

In [5]:
#Now lets assign a labels to our quality variable
label_quality = LabelEncoder()

In [6]:
#Bad becomes 0 and good becomes 1 
df['quality'] = label_quality.fit_transform(df['quality'])

In [7]:
df['quality'].value_counts()

0    5219
1    1276
Name: quality, dtype: int64

In [8]:
#  Create a Train Test Split
from sklearn.model_selection import train_test_split
y = df["quality"]
X = df.drop(columns=["quality"])
# X = df[["citric acid", "chlorides", "alcohol"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [9]:
# Pre-processing
from sklearn.preprocessing import MinMaxScaler
X_scaler = MinMaxScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [21]:
print(X_test[:2])

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
6087            5.8             0.300         0.33             3.5      0.033   
1531            6.1             0.705         0.10             2.8      0.081   

      free sulfur dioxide  total sulfur dioxide  density   pH  sulphates  \
6087                 25.0                 116.0  0.99057  3.2       0.44   
1531                 13.0                  28.0  0.99631  3.6       0.66   

      alcohol  
6087     11.7  
1531     10.2  


In [23]:
print(X_train_scaled[:2])

[[0.31404959 0.016      0.26829268 0.00613497 0.03654485 0.11111111
  0.20046083 0.07383844 0.2519685  0.26589595 0.46376812]
 [0.41322314 0.232      0.3902439  0.02300613 0.1461794  0.13194444
  0.3202765  0.20030846 0.23622047 0.46820809 0.1884058 ]]


In [10]:
import tensorflow as tf
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping

In [11]:
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [12]:
y_train_categorical.shape

(4871, 2)

In [13]:
# Train the Model
model = Sequential()
model.add(Dense(units=200, activation='relu', input_dim=11))
model.add(Dense(units=200, activation='relu'))
# model.add(Dense(units=2, activation='sigmoid'))
model.add(Dense(units=2, activation='softmax'))

In [14]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [15]:
 model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 200)               2400      
_________________________________________________________________
dense_1 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 402       
Total params: 43,002
Trainable params: 43,002
Non-trainable params: 0
_________________________________________________________________


In [16]:
# set early stopping as callback
callbacks = [EarlyStopping(monitor='loss', patience=2)]
model.fit(
    X_train_scaled,
    y_train_categorical,
    callbacks=callbacks,
    epochs=90,
    shuffle=True,
    verbose=2
)

Epoch 1/90
153/153 - 1s - loss: 0.4407 - accuracy: 0.8041
Epoch 2/90
153/153 - 0s - loss: 0.3985 - accuracy: 0.8140
Epoch 3/90
153/153 - 0s - loss: 0.3898 - accuracy: 0.8150
Epoch 4/90
153/153 - 0s - loss: 0.3819 - accuracy: 0.8183
Epoch 5/90
153/153 - 0s - loss: 0.3785 - accuracy: 0.8177
Epoch 6/90
153/153 - 0s - loss: 0.3760 - accuracy: 0.8202
Epoch 7/90
153/153 - 0s - loss: 0.3656 - accuracy: 0.8247
Epoch 8/90
153/153 - 0s - loss: 0.3685 - accuracy: 0.8292
Epoch 9/90
153/153 - 0s - loss: 0.3608 - accuracy: 0.8300
Epoch 10/90
153/153 - 0s - loss: 0.3613 - accuracy: 0.8310
Epoch 11/90
153/153 - 0s - loss: 0.3562 - accuracy: 0.8329
Epoch 12/90
153/153 - 0s - loss: 0.3550 - accuracy: 0.8351
Epoch 13/90
153/153 - 0s - loss: 0.3499 - accuracy: 0.8362
Epoch 14/90
153/153 - 0s - loss: 0.3490 - accuracy: 0.8317
Epoch 15/90
153/153 - 0s - loss: 0.3486 - accuracy: 0.8325
Epoch 16/90
153/153 - 0s - loss: 0.3491 - accuracy: 0.8343
Epoch 17/90
153/153 - 0s - loss: 0.3451 - accuracy: 0.8370
Epoch 

<tensorflow.python.keras.callbacks.History at 0x213c9b1ebe0>

In [18]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

51/51 - 0s - loss: 0.3565 - accuracy: 0.8282
Normal Neural Network - Loss: 0.35647597908973694, Accuracy: 0.828201949596405


In [19]:
encoded_predictions = model.predict_classes(X_test_scaled[:20])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)



In [20]:
print(X_test_scaled[:2])

[[0.16528926 0.176      0.26829268 0.04447853 0.03986711 0.08333333
  0.25345622 0.06670522 0.36220472 0.12716763 0.53623188]
 [0.19008264 0.5        0.08130081 0.03374233 0.11960133 0.04166667
  0.05069124 0.17736649 0.67716535 0.25433526 0.31884058]]


In [18]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:20])}")

Predicted classes: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Actual Labels: [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]


In [19]:
# save fitted model to file
model.save("model.h5")