In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
import seaborn as sns

In [2]:
# Load the combined wine dataset and clean it in a single chain:
#   1. drop every column whose values are all null,
#   2. drop every remaining row that contains at least one null.
df = (
    pd.read_csv('../data/combined_wine.csv', sep=',')
    .dropna(axis='columns', how='all')
    .dropna()
)

df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1


In [3]:
# Split the wines into two quality classes by cutting the numeric score:
# scores in (2, 6.5] are labelled 'bad' and scores in (6.5, 9] 'good'.
group_names = ['bad', 'good']
bins = (2, 6.5, 9)
df['quality'] = pd.cut(x=df['quality'], bins=bins, labels=group_names)

In [4]:
# Create the encoder used to turn the quality labels into integer codes
label_quality = LabelEncoder()

In [5]:
# Replace the quality labels with integer codes: 'bad' becomes 0 and 'good' becomes 1
df['quality'] = label_quality.fit_transform(df['quality'])

In [6]:
# Show how many rows fall in each encoded quality class (class balance)
df['quality'].value_counts()

0    5220
1    1277
Name: quality, dtype: int64

In [7]:
# Create the train/test split
from sklearn.model_selection import train_test_split

# The encoded quality column is the target; every other column is a feature.
X, y = df.drop(columns=["quality"]), df["quality"]

# Stratify on y so both partitions keep the same class proportions, and pin
# random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [8]:
# Pre-processing: learn the min/max scaling from the training rows only,
# then apply that same fitted scaler to both splits.
from sklearn.preprocessing import MinMaxScaler

X_scaler = MinMaxScaler()
X_scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [9]:
# Peek at the first two unscaled test rows
print(X_test.head(2))

      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
2168            6.5              0.28         0.27             5.2      0.040   
5880            5.7              0.26         0.24            17.8      0.059   

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
2168                 44.0                 179.0  0.99480  3.19       0.69   
5880                 23.0                 124.0  0.99773  3.30       0.50   

      alcohol  type  
2168      9.4     0  
5880     10.1     0  


In [10]:
# Peek at the first two scaled training rows
print(X_train_scaled[:2])

[[0.25619835 0.128      0.29268293 0.30981595 0.03654485 0.13541667
  0.30645161 0.2        0.37209302 0.0625     0.43478261 0.        ]
 [0.21487603 0.128      0.21138211 0.11656442 0.07475083 0.15972222
  0.40552995 0.15911283 0.31007752 0.15909091 0.2173913  0.        ]]


In [11]:
import tensorflow as tf
from keras.models import Sequential
from keras.utils import to_categorical
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping

In [12]:
# Fit an encoder on the training targets and map both splits to integer
# class ids with it.
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train, encoded_y_test = map(
    label_encoder.transform, (y_train, y_test)
)

# Expand the integer class ids into one-hot rows (one column per class).
y_train_categorical, y_test_categorical = (
    to_categorical(class_ids)
    for class_ids in (encoded_y_train, encoded_y_test)
)

In [13]:
# Sanity check: (number of training rows, number of one-hot class columns)
y_train_categorical.shape

(4872, 2)

In [14]:
# Define the network (training happens in a later cell): two 200-unit ReLU
# hidden layers followed by a 2-unit softmax layer, one output per one-hot
# quality class.
# The input width is read from the scaled training matrix instead of being
# hard-coded, so the model still builds if feature columns are added or removed.
n_features = X_train_scaled.shape[1]

model = Sequential()
model.add(Dense(units=200, activation='relu', input_dim=n_features))
model.add(Dense(units=200, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [15]:
# Compile the model (fitting is done in a later cell): categorical
# cross-entropy matches the one-hot targets, and accuracy is tracked
# as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [16]:
 # Print the layer-by-layer architecture and parameter counts
 model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 200)               2600      
_________________________________________________________________
dense_1 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 402       
Total params: 43,202
Trainable params: 43,202
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Early stopping: halt once the training loss has gone two epochs without
# improving, so fewer than the 90 requested epochs may actually run.
# NOTE(review): the callback watches the training loss, not a validation
# loss — confirm this is the intended stopping criterion.
early_stopping = EarlyStopping(patience=2, monitor='loss')
callbacks = [early_stopping]

model.fit(
    x=X_train_scaled,
    y=y_train_categorical,
    epochs=90,
    shuffle=True,
    verbose=2,
    callbacks=callbacks,
)

Epoch 1/90
153/153 - 1s - loss: 0.4442 - accuracy: 0.8023
Epoch 2/90
153/153 - 0s - loss: 0.3999 - accuracy: 0.8153
Epoch 3/90
153/153 - 0s - loss: 0.3932 - accuracy: 0.8210
Epoch 4/90
153/153 - 0s - loss: 0.3823 - accuracy: 0.8235
Epoch 5/90
153/153 - 0s - loss: 0.3803 - accuracy: 0.8239
Epoch 6/90
153/153 - 0s - loss: 0.3755 - accuracy: 0.8259
Epoch 7/90
153/153 - 0s - loss: 0.3677 - accuracy: 0.8286
Epoch 8/90
153/153 - 0s - loss: 0.3667 - accuracy: 0.8290
Epoch 9/90
153/153 - 0s - loss: 0.3615 - accuracy: 0.8298
Epoch 10/90
153/153 - 0s - loss: 0.3585 - accuracy: 0.8311
Epoch 11/90
153/153 - 0s - loss: 0.3520 - accuracy: 0.8350
Epoch 12/90
153/153 - 0s - loss: 0.3524 - accuracy: 0.8378
Epoch 13/90
153/153 - 0s - loss: 0.3503 - accuracy: 0.8360
Epoch 14/90
153/153 - 0s - loss: 0.3517 - accuracy: 0.8309
Epoch 15/90
153/153 - 0s - loss: 0.3471 - accuracy: 0.8403
Epoch 16/90
153/153 - 0s - loss: 0.3473 - accuracy: 0.8403
Epoch 17/90
153/153 - 0s - loss: 0.3427 - accuracy: 0.8436
Epoch 

<tensorflow.python.keras.callbacks.History at 0x16ba8c0c748>

In [18]:
# Score the trained network on the held-out split; evaluate() yields the
# loss followed by the accuracy metric the model was compiled with.
model_loss, model_accuracy = model.evaluate(
    x=X_test_scaled,
    y=y_test_categorical,
    verbose=2,
)
print(f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

51/51 - 0s - loss: 0.3590 - accuracy: 0.8320
Normal Neural Network - Loss: 0.3590450584888458, Accuracy: 0.8320000171661377


In [19]:
# Predict the class of the first 20 test rows.
# Sequential.predict_classes() was removed in TensorFlow 2.6; for a softmax
# output layer the equivalent is the argmax over the predicted probabilities.
class_probabilities = model.predict(X_test_scaled[:20], verbose=0)
encoded_predictions = class_probabilities.argmax(axis=-1)
# Map the integer class ids back to the original label values.
prediction_labels = label_encoder.inverse_transform(encoded_predictions)



In [20]:
# Peek at the first two scaled test rows
print(X_test_scaled[:2])

[[0.2231405  0.16       0.2195122  0.07055215 0.05149502 0.14930556
  0.39861751 0.14792671 0.36434109 0.26704545 0.20289855 0.        ]
 [0.15702479 0.144      0.19512195 0.26380368 0.08305648 0.07638889
  0.2718894  0.20443587 0.4496124  0.15909091 0.30434783 0.        ]]


In [33]:
# Show the first 20 predicted classes next to the first 20 true labels
predicted_head = prediction_labels[:20]
actual_head = list(y_test.iloc[:20])
print(f"Predicted classes: {predicted_head}")
print(f"Actual Labels: {actual_head}")

Predicted classes: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
Actual Labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0]


In [37]:
from sklearn.metrics import confusion_matrix

# Predict on the *whole* test set inside this cell. When the notebook is run
# top to bottom, `prediction_labels` holds only the first 20 predictions at
# this point, and confusion_matrix() raises when its two inputs have
# different lengths.
full_test_class_ids = model.predict(X_test_scaled, verbose=0).argmax(axis=-1)
full_test_predictions = label_encoder.inverse_transform(full_test_class_ids)

# Rows are the true classes, columns are the predicted classes.
confusion_matrix(y_test, full_test_predictions)

array([[1244,   62],
       [ 211,  108]], dtype=int64)

In [36]:
# Side-by-side table of the first 20 predictions and their true labels;
# the rows keep the index of the y_test slice.
predictions = prediction_labels[:20]
actual_head = y_test.iloc[:20]
matrix = pd.DataFrame(
    data={"Prediction": predictions, "Actual": actual_head},
)
matrix

Unnamed: 0,Prediction,Actual
2168,0,0
5880,0,0
4776,0,0
683,0,0
1851,0,0
47,0,0
3642,0,0
3054,0,0
3603,0,0
2030,0,0


In [23]:
from sklearn.metrics import classification_report

# Predict on the full test set.
# Sequential.predict_classes() was removed in TensorFlow 2.6; for a softmax
# output layer the equivalent is the argmax over the predicted probabilities.
class_probabilities = model.predict(X_test_scaled, verbose=0)
encoded_predictions = class_probabilities.argmax(axis=-1)
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

# Per-class precision / recall / F1 plus the overall averages.
print(classification_report(y_test, prediction_labels))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90      1306
           1       0.64      0.34      0.44       319

    accuracy                           0.83      1625
   macro avg       0.75      0.65      0.67      1625
weighted avg       0.81      0.83      0.81      1625



In [None]:
# Save the fitted model to an HDF5 (.h5) file in the current working directory
model.save("deep_learning_model.h5")