# Import Required Packages

In [1]:
# Import packages
import numpy as np
import pandas as pd
import warnings
import joblib
from time import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, SCORERS

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Data Preprocessing

## Load wine data

In [2]:
# Load data
wine_data = pd.read_csv("white_wine_quality.csv")
print(wine_data.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            8.5              0.26         0.21            16.2      0.074   
1            5.8              0.24         0.44             3.5      0.029   
2            9.1              0.59         0.38             1.6      0.066   
3            7.1              0.32         0.32            11.0      0.038   
4            6.9              0.39         0.40             4.6      0.022   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 41.0                 197.0   0.9980  3.02       0.50   
1                  5.0                 109.0   0.9913  3.53       0.43   
2                 34.0                 182.0   0.9968  3.23       0.38   
3                 16.0                  66.0   0.9937  3.24       0.40   
4                  5.0                  19.0   0.9915  3.31       0.37   

   alcohol  quality  
0      9.8        3  
1     11.7        3  
2      8.5        3 

## Scale data to between 0 and 1

In [3]:
# Scale data with min-max scaler for each column in X
scaler = MinMaxScaler()
wine_data_scaled = scaler.fit_transform(wine_data)
wine_data_scaled = pd.DataFrame(wine_data_scaled, columns=wine_data.columns.values)
print(wine_data_scaled.head())

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0       0.451923          0.176471     0.126506        0.239264   0.192878   
1       0.192308          0.156863     0.265060        0.044479   0.059347   
2       0.509615          0.500000     0.228916        0.015337   0.169139   
3       0.317308          0.235294     0.192771        0.159509   0.086053   
4       0.298077          0.303922     0.240964        0.061350   0.038576   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0             0.135889              0.436195  0.209948  0.272727   0.325581   
1             0.010453              0.232019  0.080779  0.736364   0.244186   
2             0.111498              0.401392  0.186813  0.463636   0.186047   
3             0.048780              0.132251  0.127048  0.472727   0.209302   
4             0.010453              0.023202  0.084635  0.536364   0.174419   

    alcohol  quality  
0  0.290323      0.0  
1  0.59677

## Split data into features and labels, and then training and testing data, then store them

In [4]:
features = wine_data_scaled.drop('quality', axis=1)
labels = wine_data_scaled['quality']

# Split data to train and test
X_train, X_test, y_train, y_test = train_test_split(features, labels,
                                                    stratify=labels, 
                                                    test_size=0.2, random_state=490)

X_train.to_csv('train_features.csv', index=False)
X_test.to_csv('test_features.csv', index=False)

y_train.to_csv('train_labels.csv', index=False)
y_test.to_csv('test_labels.csv', index=False)

# Training Linear Regression Model

In [5]:
tr_features = pd.read_csv('train_features.csv')
tr_labels = pd.read_csv('train_labels.csv')

print(tr_features.head())
print(tr_labels.head())

lr = LinearRegression()

mdl = lr.fit(tr_features, tr_labels)

scores = cross_val_score(lr, tr_features, tr_labels, cv=5, scoring='neg_mean_squared_error')
print(scores)

mse_scores = -scores
print(mse_scores)

rmse_scores = np.sqrt(mse_scores)
print(rmse_scores)

print(rmse_scores.mean())

def print_regression_results(mdl):
    print('Coefficients: \n', mdl.coef_)

    pred_labels =  mdl.predict(tr_features)
    print('Mean squared error: %.2f'
          % mean_squared_error(tr_labels, pred_labels))

    print('Coefficient of determination: %.2f'
          % r2_score(tr_labels, pred_labels))
    
    print('mean absolute error: %.2f'
          % mean_absolute_error(tr_labels, pred_labels))

# Print regression results
print_regression_results(mdl)

# Store regression model
joblib.dump(mdl, './LR.pkl')

   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0       0.375000          0.196078     0.210843        0.225460   0.139466   
1       0.269231          0.254902     0.150602        0.064417   0.086053   
2       0.221154          0.068627     0.240964        0.009202   0.062315   
3       0.115385          0.117647     0.240964        0.019939   0.017804   
4       0.403846          0.117647     0.240964        0.070552   0.136499   

   free sulfur dioxide  total sulfur dioxide   density        pH  sulphates  \
0             0.101045              0.250580  0.244650  0.500000   0.325581   
1             0.048780              0.259861  0.093889  0.581818   0.569767   
2             0.059233              0.174014  0.041450  0.427273   0.860465   
3             0.062718              0.206497  0.049933  0.590909   0.383721   
4             0.135889              0.366589  0.157895  0.418182   0.209302   

    alcohol  
0  0.258065  
1  0.741935  
2  0.806452  


['./LR.pkl']

# Training Neural Network Model

## 5-fold cross validation

In [6]:
tr_features = pd.read_csv('train_features.csv')
tr_labels = pd.read_csv('train_labels.csv')

def print_results(results):
    print('Best Hyperparams: {}\n'.format(results.best_params_))

    score = -results.cv_results_['mean_test_score']
    for mse, params in zip(score, results.cv_results_['params']):
        print('mse: {} / for {}'.format(round(mse, 6), params))


mlp = MLPRegressor(max_iter=5000)

parameters = {
    'hidden_layer_sizes': [(400,400,400), (200,200,200)],
    'activation': ['relu'],
    'learning_rate': ['constant']
}

cv = GridSearchCV(mlp, parameters, cv=5, scoring='neg_mean_squared_error')
cv.fit(tr_features, tr_labels.values.ravel())
print_results(cv)

joblib.dump(cv.best_estimator_, './MLP.pkl')

Best Hyperparams: {'activation': 'relu', 'hidden_layer_sizes': (200, 200, 200), 'learning_rate': 'constant'}

mse: 0.013779 / for {'activation': 'relu', 'hidden_layer_sizes': (400, 400, 400), 'learning_rate': 'constant'}
mse: 0.013693 / for {'activation': 'relu', 'hidden_layer_sizes': (200, 200, 200), 'learning_rate': 'constant'}


['./MLP.pkl']

## Train final neural network model on all training data with best hyper parameter

In [7]:
mlp = MLPRegressor(max_iter=5000, hidden_layer_sizes=(400,400,400), activation='relu', learning_rate='constant')

nn_model = mlp.fit(tr_features, tr_labels.values.ravel())

joblib.dump(nn_model, './MLP.pkl')

['./MLP.pkl']

# Evaluation of Two models on the testing data

In [8]:
test_features = pd.read_csv('./test_features.csv')
test_labels = pd.read_csv('./test_labels.csv')

models = {}

for mdl in ['LR', 'MLP']:
    models[mdl] = joblib.load('./{}.pkl'.format(mdl))


def evaluate_model(name, model, features, labels):
    start = time()
    pred = model.predict(features)
    end = time()
    rmse = round(np.sqrt(mean_squared_error(labels, pred)), 6)
    r2 = round(r2_score(labels, pred), 6)
    mae = round(mean_absolute_error(labels, pred), 6)
    print('{} -- rmse: {} / r2: {} / mae: {} / Latency: {}ms'.format(name, rmse, r2, mae, round((end - start), 5)))


for name, mdl in models.items():
    print(mdl)
    evaluate_model(name, mdl, test_features, test_labels)


LinearRegression()
LR -- rmse: 0.129502 / r2: 0.230897 / mae: 0.099754 / Latency: 0.001ms
MLPRegressor(hidden_layer_sizes=(400, 400, 400), max_iter=5000)
MLP -- rmse: 0.115362 / r2: 0.38968 / mae: 0.089045 / Latency: 0.01596ms


# Using the model

In [9]:
def predict_wine_quality(max_min_scaled_sample): 
    mdl = joblib.load('./MLP.pkl')
    pred = mdl.predict(max_min_scaled_sample)
    print(pred)
    return round(pred[0] * 6 + 3, 0)

max_min_scaled_sample = [[0.2, 0.4, 0.5, 0.7, 0.3, 0.2, 0.4, 0.2, 0.1, 0.9, 0.3]]
print(predict_wine_quality(max_min_scaled_sample))

[-0.08734992]
2.0
