In [164]:
!pip install ucimlrepo
!pip install tensorflow



In [165]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score

In [166]:
def load_data():
  # fetch dataset
  x = fetch_ucirepo(id=109).data.features
  y = fetch_ucirepo(id=109).data.targets
  return x,y

In [167]:
def check_missing(df):
  # check if there exists missing data
  miss_data = df.isna().sum()
  if miss_data.sum() > 0:
    df.fillna(df.mean(),inplace=True)
  return df

In [168]:
def scale(xTrain, xTest):
  # MinMaxScaling
  scaler = MinMaxScaler()
  xTrain = pd.DataFrame(scaler.fit_transform(xTrain),columns=xTrain.columns)
  xTest = pd.DataFrame(scaler.fit_transform(xTest),columns=xTest.columns)
  return xTrain, xTest

In [169]:
class KerasClassifier():
  def __init__(self, n_classes, epochs, batch_size):
    self.n_classes = n_classes
    self.epochs = epochs
    self.batch_size = batch_size
    self.model = self.build_model()

  def one_hot_encode(y):
    # one-hot encoder
    class_labels = [0, 1, 2]
    encoder = OneHotEncoder(sparse_output=False)
    y = encoder.fit_transform(y)
    # print(wine_dt)
    return y

  def build_model(self):
    # Define the parameter
    num_layers = [1, 2, 3]
    num_neurons = [32, 64, 128]
    learning_rates = [0.001, 0.01, 0.1]

    # keep track of best parameters and best F1 score
    best_num_layer = None
    best_num_neuron = None
    best_learning_rate = None
    best_f1_score = 0.0

    # Perform stratified tenfold cross-validation
    vali = StratifiedKFold(n_splits=10, shuffle=True, random_state=6)
    for num_layer in num_layers:
      for num_neuron in num_neurons:
        for learning_rate in learning_rates:
          print("Testing parameters: num_layers=%s, num_neurons=%s, learning_rate=%s" % (num_layer,num_neuron,learning_rate))

          # store cross-validation F1 scores
          f1Scores = []

          for train_index, vali_index in vali.split(xTrain, yTrain):
            xTrain_fold,xVali_fold = xTrain.iloc[train_index,:],xTrain.iloc[vali_index,:]
            yTrain_fold,yVali_fold = yTrain.iloc[train_index,:],yTrain.iloc[vali_index,:]

            # One-hot encode target labels
            yTrain_fold_encoded = one_hot_encode(yTrain_fold)
            yTal_fold_encoded = one_hot_encode(yVali_fold)

            # Build and compile the MLP model
            model = keras.Sequential()
            for _ in range(num_layer):
              model.add(keras.layers.Dense(num_neuron, activation='relu'))
            model.add(keras.layers.Dense(self.n_classes, activation='softmax'))
            model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), metrics=['accuracy'])

            # Train the model
            model.fit(xTrain_fold, yTrain_fold_encoded, epochs=self.epochs, batch_size=self.batch_size, verbose=0)

            # Make predictions
            yVal_pred_encoded = model.predict(xVali_fold)
            yVal_pred = tf.argmax(yVal_pred_encoded, axis=1)

            # Calculate the F1 score for this fold
            f1 = f1_score(yVali_fold, yVal_pred, average='weighted')
            f1Scores.append(f1)
          f1Score_avg = np.mean(f1Scores)
          print("The average f1 score is: %s" % f1Score_avg)

          # if results in a better F1 score, update the data
          if f1Score_avg > best_f1_score:
            best_f1_score = f1Score_avg
            best_num_layer = num_layer
            best_num_neuron = num_neuron
            best_learning_rate = learning_rate
          print("best f1 score is: %.4f" % best_f1_score)
          print("best num of layer is: %d" % best_num_layer)
          print("best num of neuron is: %d" % best_num_neuron)
          print("best learning rate is: %s" % best_learning_rate)

    # Build and compile the MLP model, train using the best parameters
    best_model = keras.Sequential()
    for _ in range(best_num_layer):
      model.add(keras.layers.Dense(best_num_neuron, activation='relu'))
    best_model.add(keras.layers.Dense(self.n_classes, activation='softmax'))
    best_model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=best_learning_rate), metrics=['accuracy'])

    return best_model

  def fit(self, x, y):
    self.model.fit(x, one_hot_encode(y), epochs=self.epochs, batch_size=self.batch_size, verbose=1)

  def predict(self, x):
    yTest_pred = tf.argmax(self.model.predict(x), axis=1)
    return yTest_pred

In [170]:
if __name__ == "__main__":
  x,y = load_data()
  x = check_missing(x)

  # preprocessing: split into train and test sets using the stratified sampling
  xTrain, xTest, yTrain, yTest = train_test_split(x,y,test_size=0.1,stratify=y,random_state=6)
  xTrain, xTest = scale(xTrain, xTest)

  # Create a scikit-learn pipeline
  pipeline = Pipeline([
      ('keras', KerasClassifier(n_classes=3, epochs=10, batch_size=32))
  ])

  # Fit the pipeline
  pipeline.fit(xTrain, yTrain)

  # Make predictions
  yPred = pipeline.predict(xTest)

  # Evaluate the model
  f1Score_f = f1_score(yTest, yPred, average='weighted')
  print("Final f1 score applying best params: %.4f" % f1Score_f)

Testing parameters: num_layers=1, num_neurons=32, learning_rate=0.001
The average f1 score is: 0.0140625
best f1 score is: 0.0141
best num of layer is: 1
best num of neuron is: 32
best learning rate is: 0.001
Testing parameters: num_layers=1, num_neurons=32, learning_rate=0.01
The average f1 score is: 0.027717074592074585
best f1 score is: 0.0277
best num of layer is: 1
best num of neuron is: 32
best learning rate is: 0.01
Testing parameters: num_layers=1, num_neurons=32, learning_rate=0.1
The average f1 score is: 0.012019230769230768
best f1 score is: 0.0277
best num of layer is: 1
best num of neuron is: 32
best learning rate is: 0.01
Testing parameters: num_layers=1, num_neurons=64, learning_rate=0.001
The average f1 score is: 0.022348484848484847
best f1 score is: 0.0277
best num of layer is: 1
best num of neuron is: 32
best learning rate is: 0.01
Testing parameters: num_layers=1, num_neurons=64, learning_rate=0.01
The average f1 score is: 0.02290938228438228
best f1 score is: 0.027

* best f1 score is: 0.0575
* best num of layer is: 3
* best num of neuron is: 64
* best learning rate is: 0.1
* Final f1 score applying best params: 0.0598