 * Reference:
https://www.youtube.com/watch?v=Y_hzMnRXjhI&list=PLQY2H8rRoyvzDbLUZkbudP-MFQZwNmU4S&index=3
 * ChatGPT: TensorFlow Multiclass Loss Functions

In [None]:
#Importing required packages.

import warnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import pathlib

import time
import csv

from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
%matplotlib inline


from experiment.api import mlflow as mlflow_api


In [None]:
#Ignore warnings
warnings.filterwarnings('ignore')

In [None]:
# Loading dataset
df = pd.read_csv(pathlib.Path("../data") / "output.csv")


In [None]:
#Standard random state for all operations
RANDOM_STATE = 42

In [None]:
mlflow = mlflow_api.MLFlow(local_storage=True)
mlflow.clean()

# 1. Exploratory Data Analysis

In [None]:
#Let's check how the data is distributed
df.head()

In [None]:
#Information about the data columns
df.info()

In [None]:
df.columns

In [None]:
df.shape

In [None]:
# Distribution of the raw quality scores.
# The column is passed as `x` explicitly: seaborn >= 0.12 treats the first
# positional argument as `data`, so a bare Series is no longer read as `x`.
sns.countplot(x=df['quality'])

# 2. Data Preprocessing

In [None]:
def remove_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every row that holds an outlier in at least one column.

    A value is an outlier when the absolute value of its per-column z-score
    (``scipy.stats.zscore``) is 3 or more.

    Columns whose z-scores are undefined (NaN) -- for instance a constant
    column, whose standard deviation is zero -- are ignored rather than
    causing every row to be discarded.

    Args:
        df: Frame to filter; its columns are expected to be numeric.

    Returns:
        The rows of ``df`` without outliers, with their original index labels.
    """
    # Work on a plain float ndarray so the mask logic does not depend on the
    # container type returned by zscore.
    abs_z_scores = np.abs(np.asarray(stats.zscore(df), dtype=float))

    # NaN compares False against any threshold; without the isnan term one
    # constant column would filter out the whole frame.
    within_limits = (abs_z_scores < 3) | np.isnan(abs_z_scores)
    return df[within_limits.all(axis=1)]


# Create a reproducible function for the input data
def apply_feature_engineering_preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    """Turn ``quality`` into a binary label and drop outlier rows.

    Args:
        df: Raw frame containing a numeric ``quality`` column.

    Returns:
        A new frame in which ``quality`` is 0 for bad and 1 for good wines,
        with outlier rows removed by ``remove_outliers``. The input frame is
        not modified.
    """
    # Copy first so the caller's frame stays untouched and the function can be
    # re-run on the same raw data.
    df = df.copy()

    # Making binary classification for the response variable.
    # Dividing wine as good and bad by giving the limit for the quality:
    # (2, 6.5] -> "bad", (6.5, 8] -> "good" (pd.cut bins are right-closed by
    # default).
    bins = (2, 6.5, 8)
    group_names = ["bad", "good"]
    df["quality"] = pd.cut(df["quality"], bins=bins, labels=group_names)

    # Now let's assign labels to our quality variable.
    # Bad becomes 0 and good becomes 1.
    label_quality = LabelEncoder()
    df["quality"] = label_quality.fit_transform(df["quality"])

    return remove_outliers(df)


In [None]:
#Apply feature engineering
df = apply_feature_engineering_preprocessing(df)

df.head(20)

In [None]:
df['quality'].value_counts()

In [None]:
# Class balance of the binarised quality label.
# The column is passed as `x` explicitly: seaborn >= 0.12 treats the first
# positional argument as `data`, so a bare Series is no longer read as `x`.
sns.countplot(x=df['quality'])

We have an unbalanced data set.

In [None]:
# Now separate the dataset into feature and target variables:
# X holds every column except "quality", y holds the "quality" column.
X = df.drop("quality", axis=1)
y = df["quality"]


In [None]:
# Train and test splitting of data (80% train / 20% test).
# The target is unbalanced, so stratify on y to keep the same class ratio in
# both partitions; otherwise the test-set f1 score depends on how many
# minority samples the random split happens to pick.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)


# 3. Model Creation

In [None]:
# Apply preprocessing
# Drops the "residual sugar" column and passes every other column through
# unchanged (remainder="passthrough").
preprocessing = ColumnTransformer(
    [
        # Column dropper
        ("column_dropper", "drop", ["residual sugar"]),
    ],
    remainder="passthrough",
)


In [None]:
# Creating the pipeline
# Steps run in order: column dropping -> standardisation -> PCA -> KNN.
# The step names are the prefixes used in the hyperparameter grid keys
# (e.g. "pca__n_components").
clf = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("scaler", StandardScaler()),  # scale values before PCA
        ("pca", PCA()),
        ("classification", KNeighborsClassifier()),
    ]
)


In [None]:
# Try different hyperparameters
# Keys follow the "<pipeline step>__<parameter>" convention.
param_grid = [
    {
        "pca__n_components": list(range(3, 10)),  # 3 through 9 components
        "classification__n_neighbors": [3, 4, 5, 6, 7, 8],
        "classification__leaf_size": [10, 20, 30, 40, 50],
    }
]


In [None]:
# Grid search over `param_grid` with 5-fold cross-validation, scored on
# accuracy. Train scores are kept as well (return_train_score=True).
# NOTE: `clf` is rebound here from the bare pipeline to the search object.
clf = GridSearchCV(
    clf,
    param_grid,
    n_jobs=-1,
    cv=5,
    scoring="accuracy",
    return_train_score=True,
    verbose=1,
)


### 3.1 Without mlflow

In [None]:
clf.fit(X_train, y_train)

In [None]:
#Listing the best parameters for the param_grid:
clf.best_params_

In [None]:
#Get the best score
clf.best_score_

In [None]:
#Store the best model in a variable
best_model = clf.best_estimator_
best_model

### Try on unseen data

In [None]:
#Let's use the test set to create predictions
predictions = best_model.predict(X_test)

In [None]:
#Calculating the accuracy score manually
score = accuracy_score(y_test, predictions)
score

Since the target column is unbalanced, we should check f1 score, too.

In [None]:
score = f1_score(y_test, predictions)
score

As expected, f1 score is much lower because our dataset is unbalanced.

In [None]:
#Creating the confusion matrix
cm = confusion_matrix(y_test, predictions)

In [None]:
#Plot the confusion matrix
plt.figure(figsize=(5,5))
sns.heatmap(cm, annot=True, fmt=".0f", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# `score` was last reassigned to the f1 score earlier in the notebook, so the
# accuracy is recomputed here to keep the "Accuracy Score" title truthful.
all_sample_title = 'Accuracy Score: {0}'.format(accuracy_score(y_test, predictions))
plt.title(all_sample_title, size = 11)

### 3.2 With mlflow

#### 3.2.1 Train and log models using mlflow

In [None]:
def try_different_neighbors(
    X_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_train: pd.Series,
    y_test: pd.Series,
    mlflow: mlflow_api.MLFlow,
    leaf_size: int,
    n_jobs: int,
    neighbor_array: list[int],
) -> str:
    """Train one KNN model per neighbor count, log each run, return the best.

    For every value in ``neighbor_array`` a ``KNeighborsClassifier`` is fitted
    on the training data, evaluated on the test data (accuracy and f1) and
    logged through ``mlflow.log_experiment_run``.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        mlflow: Project MLFlow wrapper used to log each run.
        leaf_size: ``leaf_size`` passed to every classifier.
        n_jobs: ``n_jobs`` passed to every classifier.
        neighbor_array: ``n_neighbors`` values to try.

    Returns:
        The value returned by ``log_experiment_run`` (presumably the run id)
        for the run with the highest f1 score; the earliest run wins ties.

    Raises:
        ValueError: If ``neighbor_array`` yields no values, so nothing was
            trained or logged.
    """
    best_f1 = None
    best_run_id = None

    for n_neighbors in neighbor_array:
        knn = KNeighborsClassifier(
            leaf_size=leaf_size, n_neighbors=n_neighbors, n_jobs=n_jobs
        )
        knn.fit(X_train, y_train)

        # let's use the test set to create predictions
        predictions = knn.predict(X_test)

        # score the predictions against the held-out labels
        accuracy = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions)

        log_dict = {
            "params": {"n_neighbors": n_neighbors},
            "metrics": {"accuracy": accuracy, "f1": f1},
        }

        temp_run_id = mlflow.log_experiment_run(
            model=knn,
            experiment_name="KNN Experiments",
            run_name=f"KNN: {n_neighbors}",
            log_dict=log_dict,
            registered_model_name="knn_n_neighbours",
            tags={
                "model": "knn"
            }
        )

        # Track the running maximum so the returned run really is the best
        # one, not merely the last one with a positive f1.
        if best_f1 is None or f1 > best_f1:
            best_f1 = f1
            best_run_id = temp_run_id

    if best_f1 is None:
        raise ValueError("neighbor_array must contain at least one value")

    return best_run_id


In [None]:
# run the tracking server in background
mlflow.run_server()

In [None]:
# Start logging with mlflow
neighbors = list(range(3, 6))

run_id = try_different_neighbors(
    X_train,
    X_test,
    y_train,
    y_test,
    mlflow=mlflow,
    leaf_size=10,
    n_jobs=-1,
    neighbor_array=neighbors,
)


* n_neighbors = 5 is the best performing model:

#### 3.2.2 Get predictions directly from an API endpoint

First, you should run this command to start the server:

In [None]:
# serve a model with the best f1 score
mlflow.serve_model(run_id)

In [None]:
def get_prediction_from_server(row, mlflow: mlflow_api.MLFlow):
    """
    Ask the machine learning server for a prediction on a single row.

    The last element of ``row`` is dropped before sending (presumably the
    target value -- confirm against the caller). The remaining values are
    paired with the column names of the module-level ``X_train`` in a
    "dataframe_split" payload.

    Returns the decoded JSON body of the server response.
    """

    feature_values = row[:-1]
    payload = {
        "dataframe_split": {
            "columns": list(X_train.columns),
            "data": [feature_values],
        }
    }

    return mlflow.get_predictions(payload).json()


In [None]:
# wait for 5 seconds before the model server starts
time.sleep(5)

In [None]:
#Bad becomes 0 and good becomes 1
# Write every row of `df` plus the server's prediction to predictions.csv.
# newline="" lets the csv module control line endings itself, as its
# documentation requires; without it extra blank rows can appear on Windows.
with open("predictions.csv", "wt", encoding="utf-8", newline="") as output:
    profiles_writer = csv.writer(output, delimiter=",")
    columns = list(X_train.columns)

    # add column names
    # NOTE(review): the header assumes "quality" is the last column of `df`
    # -- confirm against the input file.
    columns.append("quality")
    columns.append("prediction")
    profiles_writer.writerow(columns)

    for row in df.values.tolist():
        # add predictions
        features = row
        features.append(float(get_prediction_from_server(row, mlflow)["predictions"][0]))
        profiles_writer.writerow(features)