# Train

Training various models, following this [Kaggle guide](https://www.kaggle.com/code/merturper/breast-cancer-outliers-pca-nca/notebook#Train-Test-Split-&-StandardScaler). Results are pushed to the "Wisconsin BCa Experiment local" experiment in MLflow.

Note that the `train` folder contains the code that packages and trains the model in the cloud, while this notebook is only for local experimentation.

## Read Data 

Read both preprocessed datasets: one cleaned with the boxplot outlier method and one cleaned with LOF (Local Outlier Factor).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis, LocalOutlierFactor
from sklearn.ensemble import VotingClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import mlflow
import mlflow.sklearn

In [None]:
# Paths to the two preprocessed datasets, one per outlier-removal method
data_boxplot = './data/cleaned-wisconsin-boxplot.parquet'
data_lof = './data/cleaned-wisconsin-lof.parquet'

df_boxplot = pd.read_parquet(data_boxplot)
# Must read data_lof (not data_boxplot) so the "lof" runs actually train on
# the LOF-cleaned data instead of a second copy of the boxplot data.
df_lof = pd.read_parquet(data_lof)

# Set up MLflow


In [None]:
# Azure ML SDK client and the default Azure credential used to authenticate it
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Build the workspace client from configuration rather than hard-coded IDs.
# NOTE(review): presumably from_config looks up a local workspace config file
# (e.g. config.json) — confirm one is available where this notebook runs.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

In [None]:
# Look up the MLflow tracking URI exposed by the Azure ML workspace.
# NOTE(review): nothing visible in this notebook passes this value to mlflow
# (e.g. via mlflow.set_tracking_uri), so runs presumably go to the default
# local store — confirm that this is intended.
mlflow_tracking_uri = ml_client.workspaces.get(ml_client.workspace_name).mlflow_tracking_uri


## Define

In [None]:
def split_and_scale(df, test_size=0.30, random_state=None):
    """Split a dataframe into train/test sets and standardize the features.

    The scaler is fitted on the training features only and then applied to
    the test features, so test-set statistics never influence the scaling.

    Args:
        df: DataFrame holding the feature columns plus the ``diagnosis_01``
            target column.
        test_size: Share of rows held out for testing, forwarded to
            ``train_test_split``. Defaults to 0.30.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` gives a different split on every call; pass an int to
            make the split reproducible across runs.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the feature parts
        are the outputs of ``StandardScaler`` and the target parts are the
        corresponding slices of ``df["diagnosis_01"]``.
    """
    y = df["diagnosis_01"]
    X = df.drop(["diagnosis_01"], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit on the training split only, then reuse those statistics for test.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test
    

In [None]:
def train_logistic(X_train, X_test, y_train, y_test):
    """Fit a default ``LogisticRegression`` and log its evaluation to MLflow.

    Logged to the currently active MLflow run:
      * metrics ``trainScore`` and ``testScore`` (accuracy),
      * ``confusion_matrix.json`` (confusion matrix as a nested list),
      * ``classification_report.json`` (the report as a dictionary).

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
    """
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)

    # Predict on the test set once and reuse it for every test-side output.
    y_pred = log_reg.predict(X_test)

    mlflow.log_metrics({
        'trainScore': accuracy_score(y_train, log_reg.predict(X_train)),
        'testScore': accuracy_score(y_test, y_pred),
    })
    mlflow.log_dict(confusion_matrix(y_test, y_pred).tolist(), artifact_file='confusion_matrix.json')
    # output_dict=True yields a real dict; the default return value is a
    # formatted text table, which would be stored as one opaque JSON string.
    mlflow.log_dict(classification_report(y_test, y_pred, output_dict=True), 'classification_report.json')

In [None]:
def train_knn(X_train, X_test, y_train, y_test):
    """Fit a 3-nearest-neighbours classifier and log its evaluation to MLflow.

    Logged to the currently active MLflow run:
      * metrics ``trainScore`` and ``testScore`` (accuracy),
      * ``confusion_matrix.json`` (confusion matrix as a nested list),
      * ``classification_report.json`` (the report as a dictionary).

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
    """
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)

    # Predict on the test set once and reuse it for every test-side output.
    y_pred = knn.predict(X_test)

    mlflow.log_metrics({
        'trainScore': accuracy_score(y_train, knn.predict(X_train)),
        'testScore': accuracy_score(y_test, y_pred),
    })
    mlflow.log_dict(confusion_matrix(y_test, y_pred).tolist(), artifact_file='confusion_matrix.json')
    # output_dict=True yields a real dict; the default return value is a
    # formatted text table, which would be stored as one opaque JSON string.
    mlflow.log_dict(classification_report(y_test, y_pred, output_dict=True), 'classification_report.json')

In [None]:
def train_svc(X_train, X_test, y_train, y_test):
    """Fit a default ``SVC`` and log its evaluation to MLflow.

    Logged to the currently active MLflow run:
      * metrics ``trainScore`` and ``testScore`` (accuracy),
      * ``confusion_matrix.json`` (confusion matrix as a nested list),
      * ``classification_report.json`` (the report as a dictionary).

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
    """
    svc = SVC()
    svc.fit(X_train, y_train)

    # Predict on the test set once and reuse it for every test-side output.
    y_pred = svc.predict(X_test)

    mlflow.log_metrics({
        'trainScore': accuracy_score(y_train, svc.predict(X_train)),
        'testScore': accuracy_score(y_test, y_pred),
    })
    mlflow.log_dict(confusion_matrix(y_test, y_pred).tolist(), artifact_file='confusion_matrix.json')
    # output_dict=True yields a real dict; the default return value is a
    # formatted text table, which would be stored as one opaque JSON string.
    mlflow.log_dict(classification_report(y_test, y_pred, output_dict=True), 'classification_report.json')

## Run

In [None]:
# Select the MLflow experiment that the runs started below are recorded under
mlflow.set_experiment("Wisconsin BCa Experiment local")
# Turn on MLflow's automatic logging in addition to the explicit
# log_metrics / log_dict calls made by the train_* functions
mlflow.autolog()

In [None]:
# Small utility: pair every item of one iterable with every item of another.
# Adapted from https://stackoverflow.com/a/11146645
def cartesian_product(x, y):
    """Return a list of all ``(x_item, y_item)`` tuples, with ``x`` varying slowest."""
    pairs = []
    for left in x:
        for right in y:
            pairs.append((left, right))
    return pairs

In [None]:
# Outer (parent) run that groups one nested run per dataset/model combination
with mlflow.start_run():
    # Each dataset and each training function is paired with a short label
    # that is used for the run name and the outlier_func tag.
    dfs = [(df_boxplot, 'boxplot'), (df_lof, 'lof')]
    trains = [(train_knn, 'knn'), (train_logistic, 'logistic'), (train_svc, 'svc')]

    # Mark the outer run so it can be told apart from its nested child runs.
    mlflow.log_param("parent", "yes")
    # Use the cartesian_product helper instead of repeating its comprehension.
    for (df, df_name), (train, train_name) in cartesian_product(dfs, trains):
        with mlflow.start_run(run_name=f'{df_name}-outlier-with-{train_name}', nested=True):
            mlflow.set_tag('outlier_func', df_name)
            # A fresh split is drawn for every nested run.
            X_train, X_test, y_train, y_test = split_and_scale(df)
            train(X_train, X_test, y_train, y_test)

# Viewing the models

View the logged runs and models by running `mlflow ui`. I might connect this to Azure later.

In [None]:
# Sanity check: number of feature columns after scaling, and their names
X_train, X_test, y_train, y_test = split_and_scale(df_lof)
# shape[1] gives the column count without indexing into the first row
print(X_test.shape[1])
# Drop the target by name, mirroring split_and_scale, rather than assuming
# it is the last column of the dataframe
list(df_lof.columns.drop("diagnosis_01"))

## Summary

The author of the Kaggle guide reports that SVC gives the best results. I'm going to just use SVC and deploy it, because comparing results across runs is somewhat awkward on Azure.

##