# Train

This notebook trains several models, following this [Kaggle guide](https://www.kaggle.com/code/merturper/breast-cancer-outliers-pca-nca/notebook#Train-Test-Split-&-StandardScaler). Results are pushed to our "Wisconsin BCa Experiment 1" MLflow experiment.

We use the MLflow library to log our results: https://learn.microsoft.com/en-us/azure/machine-learning/tutorial-train-model?view=azureml-api-2

## Setup

Note: I am not creating a compute cluster because we don't have enough quota, and it is unnecessary for such a small dataset.

In [7]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Authenticate using the default Azure credential.
credential = DefaultAzureCredential()

# Coordinates identifying the Azure ML workspace this notebook talks to.
workspace_coordinates = {
    "subscription_id": "1b1ae7cf-df24-428b-8bb9-e4dd07869ac9",
    "resource_group_name": "SummerProjects2023",
    "workspace_name": "Nanostics_ML_Workspace",
}

# Workspace handle used by the cells below to look up data assets.
ml_client = MLClient(credential=credential, **workspace_coordinates)

## Read Data 

Read both the boxplot- and LOF-preprocessed versions of the data.

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis, LocalOutlierFactor
from sklearn.ensemble import VotingClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [8]:
# Get a handle on both preprocessed versions of the data asset and print
# their URIs.
data_boxplot = ml_client.data.get(name="wisconsin-bca", version='2-boxplot')
data_lof = ml_client.data.get(name="wisconsin-bca", version='2-lof')
print(f"Data asset URI (boxplot): {data_boxplot.path}")
print(f"Data asset URI (lof): {data_lof.path}")

# Each frame must be read from its own asset path; otherwise the two
# variants would silently contain identical data.
df_boxplot = pd.read_parquet(data_boxplot.path)
df_lof = pd.read_parquet(data_lof.path)

Data asset URI (boxplot): azureml://subscriptions/1b1ae7cf-df24-428b-8bb9-e4dd07869ac9/resourcegroups/SummerProjects2023/workspaces/Nanostics_ML_Workspace/datastores/workspaceblobstore/paths/LocalUpload/87247cc674f58dd015f9a173b014e577/cleaned-wisconsin-bca.parquet
Data asset URI (log): azureml://subscriptions/1b1ae7cf-df24-428b-8bb9-e4dd07869ac9/resourcegroups/SummerProjects2023/workspaces/Nanostics_ML_Workspace/datastores/workspaceblobstore/paths/LocalUpload/6ad44bced7ba2bd6022a77c7f4ace95b/cleaned-wisconsin-bca-lof.parquet


## Define

In [2]:
def split_and_scale(df, test_size=0.30, random_state=None):
    """Split ``df`` into train/test sets and standardize the features.

    The ``diagnosis_01`` column is the target; every other column is treated
    as a feature. The scaler is fitted on the training features only and then
    applied to the test features, so no test-set statistics are used when
    fitting it.

    Args:
        df: DataFrame holding the feature columns plus ``diagnosis_01``.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.30,
            the value this function always used.
        random_state: Forwarded to ``train_test_split``. The default
            ``None`` keeps the previous behaviour (a different split on
            every call); pass an int to get a reproducible split.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The feature sets are
        the scaled outputs of ``StandardScaler``; the targets are the
        corresponding slices of ``df["diagnosis_01"]``.
    """
    y = df["diagnosis_01"]
    X = df.drop(["diagnosis_01"], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit on the training split only, then reuse those statistics for test.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test
    

In [5]:
def train_logistic(X_train, X_test, y_train, y_test):
    """Fit a default ``LogisticRegression`` and print its evaluation.

    Prints the train accuracy, the test accuracy, the test confusion matrix
    and the test classification report.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. All results are printed.
    """
    log_reg = LogisticRegression()
    log_reg.fit(X_train, y_train)
    # Predict the test set once and reuse it for every test-set metric.
    y_pred = log_reg.predict(X_test)

    print("*****LogisticRegression******\n")
    print("Train score:")
    print(accuracy_score(y_train, log_reg.predict(X_train)))
    print("Test score:")
    print(accuracy_score(y_test, y_pred))
    # This section prints the confusion matrix, so label it as such.
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    # classification report
    print(classification_report(y_test, y_pred))

In [6]:
def train_knn(X_train, X_test, y_train, y_test):
    """Fit a 3-neighbour ``KNeighborsClassifier`` and print its evaluation.

    Prints the train accuracy, the test accuracy, the test confusion matrix
    and the test classification report.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. All results are printed.
    """
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, y_train)
    # Predict the test set once and reuse it for every test-set metric.
    y_pred = knn.predict(X_test)

    print("******KNeighborsClassifier******\n")
    print("Train score:")
    print(accuracy_score(y_train, knn.predict(X_train)))
    print("Test score:")
    print(accuracy_score(y_test, y_pred))
    # This section prints the confusion matrix, so label it as such.
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    # classification report
    print(classification_report(y_test, y_pred))

In [7]:
def train_svc(X_train, X_test, y_train, y_test):
    """Fit a default ``SVC`` and print its evaluation.

    Prints the train accuracy, the test accuracy, the test confusion matrix
    and the test classification report.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        None. All results are printed.
    """
    svc = SVC()
    svc.fit(X_train, y_train)
    # Predict the test set once and reuse it for every test-set metric.
    y_pred = svc.predict(X_test)

    print("******Support Vector Classifier******\n")
    print("Train score:")
    print(accuracy_score(y_train, svc.predict(X_train)))
    print("Test score:")
    print(accuracy_score(y_test, y_pred))
    # This section prints the confusion matrix, so label it as such.
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    # classification report
    print(classification_report(y_test, y_pred))

## Run

In [8]:
import mlflow
import mlflow.sklearn

# Name of the experiment that runs from this notebook are logged under.
experiment_name = "Wisconsin BCa Experiment 1"
mlflow.set_experiment(experiment_name)
# Turn on MLflow's scikit-learn autologging.
mlflow.sklearn.autolog()



In [12]:
import numpy
# random helper function
# https://stackoverflow.com/a/11146645
def cartesian_product(x, y):
    """Return every ``(x0, y0)`` pair with ``x0`` from ``x`` and ``y0`` from ``y``.

    Pairs are ordered with ``x`` varying slowest, i.e. all pairs for the
    first element of ``x`` come first.

    Args:
        x: Iterable supplying the first element of each pair.
        y: Iterable supplying the second element of each pair. One-shot
            iterators (e.g. generators) are supported: ``itertools.product``
            consumes each input once up front instead of re-iterating ``y``
            for every element of ``x``.

    Returns:
        A list of 2-tuples.
    """
    # Local import keeps the helper self-contained within its cell.
    from itertools import product

    return list(product(x, y))

In [34]:
with mlflow.start_run():
    # (DataFrame, label) and (training function, label) pairs; the labels
    # are combined to name each nested run.
    dfs = [(df_boxplot, 'boxplot'), (df_lof, 'lof')]
    trains = [(train_knn, 'knn'), (train_logistic, 'logistic'), (train_svc, 'svc')]

    # Mark the enclosing run so it can be told apart from its nested runs.
    mlflow.log_param("parent", "yes")
    for df, df_name in dfs:
        # Split once per dataset so every model is trained and scored on the
        # same partition; re-splitting per model would make their scores
        # depend on different random test sets.
        X_train, X_test, y_train, y_test = split_and_scale(df)
        for train, train_name in trains:
            with mlflow.start_run(run_name=f'{df_name}-outlier-with-{train_name}', nested=True):
                train(X_train, X_test, y_train, y_test)

******KNeighborsClassifier******

Train score:
0.9771573604060914
Test score:
0.9526627218934911
Accuracy Score:
[[94  3]
 [ 5 67]]
              precision    recall  f1-score   support

           0       0.95      0.97      0.96        97
           1       0.96      0.93      0.94        72

    accuracy                           0.95       169
   macro avg       0.95      0.95      0.95       169
weighted avg       0.95      0.95      0.95       169

*****LogisticRegression******

Train score:
0.9898477157360406
Test score:
0.9704142011834319
Accuracy Score:
[[97  0]
 [ 5 67]]
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        97
           1       1.00      0.93      0.96        72

    accuracy                           0.97       169
   macro avg       0.98      0.97      0.97       169
weighted avg       0.97      0.97      0.97       169

******Support Vector Classifier******

Train score:
0.9822335025380711
Test score:
0.

In [17]:
# Sanity check: print the number of scaled feature columns, then list the
# feature names so the two can be compared.
X_train, X_test, y_train, y_test = split_and_scale(df_lof)
print(len(X_test[0]))
# Drop the target by name (as split_and_scale does) instead of assuming it is
# the last column of the frame.
list(df_lof.columns.drop("diagnosis_01"))

26


['texture_mean',
 'smoothness_mean',
 'compactness_mean',
 'concavity_mean',
 'concave points_mean',
 'symmetry_mean',
 'fractal_dimension_mean',
 'radius_se',
 'texture_se',
 'perimeter_se',
 'area_se',
 'smoothness_se',
 'compactness_se',
 'concavity_se',
 'concave points_se',
 'symmetry_se',
 'fractal_dimension_se',
 'texture_worst',
 'perimeter_worst',
 'area_worst',
 'smoothness_worst',
 'compactness_worst',
 'concavity_worst',
 'concave points_worst',
 'symmetry_worst',
 'fractal_dimension_worst']

## Summary

The author of the Kaggle guide reports that SVC gives the best results. I'm going to just use SVC and deploy it, since comparing results across runs is somewhat awkward on Azure.

##