## Setting up

In [None]:
!git clone https://github.com/mciprian/ml_class_content.git
!mv ml_class_content/notebooks/img img
!mv ml_class_content/notebooks/data data
!rm data/creditcard.zip
!rm -fr ml_class_content/
!pip install ydata-profiling

## Previously on ...

In [None]:
from IPython.display import Image
from IPython.display import YouTubeVideo
# Display the image file img/purchase_kaggle.png inline as the cell output.
Image(filename="img/purchase_kaggle.png")

In [None]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split

## Solving last class exercise

In [None]:
# Load the raw purchase data.
df_purchase = pd.read_csv(
    "data/training_sample.csv", delimiter=",", quotechar='"'
)

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and every metric computed from it) reproducible across kernel restarts.
df_purchase_train, df_purchase_test = train_test_split(
    df_purchase, test_size=0.2, random_state=42
)

Don't forget EDA

In [None]:
# Build the profiling (EDA) report on the training split only.
profile = ProfileReport(df_purchase_train)

In [None]:
# Render the profiling report inside the notebook.
profile.to_notebook_iframe()

After the EDA we decided to remove `UserID` (it is unique for every row)

In [None]:
# Separate the label from the features; UserID is dropped together with the label.
train_target = df_purchase_train["ordered"]
df_purchase_train = df_purchase_train.drop(columns=["UserID", "ordered"])

Principal Components Analysis

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

In [None]:
# Keep as many components as there are feature columns so the full
# explained-variance curve can be inspected afterwards.
n_features = df_purchase_train.shape[1]
pca = PCA(n_components=n_features)
purchase_train_pca = pca.fit_transform(df_purchase_train)

In [None]:
def draw_vector(v0, v1, ax=None):
    """Draw an arrow between the points ``v0`` and ``v1`` on a matplotlib axes.

    Parameters
    ----------
    v0, v1 : sequence of two numbers
        Passed to ``ax.annotate`` as ``xy`` and ``xytext`` respectively.
    ax : matplotlib axes, optional
        Axes to draw on. When ``None`` the current axes (``plt.gca()``) is used.
    """
    # Explicit None check: do not rely on the truthiness of the axes object.
    if ax is None:
        ax = plt.gca()
    arrowprops = dict(arrowstyle="<-", linewidth=2, shrinkA=0, shrinkB=0)
    # Empty text: only the arrow itself is drawn.
    ax.annotate("", v0, v1, arrowprops=arrowprops)

# Scores of the observations on the first two principal components.
plt.scatter(
    purchase_train_pca[:, 0],
    purchase_train_pca[:, 1],
    color="blue",
    alpha=0.01,
    s=4,
)

# Loadings of every original variable on PC1 and PC2.
xs = pca.components_[0]
ys = pca.components_[1]

# One labelled arrow per original variable, starting at the origin.
for varname, x_loading, y_loading in zip(df_purchase_train.columns, xs, ys):
    draw_vector([0, 0], [x_loading, y_loading])
    plt.text(x_loading, y_loading, varname)

plt.title("Bi-plot")
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

In [None]:
# Share of the total variance per component, accumulated over the components.
variance_share = pca.explained_variance_ / np.sum(pca.explained_variance_)
np.cumsum(variance_share)

Let's choose k=12 (>95% explained variance) and set up the pipeline

In [None]:
# Project onto the 12 leading principal components, then classify with a
# 50-tree random forest. A fixed random_state makes the forest, and therefore
# the reported metrics, reproducible between runs.
pipeline = Pipeline(
    [
        ("pca", PCA(n_components=12)),
        ("rf", RandomForestClassifier(n_estimators=50, random_state=42)),
    ]
)

Training the pipeline

In [None]:
# Fit both pipeline steps on the training features and labels.
pipeline_model = pipeline.fit(X=df_purchase_train, y=train_target)

In [None]:
# Show the fitted pipeline as the cell output.
pipeline_model

Using the model on test data

In [None]:
# Same preparation as for the training split: pull out the label, then drop
# the label and UserID from the feature frame.
test_target = df_purchase_test["ordered"]
df_purchase_test = df_purchase_test.drop(columns=["UserID", "ordered"])

In [None]:
# Class predictions of the fitted pipeline for the test features.
test_results = pipeline_model.predict(X=df_purchase_test)

In [None]:
# Show the predicted labels.
test_results

In [None]:
# Show the true labels for comparison with the predictions.
test_target

## Time to measure

Calculate Accuracy and discuss the result

Do you remember the confusion matrix?

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [None]:
# Display the predictions reshaped into a single column; the result is not stored.
test_results.reshape(-1,1)

In [None]:
# Confusion matrix of the fitted pipeline on the test split
# (normalize=None, i.e. no normalisation is requested).
ConfusionMatrixDisplay.from_estimator(
    pipeline_model,
    df_purchase_test,
    test_target,
    normalize=None,
    cmap=plt.cm.Blues,
)

Calculate Precision and Recall (Sensitivity)

## Precision, Recall and F1-Score explained

In [None]:
# Embed the explanatory video in a 640x480 player.
YouTubeVideo('8d3JbbSj-I8',width=640, height=480)

## The F1-Score

In [None]:
from sklearn.metrics import f1_score

In [None]:
# F1-score of the class predictions against the true test labels.
f1_score(y_true=test_target, y_pred=test_results)

## ROC Curve and Area under the curve (AUC)

ROC = Receiver Operating Characteristic

In [None]:
# Embed the explanatory video in a 640x480 player.
YouTubeVideo('4jRBRDbJemM',width=640, height=480)

In [None]:
from sklearn.metrics import RocCurveDisplay, roc_auc_score

In [None]:
# A ROC curve needs a continuous score: hard 0/1 predictions collapse it to a
# single operating point. from_estimator scores the test set with the
# pipeline's own predict_proba / decision_function output instead.
RocCurveDisplay.from_estimator(pipeline_model, df_purchase_test, test_target)

In [None]:
# AUC must be computed from scores, not from hard class labels. Column 1 of
# predict_proba is the probability of the second class in classes_
# (assumed to be the positive class, ordered == 1 -- confirm the label encoding).
roc_auc_score(test_target, pipeline_model.predict_proba(df_purchase_test)[:, 1])

Some people also use the Gini coefficient

$GINI = 2*AUC - 1$

In [None]:
# Gini = 2 * AUC - 1, with the AUC computed from predicted probabilities rather
# than hard class labels (column 1 is assumed to be the positive class).
2 * roc_auc_score(test_target, pipeline_model.predict_proba(df_purchase_test)[:, 1]) - 1

## Validation: The third dataset

Ideally, a third data set is used to measure the final version of the model. Once it has been used, the train/test/validation data sets must be regenerated.

Discussion: What are the benefits of using 3 data sets?

Calculate Accuracy, F1-Score, AUC and GINI of the pipeline on the validation dataset

In [None]:
# Third data set, read from its own file and kept apart from train/test.
df_purchase_validation = pd.read_csv("data/testing_sample.csv", sep=",", quotechar='"')

In [None]:
# Preview the first rows of the validation data.
df_purchase_validation.head()

## Improve the model

Can you propose improvements to the analytical pipeline? Report the metrics on the validation dataset.

## The AutoML way: Autogluon

In [None]:
!pip install torch==2.0.1+cpu torchvision==0.15.2+cpu --index-url https://download.pytorch.org/whl/cpu
!pip install autogluon

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Reload the raw data so this section starts from unmodified frames.
df_purchase = pd.read_csv(
    "data/training_sample.csv", delimiter=",", quotechar='"'
)

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# reproducible across kernel restarts.
df_purchase_train, df_purchase_test = train_test_split(
    df_purchase, test_size=0.2, random_state=42
)

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [None]:
# Fit on the training split only: training on the full df_purchase would leak
# the rows of df_purchase_test, which the leaderboard is scored on later.
train_data = TabularDataset(df_purchase_train)

In [None]:
# Name of the target column handed to TabularPredictor.
label = 'ordered'

In [None]:
# Configure the predictor (target column, F1 as the evaluation metric), then fit it.
predictor = TabularPredictor(label=label, eval_metric="f1")
predictor = predictor.fit(train_data)

In [None]:
#predictor.evaluate(TabularDataset(df_purchase_test), silent=True)

In [None]:
# Leaderboard of the fitted models, computed with df_purchase_test as the data argument.
predictor.leaderboard(TabularDataset(df_purchase_test))