<a href="https://colab.research.google.com/github/nickprock/corso_data_science/blob/master/imbalanced_classification/Class_Weights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imbalanced Classification
## Class Weights

Esempio con una regressione logistica su un dataset sintetico, pensato solo per mostrare l'iperparametro da impostare (`class_weight`) e la learning curve come strumento di visualizzazione.

*N.B. esperimento a scopo didattico, scusate se il processo non è del tutto corretto*

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

Creazione del dataset

In [None]:
# Synthetic 4-feature classification problem; weights=[0.99] targets roughly
# 99% of the samples in the first class, leaving about 1% for the other one.
X, y = make_classification(
    n_samples=100_000,
    n_features=4,
    weights=[0.99],
    flip_y=0.001,
    random_state=1,
)

In [None]:
# Number of samples per class label (index i of the result counts label i).
np.bincount(y)

In [None]:
# Scatter the first two features, colouring every point by its class label.
plt.figure(figsize=(18, 10))
plt.scatter(
    X[:, 0],
    X[:, 1],
    s=100,
    c=y,
)
plt.title("Imbalanced Dataset?\n 99% - 1%")
plt.show()

In [None]:
# Hold out 33% of the samples as a test set; stratifying on y keeps the class
# proportions of the full dataset in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [None]:
# 5-fold stratified cross-validation splitter shared by both learning curves.
# shuffle=True is required for random_state to have any effect: scikit-learn
# >= 0.24 raises a ValueError when random_state is set while shuffle is False
# (older releases silently ignored the seed).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
def learning_plot(train_sizes, train_scores, test_scores, title):
  """Draw training and validation recall curves on the current pyplot figure.

  Each score array is reduced along axis 1; the resulting mean is drawn as a
  line with markers and a band of one standard deviation either side of it.

  Args:
    train_sizes: x-axis values, one per row of the score arrays.
    train_scores: 2-D array of training scores (rows are averaged along axis 1).
    test_scores: 2-D array of validation scores, same layout as train_scores.
    title: text used as the plot title.

  The function calls plt.show() and returns nothing.
  """
  # Compute every statistic up front (training first, then validation) so that
  # nothing is drawn if one of the score arrays cannot be reduced.
  curves = [
      (np.mean(scores, axis=1), np.std(scores, axis=1), colour, marker, label)
      for scores, colour, marker, label in (
          (train_scores, 'blue', 'o', 'Training Recall'),
          (test_scores, 'green', '*', 'Validation recall'),
      )
  ]

  for avg, spread, colour, marker, label in curves:
    plt.plot(train_sizes, avg, color=colour, marker=marker, markersize=5, label=label)
    plt.fill_between(train_sizes, avg + spread, avg - spread, alpha=0.15, color=colour)

  plt.grid()
  plt.xlabel('Number of Training Example')
  plt.ylabel('Recall Score')
  plt.legend()
  plt.title(title)
  plt.show()

In [None]:
# Baseline model: logistic regression with no class re-weighting.
lr = LogisticRegression(
    class_weight=None,
    random_state=42,
)

In [None]:
# Recall learning curve for the unweighted model: ten training-set fractions
# from 10% to 100% of X_train, each evaluated with the skf cross-validation folds.
train_sizes_lr, train_scores_lr, val_scores_lr = learning_curve(
    lr,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=skf,
    n_jobs=-1,
    scoring="recall",
    verbose=2,
)

In [None]:
# Open a 10x8 figure, then draw the unweighted model's learning curve into it.
plt.figure(figsize=(10, 8))
learning_plot(
    train_sizes=train_sizes_lr,
    train_scores=train_scores_lr,
    test_scores=val_scores_lr,
    title="LR not weighted",
)

In [None]:
# Mean training recall of the unweighted model over every entry of the
# learning-curve score array, skipping NaN values.
np.nanmean(train_scores_lr)

In [None]:
# Mean cross-validation recall of the unweighted model over every entry of
# the learning-curve score array, skipping NaN values.
np.nanmean(val_scores_lr)

In [None]:
# Refit the unweighted model on the full training split and score it on the
# held-out test split.
yhat_lr = lr.fit(X_train, y_train).predict(X_test)

# Each metric is followed by print("\n"), i.e. two blank lines of spacing.
for metric_label, metric_fn in (
    ("accuracy: ", accuracy_score),
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("F1: ", f1_score),
):
    print(metric_label, metric_fn(y_test, yhat_lr))
    print("\n")

# Extra spacing before the confusion matrix.
print("\n")
print(confusion_matrix(y_test, yhat_lr))

In [None]:
# Besides "balanced", explicit per-class weights can be supplied as a dict
# mapping class label -> weight (see the scikit-learn documentation).
lr_w = LogisticRegression(
    class_weight="balanced",
    random_state=42,
)

In [None]:
# Recall learning curve for the class-weighted model, using the same training
# fractions and skf folds as the unweighted run. Note that test_scores_lrw holds
# the cross-validation scores computed inside X_train, not scores on X_test.
train_sizes_lrw, train_scores_lrw, test_scores_lrw = learning_curve(
    lr_w,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=skf,
    n_jobs=-1,
    scoring='recall',
    verbose=2,
)

In [None]:
# Open a 10x8 figure, then draw the class-weighted model's learning curve into it.
plt.figure(figsize=(10, 8))
learning_plot(
    train_sizes=train_sizes_lrw,
    train_scores=train_scores_lrw,
    test_scores=test_scores_lrw,
    title="LR weighted",
)

In [None]:
# Mean training recall of the class-weighted model over every entry of the
# learning-curve score array, skipping NaN values.
np.nanmean(train_scores_lrw)

In [None]:
# Mean cross-validation recall of the class-weighted model (the array is named
# "test" but comes from learning_curve on X_train), skipping NaN values.
np.nanmean(test_scores_lrw)

In [None]:
# Refit the class-weighted model on the full training split and score it on
# the held-out test split.
yhat_lrw = lr_w.fit(X_train, y_train).predict(X_test)

# Each metric is followed by print("\n"), i.e. two blank lines of spacing.
for metric_label, metric_fn in (
    ("accuracy: ", accuracy_score),
    ("precision: ", precision_score),
    ("recall: ", recall_score),
    ("F1: ", f1_score),
):
    print(metric_label, metric_fn(y_test, yhat_lrw))
    print("\n")

# Extra spacing before the confusion matrix.
print("\n")
print(confusion_matrix(y_test, yhat_lrw))