<a href="https://colab.research.google.com/github/nickprock/corso_data_science/blob/master/imbalanced_classification/threshold_moving.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Threshold-Moving

<br>

![into_img](https://www.researchgate.net/publication/344012378/figure/fig1/AS:930866989649922@1598947393917/Example-distributions-of-outputs-of-a-classifier-algorithm-for-a-2-class-problem-The.ppm)

<br>

[Image Credits](https://www.researchgate.net/figure/Example-distributions-of-outputs-of-a-classifier-algorithm-for-a-2-class-problem-The_fig1_344012378)

<br>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score, confusion_matrix, precision_recall_curve, roc_auc_score, roc_curve, f1_score

In [None]:
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

In [None]:
def print_result(y_test, yhat):
    """Print a classification summary for a binary 0/1 problem.

    Prints, in order: the scikit-learn classification report, the confusion
    matrix, then accuracy, precision, recall, F1 and G-Mean. Each scalar is
    rounded to three decimals and followed by a blank line.

    Args:
        y_test: True labels, encoded as 0 (negative) and 1 (positive).
        yhat: Predicted labels, aligned with ``y_test``.

    Returns:
        None. Everything is written to standard output.
    """
    print(classification_report(y_test, yhat), "\n")
    print(confusion_matrix(y_test, yhat), "\n")

    # Compute each metric once and reuse it below.
    accuracy = accuracy_score(y_test, yhat)
    precision = precision_score(y_test, yhat)
    sensitivity = recall_score(y_test, yhat)
    f1 = f1_score(y_test, yhat)
    # Specificity is the recall of the negative class (label 0).
    specificity = recall_score(y_test, yhat, pos_label=0)
    # G-Mean is sqrt(sensitivity * specificity), the same definition used
    # for the ROC-based threshold search; sqrt(precision * recall) is a
    # different index.
    gmeans = np.sqrt(sensitivity * specificity)

    print("accuracy score: ", round(accuracy, 3), "\n")
    print("precision score: ", round(precision, 3), "\n")
    print("recall score: ", round(sensitivity, 3), "\n")
    print("F1 score: ", round(f1, 3), "\n")
    print("GMeans score: ", round(gmeans, 3), "\n")

In [None]:
# Seed passed as random_state to the data generator, the train/test split
# and the dummy classifier so that the notebook is reproducible.
my_seed = 3

In [None]:
# Synthetic two-feature binary problem with roughly a 90/10 class split.
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.90, 0.10],
    random_state=my_seed,
)

In [None]:
# Scatter the two features, coloured by class label.
fig, ax = plt.subplots(figsize=(18, 10))
ax.scatter(X[:, 0], X[:, 1], c=y, s=100)
ax.set_title("Imbalanced Dataset?\n 90% - 10%")
plt.show()

In [None]:
# Hold out 25% of the samples, stratifying on y so both splits keep the
# same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=my_seed,
    stratify=y,
)

## Dummy Classifier
### Build a baseline

In [None]:
# Baseline model using the "most_frequent" strategy.
dummy_clf = DummyClassifier(random_state=my_seed, strategy="most_frequent")

In [None]:
# fit() returns the estimator itself, so training and prediction are chained.
yhat_dummy = dummy_clf.fit(X_train, y_train).predict(X_test)

In [None]:
# Report the baseline scores on the held-out set.
print_result(y_test=y_test, yhat=yhat_dummy)

### SVM Classifier

In [None]:
# probability=True enables predict_proba, whose calibration step shuffles the
# training data internally; seed it like the other randomised steps so the
# probability-based results are reproducible between runs.
svm_clf = SVC(probability=True, random_state=my_seed)

In [None]:
# fit() returns the estimator itself, so training and prediction are chained.
yhat_svm = svm_clf.fit(X_train, y_train).predict(X_test)

In [None]:
# Report the SVM scores at its default decision threshold.
print_result(y_test=y_test, yhat=yhat_svm)

### Threshold-Moving by Precision-Recall AUC

#### Precision and Recall Scores as function of the Decision Threshold

In [None]:
# Continuous decision-function scores for the test set. Despite its name,
# `df` is not a pandas DataFrame; it is the array thresholded in later cells.
df = svm_clf.decision_function(X_test)

In [None]:
# Show the first 50 hard predictions, a blank separator, then the first 50
# decision scores they were derived from.
for chunk in (yhat_svm[:50], "\n", df[:50]):
    print(chunk)

In [None]:
# Precision and recall of the SVM scores at every candidate threshold.
# `precision` and `recall` are sliced with [:-1] whenever they are plotted
# against `threshold` below, because they carry one extra trailing point.
precision, recall, threshold = precision_recall_curve(y_test, df)

In [None]:
fig, ax = plt.subplots(figsize=(18, 10))
# Drop the trailing point of each curve so it lines up with `threshold`.
for values, colour, name in ((precision, "r", "PRECISION"), (recall, "b", "RECALL")):
    ax.plot(threshold, values[:-1], c=colour, label=name)
ax.grid()
ax.legend()
ax.set_xlabel("Threshold")
ax.set_title("Precision and Recall Scores as function of the Decision Threshold")
plt.show()

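We choose the threshold at the intersection of the two curves, where precision and recall are equal. This is a balanced compromise between the two, although in this example it is not the best choice.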

In [None]:
# Threshold where the precision and recall curves cross. The curves are
# sampled at discrete thresholds and may never be exactly equal, so take the
# point where they are closest instead of testing floats for equality.
# argmin returns the first minimum, so an exact crossing is still chosen
# when one exists. The trailing point is dropped because it has no threshold.
my_thr = threshold[np.argmin(np.abs(precision[:-1] - recall[:-1]))]

In [None]:
# Label a sample 0 when its score is below the crossing threshold, else 1.
yhat_alt = [0 if score < my_thr else 1 for score in df]

In [None]:
# Scores obtained with the precision/recall crossing threshold.
print_result(y_test=y_test, yhat=yhat_alt)

***Can I improve the performance?*** 

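 Yes, in this example: look at the curves. Past the intersection, recall stays constant over a range of thresholds while precision can only rise, so we can take the highest threshold that keeps the same recall.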

In [None]:
# Index where precision and recall are closest (an exact crossing when one
# exists); avoids an IndexError when the curves are never exactly equal.
crossing = np.argmin(np.abs(precision[:-1] - recall[:-1]))
# Recall is flat over a range of thresholds: among the points sharing the
# crossing's recall, take the last one, i.e. the highest threshold. The
# trailing point is excluded because it has no matching threshold.
same_recall = np.flatnonzero(recall[:-1] == recall[crossing])
my_thr2 = threshold[same_recall[-1]]

In [None]:
# Label a sample 0 when its score is below the second threshold, else 1.
yhat_alt2 = [0 if score < my_thr2 else 1 for score in df]

In [None]:
# Scores obtained with the second threshold.
print_result(y_test=y_test, yhat=yhat_alt2)

#### Precision - Recall Curve

In [None]:
# F-score at every point of the precision-recall curve. Where precision and
# recall are both 0 the ratio is 0/0; define the F-score as 0 there instead
# of NaN, because np.argmax would otherwise return the NaN position.
pr_sum = precision + recall
fscore = np.divide(
    2 * precision * recall,
    pr_sum,
    out=np.zeros_like(pr_sum),
    where=pr_sum > 0,
)
# locate the index of the largest f score; the trailing point is skipped
# because it has no matching entry in `threshold`
ix = np.argmax(fscore[:-1])
print('Best Threshold=%f, F-Score=%.3f' % (threshold[ix], fscore[ix]))

In [None]:
fig, ax = plt.subplots(figsize=(18, 10))
# No-skill reference line: the share of positive samples in the test set.
no_skill = np.count_nonzero(y_test == 1) / len(y_test)
ax.plot([0, 1], [no_skill, no_skill], "r--", label="No Skill")
ax.plot(recall, precision, "g-o", label="Model")
# Highlight the point with the largest F-score.
ax.plot(recall[ix], precision[ix], "ro", label="Best")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.legend()
plt.show()

#### F1-Score as function of the Decision Threshold

In [None]:
fig, ax = plt.subplots(figsize=(18, 10))
# fscore has one more entry than threshold; drop the trailing one to align.
ax.plot(threshold, fscore[:-1], c="black", label="F-Score")
ax.plot(threshold[ix], fscore[ix], "ro", label="Best")
ax.grid()
ax.legend()
ax.set_ylim((0, 1))
ax.set_xlabel("Threshold")
ax.set_ylabel("F-Score")
ax.set_title("F-Score as function of the Decision Threshold")
plt.show()

In [None]:
# Label a sample 0 when its score is below the best-F-score threshold, else 1.
yhat_alt3 = [0 if score < threshold[ix] else 1 for score in df]

In [None]:
# Scores obtained with the F-score-maximising threshold.
print_result(y_test=y_test, yhat=yhat_alt3)

### Threshold-Moving by ROC AUC

In [None]:
# Per-class probability estimates for the test set; column 1 is the score
# used for the ROC analysis in the following cells.
yhat_svm_prob = svm_clf.predict_proba(X_test)

In [None]:
# ROC points computed from the column-1 probabilities.
# NOTE: this rebinds `threshold`, replacing the precision-recall thresholds
# computed earlier; re-run that cell before re-running the PR cells.
fpr, tpr, threshold = roc_curve(y_true=y_test, y_score=yhat_svm_prob[:, 1])

In [None]:
# Area under the ROC curve; left as a bare expression so the notebook shows it.
roc_auc_score(y_true=y_test, y_score=yhat_svm_prob[:, 1])

In [None]:
fig, ax = plt.subplots(figsize=(18, 10))
# The dashed diagonal is the reference line of a no-skill classifier.
ax.plot([0, 1], [0, 1], "r--")
ax.plot(fpr, tpr, "g-o")
ax.legend(["No Skills", "SVM"], loc="lower right")
ax.set_title("ROC AUC")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
plt.show()

In [None]:
# Sensitivity is the true-positive rate; specificity is one minus the
# false-positive rate.
sensitivity = tpr
specificity = 1 - fpr

# G-Mean balances both rates: it is high only when both are high.
gmeans = np.sqrt(sensitivity * specificity)
# locate the index of the largest gmeans
idx = np.argmax(gmeans)
# The value printed is the G-Mean, so label it as such (it was mislabelled
# as an F-Score).
print('Best Threshold=%f, G-Mean=%.3f' % (threshold[idx], gmeans[idx]))

In [None]:
fig, ax = plt.subplots(figsize=(18, 10))
# The dashed diagonal is the reference line of a no-skill classifier.
ax.plot([0, 1], [0, 1], "r--")
ax.plot(fpr, tpr, "g-o")
# Mark the ROC point with the largest G-Mean.
ax.plot(fpr[idx], tpr[idx], "ro")
ax.legend(["No Skills", "SVM", "Best Threshold"], loc="lower right")
ax.set_title("ROC AUC")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
plt.show()

In [None]:
# Label a sample 0 when its column-1 probability is below the best-G-Mean
# threshold, else 1.
yhat_alt4 = [0 if prob < threshold[idx] else 1 for prob in yhat_svm_prob[:, 1]]

In [None]:
# Scores obtained with the G-Mean-maximising probability threshold.
print_result(y_test=y_test, yhat=yhat_alt4)