# # Imbalanced dataset -- cannot depend on accuracy alone as class =1 might be too small

In [4]:
from sklearn.datasets import load_digits
digits = load_digits()

y = digits.target == 9 

from sklearn.model_selection import train_test_split

X_train,X_test,y_train, y_test= train_test_split(digits.data,y,random_state=0)

In [5]:
from sklearn.dummy import DummyClassifier
import numpy as np
dummy_majority = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)

my_pred = dummy_majority.predict(X_test)

print("unique predicted labels: {}".format(np.unique(my_pred)))
print("Test score: {:3f}".format(dummy_majority.score(X_test,y_test)))

unique predicted labels: [False]
Test score: 0.895556


In [6]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=2).fit(X_train,y_train)

#pred_tree = tree.predict(X_test)
print("Test score: {:3f}".format(tree.score(X_test,y_test)))

Test score: 0.917778


In [7]:
from sklearn.linear_model import LogisticRegression

dummy = DummyClassifier().fit(X_train, y_train)
print(" dummy Test score: {:3f}".format(dummy.score(X_test,y_test)))

logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
print(" LogReg Test score: {:3f}".format(logreg.score(X_test,y_test)))

 dummy Test score: 0.786667




 LogReg Test score: 0.977778


In [8]:
from sklearn.metrics import confusion_matrix

print(" with dummy most frequent:")
confusion = confusion_matrix(y_test, dummy_majority.predict(X_test))
print(confusion)


print(" tree with max depth of 2:")
confusion = confusion_matrix(y_test, tree.predict(X_test))
print(confusion)

print(" simply dummy classifier:")
confusion = confusion_matrix(y_test, dummy.predict(X_test))
print(confusion)


print(" logostic regression:")
confusion = confusion_matrix(y_test, logreg.predict(X_test))
print(confusion)

 with dummy most frequent:
[[403   0]
 [ 47   0]]
 tree with max depth of 2:
[[390  13]
 [ 24  23]]
 simply dummy classifier:
[[370  33]
 [ 40   7]]
 logostic regression:
[[401   2]
 [  8  39]]


In [9]:
from sklearn.metrics import f1_score

print(" with dummy most frequent f1 score :")
print(f1_score(y_test,dummy_majority.predict(X_test)))

print(" tree with max depth of 2 f1 score :")
print(f1_score(y_test,tree.predict(X_test)))

print(" simply dummy classifier f1 score:")
print(f1_score(y_test,dummy.predict(X_test)))


print(" logostic regression f1 score :")
print(f1_score(y_test,logreg.predict(X_test)))


 with dummy most frequent f1 score :
0.0
 tree with max depth of 2 f1 score :
0.5542168674698795
 simply dummy classifier f1 score:
0.048780487804878044
 logostic regression f1 score :
0.8863636363636364


  'precision', 'predicted', average, warn_for)


In [10]:
from sklearn.metrics import classification_report

print(" with dummy most frequent report :")
print(classification_report(y_test,dummy_majority.predict(X_test), target_names=["not 9","9"]))

print(" tree with max depth of 2 report :")
print(classification_report(y_test,tree.predict(X_test), target_names=["not 9","9"]))

print(" simply dummy classifier report:")
print(classification_report(y_test,dummy.predict(X_test), target_names=["not 9","9"]))


print(" logostic regression report :")
print(classification_report(y_test,logreg.predict(X_test), target_names=["not 9","9"]))

 with dummy most frequent report :
              precision    recall  f1-score   support

       not 9       0.90      1.00      0.94       403
           9       0.00      0.00      0.00        47

    accuracy                           0.90       450
   macro avg       0.45      0.50      0.47       450
weighted avg       0.80      0.90      0.85       450

 tree with max depth of 2 report :
              precision    recall  f1-score   support

       not 9       0.94      0.97      0.95       403
           9       0.64      0.49      0.55        47

    accuracy                           0.92       450
   macro avg       0.79      0.73      0.75       450
weighted avg       0.91      0.92      0.91       450

 simply dummy classifier report:
              precision    recall  f1-score   support

       not 9       0.91      0.89      0.90       403
           9       0.19      0.21      0.20        47

    accuracy                           0.82       450
   macro avg       0.55  

  'precision', 'predicted', average, warn_for)


# Lets do PRECISION  as evaluation metric

In [None]:
from sklearn.svm import SVC 
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

data = load_breast_cancer()

X = data.data 
y = data.target


x_train_org,x_test_org,y_train,y_test=train_test_split(X,y,random_state=0)  ## org stands for the very original
scaler = StandardScaler()
X_train = scaler.fit_transform(x_train_org)  ### you can fit and transform together 
X_test = scaler.transform(x_test_org)  

svc= SVC(kernel='rbf', C =0.1, gamma=10)
svc.fit(X_train, y_train)

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt

precision, recall, thresholds = precision_recall_curve(y_test, svc.decision_function(X_test))

close_zero = np.argmin(np.abs(thresholds))

plt.plot(precision[close_zero],recall[close_zero],'o',markersize=10,
        label='threshold zero', fillstyle='none' , c='k',mew=2)

plt.plot(precision,recall, label='precision recall label')
plt.xlabel('Precision')
plt.ylabel('recall')
plt.legend(loc='best')

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt

from sklearn.metrics import roc_curve

fpr,tpr,thresholds= roc_curve(y_test, svc.decision_function(X_test))
plt.plot(fpr,tpr,label='roc_curve')
plt.xlabel('FPR')
plt.ylabel('TPR OR RECALL')

## find threshold closets to zero


close_zero = np.argmin(np.abs(thresholds))

plt.plot(fpr[close_zero],tpr[close_zero],'o',markersize=10,
        label='threshold zero', fillstyle='none' , c='k',mew=2)

plt.plot(precision,recall, label='precision recall label')

plt.legend(loc=4)