# Concept of Dammy Classifier

In [1]:
# dependances and data setup
%matplotlib ipympl
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits

dataset = load_digits()

X = dataset.data
y = dataset.target

for class_name, class_count in zip(dataset.target_names, np.bincount(dataset.target)):
    print(class_name,class_count)

0 178
1 182
2 177
3 183
4 181
5 182
6 181
7 179
8 174
9 180


In [2]:
# Creating a dataset with imbalanced binary classes:  
# Negative class (0) is 'not digit 1' 
# Positive class (1) is 'digit 1'
y_binary_imbalanced = y.copy()
y_binary_imbalanced[y_binary_imbalanced != 1] = 0

print('Original labels:\t', y[1:30])
print('New binary labels:\t', y_binary_imbalanced[1:30])

Original labels:	 [1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
New binary labels:	 [1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]


In [3]:
np.bincount(y_binary_imbalanced)    # Negative class (0) is the most frequent class

array([1615,  182], dtype=int64)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y_binary_imbalanced, random_state=0)

# Accuracy of Support Vector Machine classifier
from sklearn.svm import SVC

svm = SVC(kernel='rbf', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)

0.9955555555555555

In [5]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy = 'most_frequent').fit(X_train, y_train)
dummy.score(X_test, y_test)

0.9044444444444445

In [7]:
# using a linear kernel
svm = SVC(kernel='linear', C=1).fit(X_train, y_train)
svm.score(X_test, y_test)

0.9777777777777777

# Confusion matrices

In [13]:
dummy_most = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
dummy_stra = DummyClassifier(strategy='stratified').fit(X_train, y_train)

In [14]:
from sklearn.metrics import confusion_matrix

In [15]:
#dammy most frequent stratagy
y_prediction = dummy_most.predict(X_test)
confusion = confusion_matrix(y_test, y_prediction)
confusion

array([[407,   0],
       [ 43,   0]], dtype=int64)

In [16]:
#dammy stratified stratagy
y_prediction = dummy_stra.predict(X_test)
confusion = confusion_matrix(y_test, y_prediction)
confusion

array([[377,  30],
       [ 35,   8]], dtype=int64)

In [17]:
# svm confusion matrix
y_prediction = svm.predict(X_test)
confusion = confusion_matrix(y_test, y_prediction)
confusion

array([[402,   5],
       [  5,  38]], dtype=int64)

In [18]:
#using radial
svm = SVC(kernel='rbf', C=1).fit(X_train, y_train)
y_prediction = svm.predict(X_test)
confusion = confusion_matrix(y_test, y_prediction)
confusion

array([[407,   0],
       [  2,  41]], dtype=int64)

In [19]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression().fit(X_train, y_train)
y_prediction = lr.predict(X_test)
confusion = confusion_matrix(y_test, y_prediction)
confusion

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[401,   6],
       [  8,  35]], dtype=int64)

In [20]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier().fit(X_train, y_train)
y_prediction = dt.predict(X_test)
confusion = confusion_matrix(y_test, y_prediction)
confusion

array([[400,   7],
       [ 10,  33]], dtype=int64)