In [2]:
#imports 
from matplotlib import pyplot as plt
import numpy as np
#setup
%matplotlib ipympl

# Dataset

In [3]:
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer

# synthetic dataset for classification (binary) 
X_C2, y_C2 = make_classification(n_samples = 100, n_features=2,
                                n_redundant=0, n_informative=2,
                                n_clusters_per_class=1, flip_y = 0.1,
                                class_sep = 0.5, random_state=0)

# more difficult synthetic dataset for classification (binary) 
# with classes that are not linearly separable
X_D2, y_D2 = make_blobs(n_samples = 100, n_features = 2, centers = 8,
                       cluster_std = 1.3, random_state = 4)


# Breast cancer dataset for classification
cancer = load_breast_cancer()
(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)


# Classifiers

## KNeighborsClassifier

In [6]:
from sklearn.model_selection import train_test_split


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, 
                                                    random_state=0)

In [13]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.score(X_train, y_train)

0.84

### Effect of varing K

In [12]:

from sklearn.neighbors import KNeighborsClassifier
for i in range(1,10):
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(X_train, y_train)
    score = knn.score(X_train, y_train)
    score_test = knn.score(X_test, y_test)
    print(f'When K = {i}, the score was: {score} and on test: {score_test}')

When K = 1, the score was: 1.0 and on test: 0.8
When K = 2, the score was: 0.88 and on test: 0.8
When K = 3, the score was: 0.92 and on test: 0.72
When K = 4, the score was: 0.9066666666666666 and on test: 0.8
When K = 5, the score was: 0.84 and on test: 0.76
When K = 6, the score was: 0.8666666666666667 and on test: 0.76
When K = 7, the score was: 0.7866666666666666 and on test: 0.8
When K = 8, the score was: 0.7866666666666666 and on test: 0.8
When K = 9, the score was: 0.7733333333333333 and on test: 0.8


### Effect of scaleing

In [14]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)
train = knn.score(X_train_scaled, y_train)
test = knn.score(X_test_scaled, y_test)
print(f'train score: {train}')
print(f'test score: {test}')

train score: 0.84
test score: 0.84


## Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression().fit(X_train, y_train)
clf.score(X_train, y_train)

0.8133333333333334

## Effect of Vareing C

In [24]:
for i in range(2,2000,200):

    clf = LogisticRegression(C=i).fit(X_train, y_train)
    score = clf.score(X_train, y_train)
    score = clf.score(X_train, y_train)
    score_test = clf.score(X_test, y_test)
    print(f'When C = {i}, the score was: {score} and on test: {score_test}')    

When C = 2, the score was: 0.8266666666666667 and on test: 0.84
When C = 202, the score was: 0.8133333333333334 and on test: 0.84
When C = 402, the score was: 0.8133333333333334 and on test: 0.84
When C = 602, the score was: 0.8133333333333334 and on test: 0.84
When C = 802, the score was: 0.8133333333333334 and on test: 0.84
When C = 1002, the score was: 0.8133333333333334 and on test: 0.84
When C = 1202, the score was: 0.8133333333333334 and on test: 0.84
When C = 1402, the score was: 0.8133333333333334 and on test: 0.84
When C = 1602, the score was: 0.8133333333333334 and on test: 0.84
When C = 1802, the score was: 0.8133333333333334 and on test: 0.84


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)

clf = LogisticRegression().fit(X_train, y_train)
print('Breast cancer dataset')
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Breast cancer dataset
Accuracy of Logistic regression classifier on training set: 0.95
Accuracy of Logistic regression classifier on test set: 0.95


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Support Vector Machines

In [36]:
from sklearn.svm import SVC
clf = SVC(kernel = 'linear').fit(X_train, y_train)
clf.score(X_train,y_train)

0.9671361502347418

### Effect of varing C

In [32]:
for i in range(1,10):
    clf = SVC(kernel = 'linear', C=i).fit(X_train, y_train)
    clf.score(X_train,y_train)
    score = clf.score(X_train, y_train)
    score_test = clf.score(X_test, y_test)
    print(f'When C = {i}, the score was: {score} and on test: {score_test}')

When C = 1, the score was: 0.9671361502347418 and on test: 0.958041958041958
When C = 2, the score was: 0.9671361502347418 and on test: 0.965034965034965
When C = 3, the score was: 0.971830985915493 and on test: 0.951048951048951
When C = 4, the score was: 0.971830985915493 and on test: 0.951048951048951
When C = 5, the score was: 0.9694835680751174 and on test: 0.9440559440559441
When C = 6, the score was: 0.9765258215962441 and on test: 0.958041958041958
When C = 7, the score was: 0.9765258215962441 and on test: 0.958041958041958
When C = 8, the score was: 0.9765258215962441 and on test: 0.958041958041958
When C = 9, the score was: 0.9765258215962441 and on test: 0.958041958041958


## Linear SVM

In [37]:
from sklearn.svm import LinearSVC
clf = LinearSVC().fit(X_train, y_train)
clf.score(X_train, y_train)



0.4812206572769953

## Decision Trees

In [8]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier().fit(X_train, y_train)
clf.score(X_train, y_train)

1.0

In [12]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from adspy_shared_utilities import plot_decision_tree
from sklearn.model_selection import train_test_split


iris = load_iris()

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 3)
clf = DecisionTreeClassifier().fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.95


In [13]:
plot_decision_tree(clf, iris.feature_names, iris.target_names)

ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

<graphviz.sources.Source at 0x193196ddff0>