In [1]:
import sklearn
# The datasets module in sklearn (Scikit-learn) provides utilities to load datasets, 
# including methods to load and fetch popular reference datasets. 
from sklearn import datasets
from sklearn import svm
# the metrics module provides a comprehensive set of tools to evaluate and 
# measure the performance, similarity, and distance of machine learning models and data points.
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load scikit-learn's bundled breast-cancer dataset, then show the names of
# the feature columns followed by the names of the target classes.
cancer = datasets.load_breast_cancer()
for label_names in (cancer.feature_names, cancer.target_names):
    print(label_names)

['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
['malignant' 'benign']


In [3]:
# x: feature matrix (one row per sample), y: class label for each row.
x, y = cancer.data, cancer.target

In [4]:
# Hold out 20% of the samples for evaluation. random_state pins the shuffle so
# the split (and every accuracy printed below) is reproducible on re-run.
# train_test_split is imported explicitly in the imports cell: attribute access
# via `sklearn.model_selection` only works if some other import happened to
# load that submodule first.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [5]:
# Sanity check of the training split: prints the feature matrix followed by
# the label vector (NumPy abbreviates the large matrix with "...").
print(x_train, y_train)

[[9.029e+00 1.733e+01 5.879e+01 ... 1.750e-01 4.228e-01 1.175e-01]
 [2.109e+01 2.657e+01 1.427e+02 ... 2.903e-01 4.098e-01 1.284e-01]
 [9.173e+00 1.386e+01 5.920e+01 ... 5.087e-02 3.282e-01 8.490e-02]
 ...
 [1.429e+01 1.682e+01 9.030e+01 ... 3.333e-02 2.458e-01 6.120e-02]
 [1.398e+01 1.962e+01 9.112e+01 ... 1.827e-01 3.179e-01 1.055e-01]
 [1.218e+01 2.052e+01 7.722e+01 ... 7.431e-02 2.694e-01 6.878e-02]] [1 0 1 1 1 0 1 1 1 0 1 0 0 1 1 0 0 0 1 1 1 0 1 1 1 0 1 0 1 1 0 1 0 0 0 1 0
 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0 1 1
 0 1 0 1 1 1 1 0 1 1 0 1 1 1 0 1 0 0 1 1 1 0 1 1 1 1 0 1 1 1 1 1 0 1 0 0 1
 1 0 1 1 1 1 1 1 1 0 0 0 1 1 0 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 0 1 0 1 0 1 0
 1 1 0 1 1 1 1 0 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1
 0 0 1 1 0 1 0 0 0 1 1 1 0 1 1 0 1 0 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 0 0
 1 1 0 0 0 1 1 1 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 0 1 1 0
 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 0 1 0 0 0 1 0 1 1 

In [6]:
# Human-readable class names, ordered to match cancer.target_names, so a
# predicted label (0 or 1) can be used directly as an index into this list.
# The comma matters: without it Python concatenates the two adjacent string
# literals into the single element 'malignantbenign'.
classes = ['malignant', 'benign']

In [7]:
'''
SVM - support vector machines
creates a hyperplane (the same distance to a pair of points of the different classes)
so we can have infinite number of hyperplanes
How to pick the best one? Choose the hyperplane whose closest points from each class are as far away from it as possible, i.e. we want to maximize the margin
What if the points are mixed together? We use kernels - a function that maps the features into a higher dimension, where the data are no longer mixed together
kernels don't always work, and we would repeat the process 
Soft margin vs hard margin
Soft margin - allow for a few points to exist in between, in order to get a better classifier
'''

"\nSVM - support vector machines\ncreates a hyperplane (the same distance to a pair of points of the different classes)\nso we can have infinite number of hyperplanes\nHow to pick the best one? Find the two cloest points to the line that are furthest possible away aka. we want to maximize margin\nWhat if dots are meshed up? We use Kernels - a function takes in features to a higher dimension, and data are not meshed together\nkernels don't always work, and we would repeat the process \nSoft margin vs hard margin\nSoft margin - allow for a few points to exist in between, in order to get a better classifier\n"

In [8]:
# Model 1: support vector classifier with every hyperparameter left at its default.
clf1 = svm.SVC().fit(x_train, y_train)

# Evaluate on the held-out split. Accuracy is the fraction of positions where
# the two label sequences agree, so it is symmetric in its two arguments.
y_pred = clf1.predict(x_test)
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc)


0.9473684210526315


In [9]:
# Model 2: same classifier, but with a linear kernel instead of the default one.
clf2 = svm.SVC(kernel="linear").fit(x_train, y_train)

# Evaluate on the held-out split; accuracy only counts matching labels, so the
# argument order does not affect the score.
y_pred = clf2.predict(x_test)
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc)

0.956140350877193


In [10]:
# Model 3: linear kernel with C=2. C tunes the soft margin: it is the penalty
# applied to margin violations (larger C = stricter margin), not a literal
# count of points allowed inside the margin.
clf3 = svm.SVC(kernel="linear", C=2).fit(x_train, y_train)

# Evaluate on the held-out split.
y_pred = clf3.predict(x_test)
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc)

0.956140350877193


In [11]:
# Model 4, for comparison: k-nearest-neighbours, voting over the 13 closest
# training samples.
clf4 = KNeighborsClassifier(n_neighbors=13).fit(x_train, y_train)

# Evaluate on the held-out split.
y_pred = clf4.predict(x_test)
acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc)

# Different n_neighbors values can produce exactly the same accuracy.
# Example (binary classification): if 5 of 7 neighbours vote for class 1 and
# 2 vote for class 0, then growing the neighbourhood to 9 still yields a
# majority for class 1 even when both new neighbours vote for class 0.

0.9736842105263158


In [12]:
# Usually KNN does not work as well as SVM for datasets with a lot of features (here there are 30). 
# But the result here is surprising, which is why we need to try out different machine learning algorithms.

# Even when two models produce similar (or identical) accuracy, the underlying decision process is different:

# SVM tries to maximize the margin between the two classes.
# KNN bases its decision on the proximity of data points.