# Getting started with scikit-learn

Let us first use the function `make_blobs` from `sklearn.datasets` to
generate three two-dimensional data sets with three classes each, where the classes have a different degree of overlap.

#### First dataset with 3 clusters and cluster standard deviation = 0.85 

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from scipy import stats
from sklearn import neighbors, datasets
from sklearn.metrics import accuracy_score
# use seaborn plotting defaults
import seaborn as sns; sns.set()

In [None]:
# make_blobs is part of the public sklearn.datasets namespace; the old
# sklearn.datasets.samples_generator module path no longer exists in
# current scikit-learn releases, so import from the public location.
from sklearn.datasets import make_blobs

# Standard deviation of each blob around its centre (first data set)
deviation_1 = 0.85

# 500 samples drawn around three explicitly placed centres; the fixed
# random_state makes the draw reproducible
X1, y1 = make_blobs(n_samples=500, centers=[[0, 0], [-10, -10], [-15, 15]],
                    random_state=0, cluster_std=deviation_1)

# Colour maps shared by all later cells: light shades for the decision
# regions, bold shades for the data points
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])


# Scatter the samples, coloured by class label (the trailing semicolons
# keep the notebook from echoing the returned matplotlib objects)
plt.scatter(X1[:, 0], X1[:, 1], s=20, c=y1, cmap=cmap_bold);
plt.title("CASE I : Random points with 3 classes (cluster_SD = %f)" % deviation_1);

Let us now see the classification error of a `5-NN classifier` for these settings.

In [None]:
# k: how many nearest neighbours take part in each vote
n_neighbors = 5

# k-NN classifier for the first data set
clf_1 = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)

# Train on the generated samples; kept as the last expression so the
# notebook still displays the fitted estimator
clf_1.fit(X1, y1)

In [None]:
# Draw the decision regions of clf_1: classify every node of a fine grid
# covering the data and colour the grid by predicted class.

h = .02  # step size in the mesh

# Bounding box of the data, padded by one unit on each side
x1_min = X1[:, 0].min() - 1
x1_max = X1[:, 0].max() + 1
y1_min = X1[:, 1].min() - 1
y1_max = X1[:, 1].max() + 1

xx1, yy1 = np.meshgrid(
    np.arange(x1_min, x1_max, h),
    np.arange(y1_min, y1_max, h),
)

# Predict a class for each grid node, then fold the flat result back
# into the shape of the mesh
grid_1 = np.c_[xx1.ravel(), yy1.ravel()]
Z1 = clf_1.predict(grid_1)
Z1 = Z1.reshape(xx1.shape)

# Background: predicted class per grid node
plt.figure()
plt.pcolormesh(xx1, yy1, Z1, cmap=cmap_light)

# Foreground: the training samples themselves
plt.scatter(X1[:, 0], X1[:, 1], c=y1, cmap=cmap_bold)
plt.xlim(xx1.min(), xx1.max())
plt.ylim(yy1.min(), yy1.max())
plt.title(" CASE I : 3-Class classification (k = %i)" % n_neighbors)
plt.show()

In [None]:
# Mean accuracy of clf_1 on the samples it was fitted on
print("training score : %.3f " % clf_1.score(X1, y1))

# Predict labels for the same samples the model was trained on; no
# held-out test set exists for this data set, so this is NOT a test score
Z = clf_1.predict(X1)

# accuracy_score takes (y_true, y_pred): true labels first
print("prediction accuracy: %.3f " % accuracy_score(y1, Z))


#### Second dataset with 3 clusters and cluster standard deviation = 1.0

In [None]:
# Standard deviation of each blob around its centre (second data set)
deviation_2 = 1.0

# 200 samples in 3 blobs; centres are left to make_blobs, seed is fixed
X2, y2 = make_blobs(
    n_samples=200,
    centers=3,
    cluster_std=deviation_2,
    random_state=0,
)

# Scatter the samples coloured by class label (trailing semicolons keep
# the notebook from echoing the returned matplotlib objects)
plt.scatter(X2[:, 0], X2[:, 1], c=y2, cmap=cmap_bold, s=20);
plt.title("CASE II : Random points with 3 classes (cluster_SD = %f)" % deviation_2);

In [None]:
# k: how many nearest neighbours take part in each vote
n_neighbors = 5

# k-NN classifier for the second data set, trained on all of its samples
clf_2 = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
clf_2.fit(X2, y2)

# Draw the decision regions: classify every node of a fine grid covering
# the data and colour the grid by predicted class.

h = .02  # step size in the mesh

# Bounding box of the data, padded by one unit on each side
x2_min = X2[:, 0].min() - 1
x2_max = X2[:, 0].max() + 1
y2_min = X2[:, 1].min() - 1
y2_max = X2[:, 1].max() + 1

xx2, yy2 = np.meshgrid(
    np.arange(x2_min, x2_max, h),
    np.arange(y2_min, y2_max, h),
)

# Predict a class for each grid node, then fold the flat result back
# into the shape of the mesh
grid_2 = np.c_[xx2.ravel(), yy2.ravel()]
Z2 = clf_2.predict(grid_2)
Z2 = Z2.reshape(xx2.shape)

# Background: predicted class per grid node
plt.figure()
plt.pcolormesh(xx2, yy2, Z2, cmap=cmap_light)

# Foreground: the training samples themselves
plt.scatter(X2[:, 0], X2[:, 1], c=y2, cmap=cmap_bold)
plt.xlim(xx2.min(), xx2.max())
plt.ylim(yy2.min(), yy2.max())
plt.title(" CASE II : 3-Class classification (k = %i)" % n_neighbors)
plt.show()

In [None]:
# Mean accuracy of clf_2 on the samples it was fitted on
print("training score : %.3f " % clf_2.score(X2, y2))

# Predict labels for the same samples the model was trained on; no
# held-out test set exists for this data set, so this is NOT a test score
Z = clf_2.predict(X2)

# accuracy_score takes (y_true, y_pred): true labels first
print("prediction accuracy: %.3f " % accuracy_score(y2, Z))


#### Third dataset with 3 clusters and cluster standard deviation = 3

In [None]:
# Standard deviation of each blob around its centre (third data set)
deviation_3 = 3

# 500 samples in 3 blobs; centres are left to make_blobs, seed is fixed
X3, y3 = make_blobs(
    n_samples=500,
    centers=3,
    cluster_std=deviation_3,
    random_state=0,
)

# Scatter the samples coloured by class label (trailing semicolons keep
# the notebook from echoing the returned matplotlib objects)
plt.scatter(X3[:, 0], X3[:, 1], c=y3, cmap=cmap_bold, s=20);
plt.title("CASE III : Random points with 3 classes (cluster_SD = %f)" % deviation_3);

In [None]:
# k: how many nearest neighbours take part in each vote
n_neighbors = 5

# k-NN classifier for the third data set, trained on all of its samples
clf_3 = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
clf_3.fit(X3, y3)

# Draw the decision regions: classify every node of a fine grid covering
# the data and colour the grid by predicted class.

h = .02  # step size in the mesh

# Bounding box of the data, padded by one unit on each side
x3_min = X3[:, 0].min() - 1
x3_max = X3[:, 0].max() + 1
y3_min = X3[:, 1].min() - 1
y3_max = X3[:, 1].max() + 1

xx3, yy3 = np.meshgrid(
    np.arange(x3_min, x3_max, h),
    np.arange(y3_min, y3_max, h),
)

# Predict a class for each grid node, then fold the flat result back
# into the shape of the mesh
grid_3 = np.c_[xx3.ravel(), yy3.ravel()]
Z3 = clf_3.predict(grid_3)
Z3 = Z3.reshape(xx3.shape)

# Background: predicted class per grid node
plt.figure()
plt.pcolormesh(xx3, yy3, Z3, cmap=cmap_light)

# Foreground: the training samples themselves
plt.scatter(X3[:, 0], X3[:, 1], c=y3, cmap=cmap_bold)
plt.xlim(xx3.min(), xx3.max())
plt.ylim(yy3.min(), yy3.max())
plt.title(" CASE III : 3-Class classification (k = %i)" % n_neighbors)
plt.show()

In [None]:
# Mean accuracy of clf_3 on the samples it was fitted on
print("training score : %.3f " % clf_3.score(X3, y3))

# Predict labels for the same samples the model was trained on; no
# held-out test set exists for this data set, so this is NOT a test score
Z = clf_3.predict(X3)

# accuracy_score takes (y_true, y_pred): true labels first
print("prediction accuracy: %.3f " % accuracy_score(y3, Z))


Let's now generate another data set by using `make_blobs(n_samples=200, centers=2, n_features=2, cluster_std=5, random_state=42)` and split it into a training set (2/3 of the data) and a test set (1/3 of the data). We will then evaluate the performance of a `k-NN classifier` with respect to training and test error for different choices of k, where the range of k should be chosen such that an optimal choice of k can be inferred from the results.

In [None]:
# Two-class, two-feature blob data set: 200 samples, cluster standard
# deviation 5, fixed seed for reproducibility
X, y = make_blobs(
    n_samples=200,
    n_features=2,
    centers=2,
    cluster_std=5,
    random_state=42,
)

In [None]:
# Scatter the full data set, coloured by class label (the trailing
# semicolon keeps the notebook from echoing the returned object)
plt.scatter(
    X[:, 0], X[:, 1],
    c=y, cmap=cmap_bold, s=20,
);

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33 percent (roughly one third) of the samples for testing and
# keep the remaining 67 percent for training; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=0,
)

Plotting only the training points:


In [None]:
# Scatter only the training split, coloured by class label

train_x, train_y = X_train[:, 0], X_train[:, 1]
plt.scatter(train_x, train_y, c=y_train, cmap=cmap_bold)
plt.title("Training points extracted from the data")
plt.show()

In [None]:
# Scatter only the held-out test split, coloured by class label

test_x, test_y = X_test[:, 0], X_test[:, 1]
plt.scatter(test_x, test_y, c=y_test, cmap=cmap_bold)
plt.title("Test points extracted from the data")
plt.show()

In [None]:
# Sweep the number of neighbours and record, for each k, the accuracy on
# the training split and on the held-out test split.
# d maps k -> [training score, test score], both formatted to 3 decimals.
d = {}

for k in range(5, 50, 5):  # k = 5, 10, ..., 45
    # The loop passes k directly instead of rebinding the notebook-wide
    # n_neighbors, which the plotting cells read for their titles
    classifier = neighbors.KNeighborsClassifier(n_neighbors=k)

    # Fit on the training split only
    classifier.fit(X_train, y_train)

    # Accuracy on the data the model was fitted on
    train_score = classifier.score(X_train, y_train)

    # Accuracy on the unseen test split
    Z = classifier.predict(X_test)
    test_score = accuracy_score(y_test, Z)

    d[k] = ["%.3f" % train_score, "%.3f" % test_score]

# Print the results as a left-aligned table, ordered by k
row = "{:<8} {:<15} {:<10}"
print(row.format('k-value', 'Training score', 'Test score'))
for k_value, (train_str, test_str) in sorted(d.items()):
    print(row.format(k_value, train_str, test_str))


The iris data set is four-dimensional; we have used only the first two
dimensions for classification in order to arrive at a direct visualization. Compare the
result of a k-NN classifier for the first two dimensions of the data to a k-NN classifier where
all dimensions of the data are used. What happens if only the last two dimensions are used?

### Taking only the first two dimensions of the Iris dataset

In [None]:
# k: how many nearest neighbours take part in each vote
n_neighbors = 15
h = .02  # step size in the mesh

# Load the iris data and keep only the first two of its feature columns
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# k-NN classifier trained on the two selected features
clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
clf.fit(X, y)


# Draw the decision regions: classify every node of a fine grid covering
# the data (padded by one unit on each side) and colour it by class.
x_min = X[:, 0].min() - 1
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1
y_max = X[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)
grid_first = np.c_[xx.ravel(), yy.ravel()]
Z = clf.predict(grid_first)

# Fold the flat predictions back into the mesh shape and draw them
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay the training samples
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification with first two dimension(Iris) (k = %i)" % n_neighbors)
plt.show()

In [None]:
# Mean accuracy of clf on the samples it was fitted on
print("training score : %.3f " % clf.score(X, y))

# Predict labels for the same samples the model was trained on; nothing
# was held out here, so this is NOT a test score
Z = clf.predict(X)

# accuracy_score takes (y_true, y_pred): true labels first
print("prediction accuracy: %.3f " % accuracy_score(y, Z))


### Taking only the last two dimensions of the Iris dataset

In [None]:
# k: how many nearest neighbours take part in each vote
n_neighbors = 15
h = .02  # step size in the mesh

# Load the iris data and keep only the last two of its feature columns
iris = datasets.load_iris()
X = iris.data[:, 2:]
y = iris.target

# k-NN classifier trained on the two selected features
clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
clf.fit(X, y)

# Draw the decision regions: classify every node of a fine grid covering
# the data (padded by one unit on each side) and colour it by class.
x_min = X[:, 0].min() - 1
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1
y_max = X[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)
grid_last = np.c_[xx.ravel(), yy.ravel()]
Z = clf.predict(grid_last)

# Fold the flat predictions back into the mesh shape and draw them
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay the training samples
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification with last two dimensions (Iris) (k = %i)" % n_neighbors)
plt.show()

In [None]:
# Mean accuracy of clf on the samples it was fitted on
print("training score : %.3f " % clf.score(X, y))

# Predict labels for the same samples the model was trained on; nothing
# was held out here, so this is NOT a test score
Z = clf.predict(X)

# accuracy_score takes (y_true, y_pred): true labels first
print("prediction accuracy: %.3f " % accuracy_score(y, Z))

### Using all four dimensions of the Iris dataset (without splitting the dataset)

In [None]:
# k: how many nearest neighbours take part in each vote
n_neighbors = 15

# Load the iris data and keep every feature column
iris = datasets.load_iris()
X = iris.data[:, :]  # all features of the data set
y = iris.target

# k-NN classifier trained on the complete, unsplit data set
clf = neighbors.KNeighborsClassifier(n_neighbors)
clf.fit(X, y)

# Mean accuracy of clf on the samples it was fitted on
print("training score : %.3f " % clf.score(X, y))

# Predict labels for the same samples the model was trained on; nothing
# was held out in this cell, so this is NOT a test score
Z = clf.predict(X)

# accuracy_score takes (y_true, y_pred): true labels first
print("prediction accuracy: %.3f " % accuracy_score(y, Z))

### Using all four dimensions of the Iris dataset (splitting the dataset 2/3 training and 1/3 testing)

In [None]:

# Hold out 33 percent (roughly one third) of the samples for testing and
# keep the remaining 67 percent for training; the seed fixes the split.
# X and y are whatever the preceding cell left in scope.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Sweep the number of neighbours and record, for each k, the accuracy on
# the training split and on the held-out test split.
# d maps k -> [training score, test score], both formatted to 3 decimals.
d = {}

for k in range(5, 100, 10):  # k = 5, 15, ..., 95
    # The loop passes k directly instead of rebinding the notebook-wide
    # n_neighbors, which the plotting cells read for their titles
    classifier = neighbors.KNeighborsClassifier(n_neighbors=k)

    # Fit on the training split only
    classifier.fit(X_train, y_train)

    # Accuracy on the data the model was fitted on
    train_score = classifier.score(X_train, y_train)

    # Accuracy on the unseen test split
    Z = classifier.predict(X_test)
    test_score = accuracy_score(y_test, Z)

    d[k] = ["%.3f" % train_score, "%.3f" % test_score]

# Print the results as a left-aligned table, ordered by k
row = "{:<8} {:<15} {:<10}"
print(row.format('k-value', 'Training score', 'Test score'))
for k_value, (train_str, test_str) in sorted(d.items()):
    print(row.format(k_value, train_str, test_str))
