### Practice I

Here, we will generate two-dimensional data using the `make_blobs` function and
split it into a training set (2/3) and a test set (1/3). We will then evaluate the results of logistic regression with
polynomial features, where the degree of the polynomial is varied. We will also try to see through some tests whether logistic regression is prone to overfitting.

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from scipy import stats
from sklearn import neighbors, datasets
from sklearn.metrics import accuracy_score
# use seaborn plotting defaults
import seaborn as sns; sns.set()

In [None]:
# `sklearn.datasets.samples_generator` was deprecated in scikit-learn 0.22 and
# removed in 0.24; `make_blobs` is available from the public `sklearn.datasets` package.
from sklearn.datasets import make_blobs


# Standard deviation passed to make_blobs for every cluster (also shown in the plot title)
std = 3

# 200 two-dimensional samples in 2 clusters; the fixed random_state keeps the data reproducible
X, y = make_blobs(n_samples=200, centers=2, n_features=2, cluster_std = std, random_state=42)

# Alternative data set with explicit cluster centers:
#X, y = make_blobs(n_samples=500, centers=[[0,0],[5,5]], random_state=0, cluster_std = std)

# Create color maps: light shades for decision regions, bold shades for data points
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])


# Plotting the cluster in different colors for easy visualization
# (trailing semicolons suppress the textual repr of the returned matplotlib objects)
plt.scatter(X[:, 0], X[:, 1], s=20, c=y, cmap=cmap_bold);
plt.title("CASE I : Random points with 2 classes (cluster_SD = %.3f)"%std);

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33 percent (approx. 1/3) of the samples for testing and keep the
# remaining 67 percent (approx. 2/3) for training; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

Showing only the training points in plot:


In [None]:
# Scatter the training samples only, coloured by their class label.
# X_train has two columns, so unpacking its transpose yields the x and y coordinates.
plt.scatter(*X_train.T, c=y_train, cmap=cmap_bold)
plt.title("Training points extracted from the data")
plt.show()

In [None]:
# Scatter the held-out test samples only, coloured by their class label.
# X_test has two columns, so unpacking its transpose yields the x and y coordinates.
plt.scatter(*X_test.T, c=y_test, cmap=cmap_bold)
plt.title("Test points extracted from the data")
plt.show()

#### Read more on Polynomial features here:

https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html


#### Read more on Pipeline here:

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Logistic regression on polynomial features of degree 1.
# C is the inverse of the regularization strength: smaller values mean stronger regularization.
deg = 1
model = make_pipeline(PolynomialFeatures(deg),LogisticRegression(solver='lbfgs',C = 0.01))
# The value returned by fit is the classifier used for prediction and scoring below
clf = model.fit(X_train, y_train)

# create a mesh to plot in, padded by 1 unit around the training data
h = .1  # step size in the mesh
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.contourf(xx, yy, Z, cmap=cmap_light)
plt.axis('tight')

# Plot also the training points
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
# Single title for the figure; the degree is an integer, so format it as one
plt.title("Logistic regression with polynomial degree: %d" % deg)

plt.show()

In [None]:
# print the training scores
print("training score : %.3f" % (clf.score(X_train, y_train)))


# Get the prediction result (for test data)
Z = clf.predict(X_test)

# print the test score
# accuracy_score takes the true labels first and the predictions second
print("prediction accuracy (test score): %.3f " % accuracy_score(y_test, Z))

### Increasing the degree of polynomial to two

In [None]:
# Logistic regression with polynomial of degree 2.
# C is the inverse of the regularization strength: smaller values mean stronger regularization.
deg = 2
model = make_pipeline(PolynomialFeatures(deg),LogisticRegression(solver='lbfgs',C = 0.01))
# The value returned by fit is the classifier used for prediction and scoring below
clf = model.fit(X_train, y_train)

# create a mesh to plot in, padded by 1 unit around the training data
h = .1  # step size in the mesh
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.contourf(xx, yy, Z, cmap=cmap_light)
plt.axis('tight')

# Plot also the training points
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
# Single title for the figure; the degree is an integer, so format it as one
plt.title("Logistic regression with polynomial degree: %d" % deg)

plt.show()

In [None]:
# print the training scores
print("training score : %.3f" % (clf.score(X_train, y_train)))


# Get the prediction result (for test data)
Z = clf.predict(X_test)

# print the test score
# accuracy_score takes the true labels first and the predictions second
print("prediction accuracy (test score): %.3f " % accuracy_score(y_test, Z))

### Increasing the degree of polynomial to three

In [None]:
# Logistic regression with polynomial of degree 3.
# C is the inverse of the regularization strength: smaller values mean stronger regularization.
deg = 3
model = make_pipeline(PolynomialFeatures(deg),LogisticRegression(solver='lbfgs',C = 0.01))
# The value returned by fit is the classifier used for prediction and scoring below
clf = model.fit(X_train, y_train)

# create a mesh to plot in, padded by 1 unit around the training data
h = .1  # step size in the mesh
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.contourf(xx, yy, Z, cmap=cmap_light)
plt.axis('tight')

# Plot also the training points
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
# Single title for the figure; the degree is an integer, so format it as one
plt.title("Logistic regression with polynomial degree: %d" % deg)

plt.show()

In [None]:
# print the training scores
print("training score : %.3f" % (clf.score(X_train, y_train)))


# Get the prediction result (for test data)
Z = clf.predict(X_test)

# print the test score
# accuracy_score takes the true labels first and the predictions second
print("prediction accuracy (test score): %.3f " % accuracy_score(y_test, Z))

### Varying the degrees of the polynomial to check for overfitting

In [None]:
# Maps each polynomial degree to [training score, test score], both formatted to 3 decimals
d = {}

# Vary the polynomial degree from 1 to 9 in steps of 1 (range excludes the stop value 10)
for deg in range(1, 10, 1):
    # Create an instance of the classifier. C is the inverse of the regularization
    # strength: smaller values mean stronger regularization.
    model = make_pipeline(PolynomialFeatures(deg),LogisticRegression(solver='lbfgs',C = 0.01))

    # Fit the training data to the model (once; a second fit on the same data is redundant)
    clf = model.fit(X_train, y_train)

    # Get the training score
    train_score = "%.3f" % clf.score(X_train, y_train)

    # Get the prediction result (for test data)
    Z = clf.predict(X_test)

    # Get the test accuracy
    test_score = "%.3f" % accuracy_score(y_test, Z)

    d[deg] = [train_score, test_score]

# Printing out as a table format; the header uses the same column widths as the rows
print ("{:<18} {:<15} {:<10}".format('Polynomial-degree','Training score','Test score'))
for key, val in sorted(d.items()):
    label, num = val
    print ("{:<18} {:<15} {:<10}".format(key, label, num))


### Practice II

We will now fit a Gaussian naive Bayes classifier to the Iris data set, using
2/3 of the data for training and 1/3 for testing.

In [None]:
from sklearn.naive_bayes import GaussianNB

# Load the Iris dataset and keep only the first two of its four features
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# Hold out 33 percent (approx. 1/3) of the data for testing; the remaining
# 67 percent (approx. 2/3) is used for training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

# Build a Gaussian naive Bayes classifier and fit it to the training data.
# No priors are passed (GaussianNB(priors = None)), so class priors are adapted from the data.
clf = GaussianNB()
clf.fit(X_train, y_train)

# Centers of the fitted Gaussians
centers = clf.theta_

# Mesh covering the training data padded by 1 unit on every side
h = .02  # step size in the mesh
x_min = X_train[:, 0].min() - 1
x_max = X_train[:, 0].max() + 1
y_min = X_train[:, 1].min() - 1
y_max = X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Decision boundary: predict a class for every mesh point in
# [x_min, x_max]x[y_min, y_max], then shape the result back onto the grid
Z = clf.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)

# Colour the mesh by predicted class
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay the training points and the Gaussian centers (drawn as big blobs)
plt.scatter(*X_train.T, c=y_train, cmap=cmap_bold)
plt.scatter(centers[:, 0], centers[:, 1], s=150)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Gaussian naive Bayes classification with first two dimension (Iris)")
plt.show()



In [None]:
# print the training scores
print("training score : %.3f" % (clf.score(X_train, y_train)))

# Get the prediction result (for test data)
Z = clf.predict(X_test)

# print the test score
# accuracy_score takes the true labels first and the predictions second
print("prediction accuracy (test score): %.3f " % accuracy_score(y_test, Z))

#### Now, comparing it with the k-NN classifier, we see that:

In [None]:
n_neighbors = 10  # number of neighbours used by k-NN

h = .02  # step size in the mesh

# create an instance of Neighbours Classifier and fit the data.
clf = neighbors.KNeighborsClassifier(n_neighbors)
clf.fit(X_train, y_train)


# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot also the training points. Only the training split is drawn (not the full
# X, y), so the figure shows the data the classifier was fitted on and matches
# the Gaussian naive Bayes plot it is compared against.
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification with first two dimension(Iris) (k = %i)" % n_neighbors)
plt.show()

In [None]:
# Load the Iris dataset and keep only the last two of its four features
iris = datasets.load_iris()
X = iris.data[:, 2:]
y = iris.target

# Hold out 33 percent (approx. 1/3) of the data for testing; the remaining
# 67 percent (approx. 2/3) is used for training
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0
)

# Build a Gaussian naive Bayes classifier and fit it to the training data.
# No priors are passed (GaussianNB(priors = None)), so class priors are adapted from the data.
clf = GaussianNB()
clf.fit(X_train, y_train)

# Centers of the fitted Gaussians
centers = clf.theta_

# Mesh covering the training data padded by 1 unit on every side
h = .02  # step size in the mesh
x_min = X_train[:, 0].min() - 1
x_max = X_train[:, 0].max() + 1
y_min = X_train[:, 1].min() - 1
y_max = X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Decision boundary: predict a class for every mesh point in
# [x_min, x_max]x[y_min, y_max], then shape the result back onto the grid
Z = clf.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)

# Colour the mesh by predicted class
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Overlay the training points and the Gaussian centers (drawn as big blobs)
plt.scatter(*X_train.T, c=y_train, cmap=cmap_bold)
plt.scatter(centers[:, 0], centers[:, 1], s=150)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Gaussian naive Bayes classification with last two dimension (Iris)")
plt.show()

In [None]:
# print the training scores
print("training score : %.3f" % (clf.score(X_train, y_train)))

# Get the prediction result (for test data)
Z = clf.predict(X_test)

# print the test score
# accuracy_score takes the true labels first and the predictions second
print("prediction accuracy (test score): %.3f " % accuracy_score(y_test, Z))

#### Again, comparing it with the k-NN classifier, we see that:

In [None]:
n_neighbors = 10  # number of neighbours used by k-NN

h = .02  # step size in the mesh

# create an instance of Neighbours Classifier and fit the data.
clf = neighbors.KNeighborsClassifier(n_neighbors)
clf.fit(X_train, y_train)


# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Plot also the training points. Only the training split is drawn (not the full
# X, y), so the figure shows the data the classifier was fitted on and matches
# the Gaussian naive Bayes plot it is compared against.
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap_bold)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("3-Class classification with last two dimension(Iris) (k = %i)" % n_neighbors)
plt.show()

In [None]:
# print the training scores
print("training score : %.3f" % (clf.score(X_train, y_train)))

# Get the prediction result (for test data)
Z = clf.predict(X_test)

# print the test score
# accuracy_score takes the true labels first and the predictions second
print("prediction accuracy (test score): %.3f " % accuracy_score(y_test, Z))

### Using all four dimensions of the Iris dataset (splitting the dataset 2/3 training and 1/3 testing)

In [None]:
# import data from IRIS dataset
iris = datasets.load_iris()
X = iris.data[:, :]  # we take all four features of the 4-D dataset 
y = iris.target

# Splitting 33 percent (approx. 1/3) data for test and remaining 67 (approx. 2/3) for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0) 


# create an instance of Gaussian Naive Bayes Classifier and fit the data.
# No priors are passed (GaussianNB(priors = None)), so class priors are adapted from the data.
clf = GaussianNB()
clf.fit(X_train, y_train)

# print the training scores
print("training score : %.3f" % (clf.score(X_train, y_train)))

# Get the prediction result (for test data)
Z = clf.predict(X_test)

# print the test score
# accuracy_score takes the true labels first and the predictions second
print("prediction accuracy (test score): %.3f " % accuracy_score(y_test, Z))

### Comparing the result to the k-NN classifier

In [None]:
n_neighbors = 10 # Number of neighbors for kNN

# Create an instance of neighbors class imported from sklearn
classifier = neighbors.KNeighborsClassifier(n_neighbors)

# Fit the training data to the model 
classifier.fit(X_train, y_train)


# Get the prediction result (for test data)
Z = classifier.predict(X_test)

# print the training scores
print("training score : %.3f" % (classifier.score(X_train, y_train)))

# print the test score
# accuracy_score takes the true labels first and the predictions second
print("prediction accuracy (test score): %.3f " % accuracy_score(y_test, Z))
