In [None]:
import numpy as np
import pandas as pd
import scipy
import matplotlib
import sklearn
import matplotlib.pyplot as plt

In [None]:
# Display tweak only (not needed for your code): print floats with two decimals
# and without scientific notation.
np.set_printoptions(suppress=True, precision=2)

### Dataset 1: Diabetes data (regression)

In [None]:
from sklearn.datasets import load_diabetes
# Dataset description:
# https://scikit-learn.org/stable/datasets/toy_dataset.html#diabetes-dataset

# Load the bundled diabetes dataset and list the fields of the returned container.
diabetes = load_diabetes()
diabetes.keys()

dict_keys(['data', 'target', 'frame', 'DESCR', 'feature_names', 'data_filename', 'target_filename', 'data_module'])

In [None]:
# Names of the feature columns.
diabetes['feature_names']

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [None]:
# Feature matrix; used below as the regression inputs (X).
diabetes.data

array([[ 0.04,  0.05,  0.06, ..., -0.  ,  0.02, -0.02],
       [-0.  , -0.04, -0.05, ..., -0.04, -0.07, -0.09],
       [ 0.09,  0.05,  0.04, ..., -0.  ,  0.  , -0.03],
       ...,
       [ 0.04,  0.05, -0.02, ..., -0.01, -0.05,  0.02],
       [-0.05, -0.04,  0.04, ...,  0.03,  0.04, -0.03],
       [-0.05, -0.04, -0.07, ..., -0.04, -0.  ,  0.  ]])

In [None]:
# Regression target; used below as the values to predict (y).
diabetes.target

array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
       128.,  52.,  37., 170., 170.,  61., 144.,  52., 128.,  71., 163.,
       150.,  97., 160., 178.,  48., 270., 202., 111.,  85.,  42., 170.,
       200., 252., 113., 143.,  51.,  52., 210.,  65., 141.,  55., 134.,
        42., 111.,  98., 164.,  48.,  96.,  90., 162., 150., 279.,  92.,
        83., 128., 102., 302., 198.,  95.,  53., 134., 144., 232.,  81.,
       104.,  59., 246., 297., 258., 229., 275., 281., 179., 200., 200.,
       173., 180.,  84., 121., 161.,  99., 109., 115., 268., 274., 158.,
       107.,  83., 103., 272.,  85., 280., 336., 281., 118., 317., 235.,
        60., 174., 259., 178., 128.,  96., 126., 28

In [None]:
# (number of samples, number of features)
diabetes.data.shape

(442, 10)

In [None]:
# Pull the two dimensions of the feature matrix into named counts and echo them.
n_samples = diabetes.data.shape[0]
n_features = diabetes.data.shape[1]
print((n_samples, n_features))

(442, 10)


In [None]:
# Peek at the first sample: its target value (swap the comments to view its features instead).
# diabetes.data[0]
diabetes.target[0]

151.0

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the samples for testing; the fixed seed keeps the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    diabetes.data,
    diabetes.target,
    test_size=0.4,
    random_state=5,
)
print(Xtrain.shape, Xtest.shape)

(265, 10) (177, 10)


In [None]:
from sklearn.linear_model import LinearRegression

# The diabetes features are mean-centred but the target is not (its values sit
# around 150), so the model needs an intercept term. Without one, predictions
# are forced to centre near zero and the error is dominated by that offset.
# Set fit_intercept=False to see how much the fit degrades.
reg = LinearRegression(fit_intercept=True)
reg.fit(Xtrain, ytrain)


LinearRegression(fit_intercept=False)

In [None]:
# Fitted weights of the linear model, one per input feature.
reg.coef_

array([  406.22,  -658.35,   537.99,   516.11,  1018.77,  -536.86,
       -1304.01,  -787.4 ,   -62.21,  -152.56])

In [None]:
# Predictions on the data the model was fitted on ...
y_pred_train = reg.predict(Xtrain)
# ... and on the held-out test data.
y_pred = reg.predict(Xtest)

In [None]:
# Scratch pad: alternative ways to compute error metrics from the residuals.
# Kept inside a string so nothing runs; the lines are not in execution order
# (diff is used before the line that defines it).
'''
np.matmul(diff, diff)/diff.shape[0]
y_pred.shape
diff = ytest - y_pred
np.mean(diff * diff)
diff_vec = diff.reshape([-1,1])
np.matmul(diff_vec.T, diff_vec)
diff_vec.T @ diff_vec
np.mean(abs(diff))
'''


### Matrix operations in numpy

* Always check the shape!
* Reshape
* Transpose

In [None]:
# Check that predictions and true targets have matching shapes before subtracting them.
print(y_pred.shape)
print(ytest.shape)

(177,)
(177,)


In [None]:
# Residuals on the test set: true value minus predicted value.
diff = ytest - y_pred
diff.shape

(177,)

In [None]:
# ytrain - y_pred  # would fail: ytrain has 265 entries, y_pred has 177 (shapes must match)

In [None]:
# Transposing a 1-D array changes nothing: the shape stays the same.
diff.T.shape

(177,)

In [None]:
# Turn the residuals into an explicit column vector of shape (n, 1).
# -1 lets numpy infer the number of rows (here the same as reshape([177, 1])).
diff = diff.reshape(-1, 1)

In [None]:
# Now that diff is 2-D, the transpose really swaps the axes: (n, 1) -> (1, n).
# print(diff.shape)
print(diff.T.shape)
# np.transpose(diff).shape   (equivalent spelling of diff.T)

(1, 177)

* Matrix multiplication
* Element-wise calculations

In [None]:
# (1, n) @ (n, 1) -> 1x1 matrix holding the sum of squared residuals.
# np.matmul(diff.T, diff) is the same operation spelled as a function call.
diff.T @ diff

# np.matmul(diff, diff) would fail: (n, 1) @ (n, 1) has mismatched inner dimensions.

array([[540355.03]])

In [None]:
# Root-mean-squared error: sqrt(sum of squared residuals / number of test samples).
sum_sq = diff.T @ diff
np.sqrt(sum_sq / diff.shape[0])

array([[55.25]])

In [None]:
# Relative error per test sample: |y_true - y_pred| / y_true.
# diff is a column vector (n, 1) at this point while ytest is 1-D (n,). Dividing
# them directly broadcasts to an (n, n) matrix, so flatten diff first to keep
# the division element-wise (one value per test sample).
np.abs(diff.ravel()) / ytest

array([[0.3 , 0.26, 0.48, ..., 0.38, 1.18, 1.1 ],
       [0.4 , 0.34, 0.64, ..., 0.5 , 1.57, 1.47],
       [0.18, 0.15, 0.29, ..., 0.23, 0.71, 0.67],
       ...,
       [0.2 , 0.17, 0.32, ..., 0.26, 0.8 , 0.75],
       [0.03, 0.03, 0.05, ..., 0.04, 0.12, 0.11],
       [0.  , 0.  , 0.  , ..., 0.  , 0.01, 0.01]])

In [None]:
#element-wise operations (multiplication, addition)
# diff * diff
# diff + diff

In [None]:
#transpose

#inverse (import the inverse function from numpy)
from numpy.linalg import inv

# A random 3x3 matrix, drawn from a seeded generator so the cell produces the
# same numbers on every run.
rng = np.random.default_rng(0)
example = rng.random((3, 3))
example_inv = inv(example)

# A matrix times its inverse is the identity (up to floating-point rounding).
example @ example_inv

array([[ 1., -0.,  0.],
       [-0.,  1., -0.],
       [ 0., -0.,  1.]])

### Dataset 2: Digits data (classification)

In [None]:
from sklearn.datasets import load_digits
# Load the bundled digits dataset and list the fields of the returned container.
digits = load_digits()
digits.keys()

In [None]:
from sklearn.model_selection import train_test_split

# 60/40 train/test split of the digits data with a fixed seed.
# Note: this re-binds Xtrain/Xtest/ytrain/ytest, replacing the diabetes split.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    digits.data,
    digits.target,
    test_size=0.4,
    random_state=5,
)
print(Xtrain.shape, Xtest.shape)

In [None]:
# The images themselves: print the shape of the image array, then the pixel
# values of the first image.
print(digits.images.shape)
print(digits.images[0])

In [None]:
# Render the first digit image; the trailing ';' hides the text repr of the returned object.
plt.imshow(digits.images[0]);

In [None]:
from sklearn.linear_model import LogisticRegression

# The raw pixel features are unscaled, and the default iteration budget
# (max_iter=100) is typically too small for the solver to converge on them,
# which raises a ConvergenceWarning. Give the solver more iterations.
myLR = LogisticRegression(max_iter=10_000)#penalty='l2'
myLR.fit(Xtrain, ytrain)

In [None]:
# Predicted class label for every held-out test sample.
ypred = myLR.predict(Xtest)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test samples whose predicted label equals the true label.
accuracy_score(ytest, ypred)

In [None]:
from sklearn import tree

# Fix random_state so the fitted tree (and the accuracies printed below) are the
# same on every run. scikit-learn permutes the features at each split, so ties
# between equally good splits are otherwise broken differently from run to run.
myDT = tree.DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=0)
myDT.fit(Xtrain, ytrain)

#check results on training data
ypred_train = myDT.predict(Xtrain)

#check results on test data
ypred = myDT.predict(Xtest)

# A large gap between the two numbers indicates overfitting.
print('training accuracy: ', accuracy_score(ytrain, ypred_train))
print('test accuracy: ', accuracy_score(ytest, ypred))

In [None]:
from sklearn.ensemble import RandomForestClassifier
#https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

# Some parameters we mentioned in class:
# - n_estimators
# - max_depth
# - criterion {"gini", "entropy", "log_loss"}
# - max_features {"sqrt", "log2", None} (or an int / float)
# Note: max_features="auto" was removed in scikit-learn 1.3; use "sqrt" instead.

# Variant with a depth limit:
# myRF = RandomForestClassifier(n_estimators=100, max_depth=10, max_features = 'sqrt', random_state=0)
myRF = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=0)

#Train the model
myRF.fit(Xtrain, ytrain)

#check results on training data
ypred_train = myRF.predict(Xtrain)

#check results on test data
ypred = myRF.predict(Xtest)

print('training accuracy: ', accuracy_score(ytrain, ypred_train))
print('test accuracy: ', accuracy_score(ytest, ypred))

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are true labels, columns are predicted labels.
# ypred is whatever was assigned last (the random-forest predictions when cells run top to bottom).
print(confusion_matrix(ytest, ypred))

In [None]:
from sklearn.neighbors import KNeighborsClassifier  # KNeighborsRegressor is the regression counterpart
# k-nearest-neighbours classifier using 50 neighbours per prediction.
neigh = KNeighborsClassifier(n_neighbors=50)
neigh.fit(Xtrain, ytrain)  # fit on the training split only

In [None]:
# k-NN predictions on the training split and on the held-out test split.
ypred_train = neigh.predict(Xtrain)
ypred = neigh.predict(Xtest)

In [None]:
# Accuracy on seen (training) vs. unseen (test) data for the current predictions.
print('training accuracy: ', accuracy_score(ytrain, ypred_train))
print('test accuracy: ', accuracy_score(ytest, ypred))

In [None]:
# Heat-map of the confusion matrix. Counts are log-scaled as log(count + 1) so
# that small off-diagonal error counts remain visible next to the large
# diagonal entries.
cm_counts = confusion_matrix(ytest, ypred)
log_counts = np.log(cm_counts + 1)
plt.imshow(log_counts, cmap='Blues', interpolation='nearest')
plt.grid(False)
plt.ylabel('true')
plt.xlabel('predicted');

### Visualization (not required)

In [None]:
# Show the first 100 test digits as a 10x10 grid of 8x8 images, without tick marks.
fig, axes = plt.subplots(10, 10, figsize=(8, 8))
fig.subplots_adjust(hspace=0.1, wspace=0.1)

for idx, ax in enumerate(axes.ravel()):
    pixels = Xtest[idx].reshape(8, 8)
    ax.imshow(pixels, cmap='binary')
    ax.set(xticks=[], yticks=[])

In [None]:
# Same 10x10 grid of test digits, annotated with the predicted label of each one.
fig, axes = plt.subplots(10, 10, figsize=(8, 8))
fig.subplots_adjust(hspace=0.1, wspace=0.1)

for idx, ax in enumerate(axes.ravel()):
    pixels = Xtest[idx].reshape(8, 8)
    ax.imshow(pixels, cmap='binary')
    # Predicted label in the lower-left corner: green when correct, red when wrong.
    is_correct = ytest[idx] == ypred[idx]
    label_color = 'green' if is_correct else 'red'
    ax.text(0.05, 0.05, str(ypred[idx]), transform=ax.transAxes, color=label_color)
    ax.set(xticks=[], yticks=[])