## Solving some classification problems

#### First, let's load some data!

In [3]:
import numpy as np
from sklearn.datasets import load_iris

In [5]:
# Load the Iris dataset and build a shuffled train/test split.
data = load_iris()
X, y = data['data'], data['target']

# Shuffle with a locally seeded generator so that a fresh "Restart & Run All"
# always produces the same permutation; the unseeded np.random.permutation
# gave a different split (and different scores) on every run.
rng = np.random.RandomState(0)
idx = rng.permutation(len(y))
X, y = X[idx], y[idx]

# The first 100 shuffled samples are used for training, the rest for testing.
X_train, y_train = X[:100], y[:100]
X_test, y_test = X[100:], y[100:]

In [6]:
# Sanity check: shapes of the train features/labels, then the test features/labels.
print(*(arr.shape for arr in (X_train, y_train, X_test, y_test)))

(100, 4) (100,) (50, 4) (50,)


### Classification using the k-NN algorithm

In [7]:
from sklearn.neighbors import KNeighborsClassifier

In [21]:
# k-NN classifier that votes among the 3 nearest training samples,
# with closeness measured by Euclidean distance.
model = KNeighborsClassifier(
    metric='euclidean',
    n_neighbors=3,
)

In [22]:
# Fit on the training split; as the last expression of the cell, the value
# returned by fit() is displayed as the cell output.
model.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [23]:
# Accuracy on the same samples the model was fitted on.
knn_train_accuracy = model.score(X_train, y_train)
print("Train acc =", knn_train_accuracy)

Train acc = 0.99


In [24]:
# Accuracy on the held-out samples that were not used for fitting.
knn_test_accuracy = model.score(X_test, y_test)
print("Test acc =", knn_test_accuracy)

Test acc = 0.94


### Does k impact the accuracy?

In [16]:
# Refit the k-NN classifier for several neighbourhood sizes and report the
# train/test accuracy of each one as a percentage.
neighbour_counts = [1, 2, 3, 4, 5, 10, 20]
for k in neighbour_counts:
    model = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
    model.fit(X_train, y_train)
    train_pct = 100*model.score(X_train, y_train)
    test_pct = 100*model.score(X_test, y_test)
    print("k = %d, train_acc = %3.2f, test_acc = %3.2f"%(k, train_pct, test_pct))
    

k = 1, train_acc = 100.00, test_acc = 92.00
k = 2, train_acc = 100.00, test_acc = 92.00
k = 3, train_acc = 99.00, test_acc = 94.00
k = 4, train_acc = 100.00, test_acc = 96.00
k = 5, train_acc = 100.00, test_acc = 94.00
k = 10, train_acc = 98.00, test_acc = 94.00
k = 20, train_acc = 98.00, test_acc = 88.00


### Let's try another classifier: nearest centroid

In [17]:
from sklearn.neighbors import NearestCentroid


In [26]:
# Nearest-centroid classifier with Euclidean distance: fit on the training
# split, score both splits, then report the two accuracies.
model = NearestCentroid(metric='euclidean')
model.fit(X_train, y_train)
centroid_train_accuracy = model.score(X_train, y_train)
centroid_test_accuracy = model.score(X_test, y_test)
print("Train acc =", centroid_train_accuracy)
print("Test acc =", centroid_test_accuracy)


Train acc = 0.95
Test acc = 0.88


### Or another one: a linear support vector machine


In [37]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel and penalty parameter C=5:
# fit on the training split, score both splits, then report the accuracies.
model = SVC(kernel='linear', C=5)
model.fit(X_train, y_train)
svc_train_accuracy = model.score(X_train, y_train)
svc_test_accuracy = model.score(X_test, y_test)
print("Train acc =", svc_train_accuracy)
print("Test acc =", svc_test_accuracy)


Train acc = 0.99
Test acc = 0.98


### Or another one (actually a regression model whose rounded output is used as the class label; this is not a recommended way to classify)

In [38]:
from sklearn.linear_model import LinearRegression

In [73]:
# Fit least squares without an intercept term directly on the integer class
# labels, then turn the continuous outputs into class predictions.
model = LinearRegression(fit_intercept=False)
model.fit(X_train, y_train)

# Round each output to an integer and clip it into the range of labels seen in
# training. Without the clip, an output such as 2.6 or -0.7 would become 3 or
# -1, which is not an existing class. (Like the rounding itself, this assumes
# the class labels are consecutive integers.)
min_label, max_label = y_train.min(), y_train.max()
train_predictions = np.clip(np.round(model.predict(X_train)), min_label, max_label).astype(np.int32)
test_predictions = np.clip(np.round(model.predict(X_test)), min_label, max_label).astype(np.int32)


In [53]:
# Accuracy computed by hand: the fraction of predictions equal to the true label.
train_acc = np.mean(train_predictions == y_train)
test_acc = np.mean(test_predictions == y_test)

In [54]:
# Show the hand-computed accuracy on the training split, then on the test split.
print(train_acc, test_acc)

0.99 0.92


### Calculating other metrics?

In [55]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [58]:
# Accuracy, macro-averaged precision and macro-averaged recall on the training split.
train_metrics = (
    accuracy_score(y_train, train_predictions),
    precision_score(y_train, train_predictions, average='macro'),
    recall_score(y_train, train_predictions, average='macro'),
)
print("Train set:", *train_metrics)

Train set: 0.99 0.9901960784313726 0.9895833333333334


In [59]:
# Accuracy, macro-averaged precision and macro-averaged recall on the test split.
# The label reads "Test set:" because every score here is computed from
# y_test / test_predictions; it was previously mislabelled "Train set:".
print("Test set:", accuracy_score(y_test, test_predictions), precision_score(y_test, test_predictions, average='macro'), recall_score(y_test, test_predictions, average='macro'))

Train set: 0.92 0.9277777777777777 0.9226579520697168


In [74]:
# Macro-averaged F1 score for the training split, then for the test split.
train_f1 = f1_score(y_train, train_predictions, average='macro')
test_f1 = f1_score(y_test, test_predictions, average='macro')
print(train_f1, test_f1)

0.9897338703308852 0.9232456140350878


### Can we do such things from scratch?

### Manually fit a regression model

In [60]:
# Rebind X to the training feature matrix (an alias, not a copy). From here on
# X no longer refers to the full shuffled dataset loaded at the top.
X = X_train

In [62]:
# Least-squares weights from the normal equations (X^T X) W = X^T y.
# The linear system is solved directly rather than forming the explicit inverse
# (X^T X)^{-1} and multiplying by it, which is numerically less stable.
W = np.linalg.solve(np.dot(X.T, X), np.dot(X.T, y_train))

In [63]:
# Show the manually computed weight vector.
print("Regression parameters: ", W)

Regression parameters:  [ 2.49849143e-04 -1.08081755e-01  1.05683762e-01  7.68223982e-01]


In [65]:
# Coefficients of the fitted `model`, for comparison with W. This assumes
# `model` is still the LinearRegression fitted earlier (cells run top to bottom).
print(model.coef_)

[ 2.49849143e-04 -1.08081755e-01  1.05683762e-01  7.68223982e-01]


### Confirm that we indeed get the same results!

In [66]:
# Raw (unrounded) model outputs on the training split. This rebinds
# train_predictions, which earlier held the rounded integer class predictions.
train_predictions = model.predict(X_train)

In [67]:
# Predictions from the manually computed weights: the matrix-vector product
# of the training features with W.
train_predictions_W = X_train.dot(W)

In [69]:
# First 10 training predictions produced by model.predict.
print(train_predictions[:10])

[ 1.98043733e+00  1.15259601e+00  1.10723894e+00 -6.41470600e-02
 -2.57204248e-04 -5.40582825e-02 -5.37285933e-02 -2.17336956e-02
  9.86523016e-02  1.22811458e+00]


In [70]:
# First 10 training predictions produced by the manual weights W.
print(train_predictions_W[:10])

[ 1.98043733e+00  1.15259601e+00  1.10723894e+00 -6.41470600e-02
 -2.57204248e-04 -5.40582825e-02 -5.37285933e-02 -2.17336956e-02
  9.86523016e-02  1.22811458e+00]
