# Supervised Learning, Part I

### Import libraries

In [13]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

### Load the digits data

In [14]:
import pandas as pd
labeled_images = pd.read_csv('..\\data\\digits_train.csv')

In [15]:
#select images features from the second column to the last column, and only the first 2000 smaples(rows).
X = labeled_images.iloc[:2000,1:]
#select the first column which is the label, or the digit, and only the first 2000 smaples(rows).
y = labeled_images.iloc[:2000,:1].squeeze()# with .squeeze(), the pandas one column dataframe is turned into a series 
X_train,X_test,y_train,y_test=train_test_split(X, y, train_size=0.8, random_state=0)




### Split the data into train and test

In [16]:
X_train,X_test,y_train,y_test=train_test_split(X, y, train_size=0.8, random_state=0)



## K-nearest Neighbors

In [17]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()

In [18]:
knn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [19]:
prediction=knn.predict(X_test)

In [20]:
prediction.shape

(400,)

In [21]:
prediction[124]

9

In [22]:
pro=knn.predict_proba(X_test)

In [23]:
pro[124]

array([0. , 0. , 0. , 0. , 0.4, 0. , 0. , 0. , 0. , 0.6])

### Important parameters

+ n_neighbors, default=5.
+ p, default=2, which is Euclidean distance.

Other parameters refer to [documentaion.](http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

### Fit the training data with different numbers of neighbors from 1 to 15.

Let's also time the training for each number of neighbors.

In [24]:
import time

def knn_k(X_train,y_train,X_test,y_test,n):
    n_time=[]
    for k in range(1,n,2):
        knn=KNeighborsClassifier(n_neighbors=k)
        s=time.time()
        knn.fit(X_train,y_train)
        e=time.time()
        y_pred=knn.predict(X_test)
        f=time.time()
        n_time.append((k,e-s,f-e,knn.score(X_test,y_test),knn.score(X_train,y_train)))
    n_time=pd.DataFrame(n_time,columns=['k','training time','predicting time','test score','training score'])    
    return n_time

n_time=knn_k(X_train,y_train,X_test,y_test,15)

In [25]:
n_time

Unnamed: 0,k,training time,predicting time,test score,training score
0,1,0.046617,0.747645,0.87,1.0
1,3,0.038408,0.780364,0.8775,0.94125
2,5,0.043955,0.887833,0.87,0.93
3,7,0.035016,0.717921,0.8575,0.91875
4,9,0.042885,0.757618,0.87,0.9075
5,11,0.033909,0.776163,0.8675,0.903125
6,13,0.032912,0.702652,0.8525,0.9


### Plot the results

In [26]:
fig,axes = plt.subplots((2,3),figsize=(8,6))

axes.plot(n_time['k'], n_time['training score'], 'r',label='Train')
axes.set_xlabel('Number of neighbors')
axes.set_ylabel('Accuracy')
axes.plot(n_time['k'], n_time['test score'], 'b',label='Test')
axes.legend()
axes.set_title('k number of neighbors')

<IPython.core.display.Javascript object>

TypeError: 'tuple' object cannot be interpreted as an integer

### Let's scale the data

In [28]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler,MaxAbsScaler

scaler_names=['MinMax Scaler','Standard Scaler','MaxAbs Scaler']

scalers=[MinMaxScaler(),StandardScaler(),MaxAbsScaler()]

In [29]:
scaler_t=pd.DataFrame()
for sn,sc in zip(scaler_names,scalers):
    sc.fit(X_train)#fit your scaler on training dataset
    X_train_scaled=sc.transform(X_train)#transform training data with the sclaer
    X_test_scaled=sc.transform(X_test)#transform the test data with the scaler
    n_time=knn_k(X_train_scaled,y_train,X_test_scaled,y_test,15)
    n_time['scaler']=sn
    scaler_t=scaler_t.append(n_time)  

In [30]:
scaler_t

Unnamed: 0,k,training time,predicting time,test score,training score,scaler
0,1,0.054461,0.778566,0.87,1.0,MinMax Scaler
1,3,0.042505,0.71095,0.8775,0.941875,MinMax Scaler
2,5,0.04398,0.796522,0.87,0.93,MinMax Scaler
3,7,0.039894,0.813722,0.8575,0.919375,MinMax Scaler
4,9,0.047413,0.758176,0.87,0.9075,MinMax Scaler
5,11,0.035917,0.772645,0.8675,0.903125,MinMax Scaler
6,13,0.041081,0.780329,0.8525,0.89875,MinMax Scaler
0,1,0.044887,0.80694,0.84,1.0,Standard Scaler
1,3,0.046876,0.786296,0.8575,0.91875,Standard Scaler
2,5,0.039926,0.863551,0.8575,0.903125,Standard Scaler


In [31]:
scaler_t['test score'][scaler_t['scaler']=='MinMax Scaler']

0    0.8700
1    0.8775
2    0.8700
3    0.8575
4    0.8700
5    0.8675
6    0.8525
Name: test score, dtype: float64

In [32]:
fig,axes = plt.subplots(figsize=(8,6))

axes.plot(scaler_t.k[scaler_t['scaler']=='MinMax Scaler'], scaler_t['training score'][scaler_t['scaler']=='MinMax Scaler'], 'r',label='MinMax Train')
axes.plot(scaler_t.k[scaler_t['scaler']=='MinMax Scaler'], scaler_t['test score'][scaler_t['scaler']=='MinMax Scaler'], 'b',label='MinMax Test')

axes.plot(scaler_t.k[scaler_t['scaler']=='Standard Scaler'], scaler_t['training score'][scaler_t['scaler']=='Standard Scaler'], 'y',label='Standard Train')
axes.plot(scaler_t.k[scaler_t['scaler']=='Standard Scaler'], scaler_t['test score'][scaler_t['scaler']=='Standard Scaler'], 'k',label='Standard Test')

axes.plot(scaler_t.k[scaler_t['scaler']=='MaxAbs Scaler'], scaler_t['training score'][scaler_t['scaler']=='MaxAbs Scaler'], 'g-.',label='MaxAbs Train')
axes.plot(scaler_t.k[scaler_t['scaler']=='MaxAbs Scaler'], scaler_t['test score'][scaler_t['scaler']=='MaxAbs Scaler'], 'm',label='MaxAbs Test')

axes.set_xlabel('Number of neighbors')
axes.set_ylabel('Accuracy')
axes.legend()
axes.set_title('k number of neighbors')

<IPython.core.display.Javascript object>

Text(0.5,1,'k number of neighbors')

### Let's use PCA to transform the data, according to the demo of last week, we use only 30,50,70 components here.

In [33]:
from sklearn.decomposition import PCA

pca_t=pd.DataFrame()

for c in [30,50,70]:
    pca=PCA(n_components=c, whiten=True)
    pca.fit(X_train)
    X_train_pca=pca.transform(X_train) #pca transform on training data
    X_test_pca=pca.transform(X_test) #you have to do the same transformation on test data
    n_time=knn_k(X_train_pca,y_train,X_test_pca,y_test,15)
    n_time['n']=c
    pca_t=pca_t.append(n_time)  


In [34]:
pca_t

Unnamed: 0,k,training time,predicting time,test score,training score,n
0,1,0.001995,0.030921,0.8575,1.0,30
1,3,0.001994,0.035904,0.865,0.94375,30
2,5,0.001994,0.028955,0.86,0.93625,30
3,7,0.001995,0.028994,0.855,0.9175,30
4,9,0.001994,0.028988,0.86,0.91,30
5,11,0.001963,0.028923,0.8575,0.904375,30
6,13,0.001994,0.02992,0.86,0.894375,30
0,1,0.003086,0.057889,0.845,1.0,50
1,3,0.003017,0.056873,0.8475,0.92875,50
2,5,0.003022,0.042891,0.8375,0.910625,50


In [35]:
fig,axes = plt.subplots(figsize=(8,6))

axes.plot(pca_t.k[pca_t['n']==30], pca_t['training score'][pca_t['n']==30], 'r',label='30 Train')
axes.plot(pca_t.k[pca_t['n']==30], pca_t['test score'][pca_t['n']==30], 'b',label='30 Test')

axes.plot(pca_t.k[pca_t['n']==50], pca_t['training score'][pca_t['n']==50], 'y',label='50 Train')
axes.plot(pca_t.k[pca_t['n']==50], pca_t['test score'][pca_t['n']==50], 'k',label='50 Test')

axes.plot(pca_t.k[pca_t['n']==70], pca_t['training score'][pca_t['n']==70], 'g',label='70 Train')
axes.plot(pca_t.k[pca_t['n']==70], pca_t['test score'][pca_t['n']==70], 'm',label='70 Test')

axes.set_xlabel('Number of neighbors')
axes.set_ylabel('Accuracy')
axes.legend()
axes.set_title('k number of neighbors')

<IPython.core.display.Javascript object>

Text(0.5,1,'k number of neighbors')

## Linear Models

### Load Boston housing prices data

In [36]:
from sklearn.datasets import load_boston

boston=load_boston ()
(X_boston,y_boston) = load_boston (return_X_y = True)

In [37]:
print(type(boston))

<class 'sklearn.utils.Bunch'>


In [38]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR'])

In [39]:
print(boston['DESCR'])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [40]:
boston['feature_names']

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [41]:
X=pd.DataFrame(boston['data'],columns=boston['feature_names'])

In [42]:
X.shape

(506, 13)

In [43]:
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [44]:
y=pd.DataFrame(boston['target'],columns=['price'])

In [45]:
y.shape

(506, 1)

In [46]:
y.head()

Unnamed: 0,price
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


### Split into train, test

In [47]:
X_train,X_test,y_train,y_test=train_test_split(X, y, train_size=0.8, random_state=0)



### Fit linear regression model

In [48]:
clf=LinearRegression()
clf.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

### Check the R square of the regression model on training data

In [49]:
clf.score(X_train,y_train)

0.7729718726571158

### Check the R square of the regression model on test data

In [50]:
clf.score(X_test,y_test)

0.5892011519186434

### Display the intercept and coefficients 

In [51]:
print('The intercept is {0} \nand coefficient vector is \n{1}'.format(clf.intercept_,clf.coef_.T))

The intercept is [38.13869271] 
and coefficient vector is 
[[-1.18410318e-01]
 [ 4.47550643e-02]
 [ 5.85674689e-03]
 [ 2.34230117e+00]
 [-1.61634024e+01]
 [ 3.70135143e+00]
 [-3.04553661e-03]
 [-1.38664542e+00]
 [ 2.43784171e-01]
 [-1.09856157e-02]
 [-1.04699133e+00]
 [ 8.22014729e-03]
 [-4.93642452e-01]]


### Ridge regression

In [52]:
from sklearn.linear_model import Ridge

X_train,X_test,y_train,y_test=train_test_split(X, y, train_size=0.8, random_state=0)

linridge = Ridge(alpha=20.0).fit(X_train, y_train)

print('Ridge regression R2 score on training data is: {}'.format(linridge.score(X_train, y_train)))
print('Ridge regression R2 score on test data is: {}'.format(linridge.score(X_test,y_test)))

Ridge regression R2 score on training data is: 0.7633740736696735
Ridge regression R2 score on test data is: 0.5592236539275828




In [53]:
linridge

Ridge(alpha=20.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

#### Ridge regression with feature normalization

In [54]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_train,X_test,y_train,y_test=train_test_split(X, y, train_size=0.8, random_state=0)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

print('ridge regression linear model intercept: {}'
     .format(linridge.intercept_))
print('ridge regression linear model coeff:\n{}'
     .format(linridge.coef_))
print('R-squared score (training): {:.3f}'
     .format(linridge.score(X_train_scaled, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linridge.score(X_test_scaled, y_test)))
print('Number of non-zero features: {}'
     .format(np.sum(linridge.coef_ != 0)))

ridge regression linear model intercept: [27.32021373]
ridge regression linear model coeff:
[[-1.47359152  2.39298692 -2.69320816  2.65616716 -1.77874723  7.33082289
  -1.36860107 -2.27028828 -0.43326401 -2.73434557 -6.43501373  2.49269948
  -8.91459746]]
R-squared score (training): 0.633
R-squared score (test): 0.371
Number of non-zero features: 13




#### Ridge regression with regularization parameter: alpha

In [55]:
print('Ridge regression: effect of alpha regularization parameter\n')
X_train,X_test,y_train,y_test=train_test_split(X, y, train_size=0.8, random_state=0)

for this_alpha in [0, 1, 10, 20, 50, 100, 1000]:
    linridge = Ridge(alpha = this_alpha).fit(X_train, y_train)
    r2_train = linridge.score(X_train, y_train)
    r2_test = linridge.score(X_test, y_test)
    num_coeff_bigger = np.sum(abs(linridge.coef_) > 1.0)
    print('Alpha = {:.2f}\nnum abs(coeff) > 1.0: {}, \
r-squared training: {:.2f}, r-squared test: {:.2f}\n'
         .format(this_alpha, num_coeff_bigger, r2_train, r2_test))

Ridge regression: effect of alpha regularization parameter

Alpha = 0.00
num abs(coeff) > 1.0: 5, r-squared training: 0.77, r-squared test: 0.59

Alpha = 1.00
num abs(coeff) > 1.0: 4, r-squared training: 0.77, r-squared test: 0.58

Alpha = 10.00
num abs(coeff) > 1.0: 4, r-squared training: 0.77, r-squared test: 0.56

Alpha = 20.00
num abs(coeff) > 1.0: 3, r-squared training: 0.76, r-squared test: 0.56

Alpha = 50.00
num abs(coeff) > 1.0: 2, r-squared training: 0.76, r-squared test: 0.55

Alpha = 100.00
num abs(coeff) > 1.0: 2, r-squared training: 0.75, r-squared test: 0.54

Alpha = 1000.00
num abs(coeff) > 1.0: 0, r-squared training: 0.70, r-squared test: 0.48





### Lasso regression

In [56]:
from sklearn.linear_model import Lasso

X_train,X_test,y_train,y_test=train_test_split(X, y, train_size=0.8, random_state=0)

linlasso = Lasso(alpha=2.0, max_iter = 10000).fit(X_train, y_train)

print('lasso regression linear model intercept: {}'
     .format(linlasso.intercept_))
print('lasso regression linear model coeff:\n{}'
     .format(linlasso.coef_))
print('Non-zero features: {}'
     .format(np.sum(linlasso.coef_ != 0)))
print('R-squared score (training): {:.3f}'
     .format(linlasso.score(X_train, y_train)))
print('R-squared score (test): {:.3f}\n'
     .format(linlasso.score(X_test, y_test)))


lasso regression linear model intercept: [43.77379586]
lasso regression linear model coeff:
[-0.00871063  0.04023899 -0.          0.         -0.          0.
  0.03936009 -0.0356299   0.09816085 -0.01012858 -0.70674762  0.00528996
 -0.77775911]
Non-zero features: 9
R-squared score (training): 0.668
R-squared score (test): 0.424





#### Lasso regression with regularization parameter: alpha

In [57]:
print('Lasso regression: effect of alpha regularization\n\
parameter on number of features kept in final model\n')

X_train,X_test,y_train,y_test=train_test_split(X, y, train_size=0.8, random_state=0)
for alpha in [0.1,0.5, 1, 2, 3, 5, 10, 20, 50]:
    linlasso = Lasso(alpha, max_iter = 10000).fit(X_train, y_train)
    r2_train = linlasso.score(X_train, y_train)
    r2_test = linlasso.score(X_test, y_test)
    
    print('Alpha = {:.2f}\nFeatures kept: {}, r-squared training: {:.2f}, \
r-squared test: {:.2f}\n'
         .format(alpha, np.sum(linlasso.coef_ != 0), r2_train, r2_test))



Lasso regression: effect of alpha regularization
parameter on number of features kept in final model

Alpha = 0.10
Features kept: 12, r-squared training: 0.76, r-squared test: 0.56

Alpha = 0.50
Features kept: 10, r-squared training: 0.75, r-squared test: 0.54

Alpha = 1.00
Features kept: 10, r-squared training: 0.72, r-squared test: 0.49

Alpha = 2.00
Features kept: 9, r-squared training: 0.67, r-squared test: 0.42

Alpha = 3.00
Features kept: 7, r-squared training: 0.64, r-squared test: 0.40

Alpha = 5.00
Features kept: 5, r-squared training: 0.60, r-squared test: 0.38

Alpha = 10.00
Features kept: 4, r-squared training: 0.56, r-squared test: 0.33

Alpha = 20.00
Features kept: 4, r-squared training: 0.47, r-squared test: 0.22

Alpha = 50.00
Features kept: 3, r-squared training: 0.29, r-squared test: 0.07



### Polynomial regression

In [58]:
from sklearn.preprocessing import PolynomialFeatures

X_train,X_test,y_train,y_test=train_test_split(X, y, train_size=0.8, random_state=0)

linreg = LinearRegression().fit(X_train, y_train)

print('linear model coeff (w): {}'
     .format(linreg.coef_))
print('linear model intercept (b): {}'
     .format(linreg.intercept_))
print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))

print('\nNow we transform the original input data to add\n\
polynomial features up to degree 2 (quadratic)\n')
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y,
                                                   random_state = 0)
linreg = LinearRegression().fit(X_train, y_train)

print('(poly deg 2) linear model coeff (w):\n{}'
     .format(linreg.coef_))
print('(poly deg 2) linear model intercept (b): {}'
     .format(linreg.intercept_))
print('(poly deg 2) R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('(poly deg 2) R-squared score (test): {:.3f}\n'
     .format(linreg.score(X_test, y_test)))

print('\nAddition of many polynomial features often leads to\n\
overfitting, so we often use polynomial features in combination\n\
with regression that has a regularization penalty, like ridge\n\
regression.\n')

X_train, X_test, y_train, y_test = train_test_split(X_poly, y,
                                                   random_state = 0)
linreg = Ridge().fit(X_train, y_train)

print('(poly deg 2 + ridge) linear model coeff (w):\n{}'
     .format(linreg.coef_))
print('(poly deg 2 + ridge) linear model intercept (b): {}'
     .format(linreg.intercept_))
print('(poly deg 2 + ridge) R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('(poly deg 2 + ridge) R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))

linear model coeff (w): [[-1.18410318e-01  4.47550643e-02  5.85674689e-03  2.34230117e+00
  -1.61634024e+01  3.70135143e+00 -3.04553661e-03 -1.38664542e+00
   2.43784171e-01 -1.09856157e-02 -1.04699133e+00  8.22014729e-03
  -4.93642452e-01]]
linear model intercept (b): [38.13869271]
R-squared score (training): 0.773
R-squared score (test): 0.589

Now we transform the original input data to add
polynomial features up to degree 2 (quadratic)

(poly deg 2) linear model coeff (w):
[[ 1.12562888e+09  7.52844972e-02  2.45927229e-02 -6.12658718e+00
   1.69107850e+01 -7.28978825e+00  5.08245028e+00  1.23555470e+00
  -1.04170967e+01  1.03245429e+00  4.59146094e-02  2.77483395e+00
  -2.08089566e-01 -1.52886411e+00  3.37356875e-04  4.36531908e-01
   5.86146980e-01  1.34224036e+00 -5.91231199e-01  1.10111407e-01
  -9.52395994e-04 -7.79101789e-02  6.01941620e-01 -4.84205081e-02
   3.29061506e-01 -2.57532842e-05  1.25390338e-02 -6.61130736e-04
  -1.63177918e-02  2.28088461e-02 -1.69959887e+00 -3.786



## Linear models for classification

### Logistic regression

#### Logistic regression for binary classification on fruits dataset using height, width features (positive class: apple, negative class: others)

In [59]:
from sklearn.linear_model import LogisticRegression

labeled_images = pd.read_csv('..\\data\\digits_train.csv')

#select images features from the second column to the last column, and only the first 2000 smaples(rows).
X = labeled_images.iloc[:4000,1:]
#select the first column which is the label, or the digit, and only the first 2000 smaples(rows).
y = labeled_images.iloc[:4000,:1].squeeze()# with .squeeze(), the pandas one column dataframe is turned into a series 
X_train,X_test,y_train,y_test=train_test_split(X, y, train_size=0.8, random_state=0)

clf = LogisticRegression(C=100).fit(X_train, y_train)
print('Logistic regression R-squared score (training): {:.3f}'
     .format(clf.score(X_train, y_train)))
print('Logistic regression R-squared score (test): {:.3f}'
     .format(clf.score(X_test, y_test)))



Logistic regression R-squared score (training): 1.000
Logistic regression R-squared score (test): 0.804


In [60]:
clf

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#### Logistic regression regularization: C parameter

In [61]:
for this_C in [0.01,0.1, 1,10,50, 100]:
    clf = LogisticRegression(C=this_C).fit(X_train, y_train)
    print('Logistic regression C={0} R-squared score (training): {1:.3f}'
     .format(this_C,clf.score(X_train, y_train)))
    print('Logistic regression C={0} R-squared score (test): {1:.3f}'
     .format(this_C, clf.score(X_test, y_test)))

Logistic regression C=0.01 R-squared score (training): 1.000
Logistic regression C=0.01 R-squared score (test): 0.831
Logistic regression C=0.1 R-squared score (training): 1.000
Logistic regression C=0.1 R-squared score (test): 0.820
Logistic regression C=1 R-squared score (training): 1.000
Logistic regression C=1 R-squared score (test): 0.809
Logistic regression C=10 R-squared score (training): 1.000
Logistic regression C=10 R-squared score (test): 0.802
Logistic regression C=50 R-squared score (training): 1.000
Logistic regression C=50 R-squared score (test): 0.804
Logistic regression C=100 R-squared score (training): 1.000
Logistic regression C=100 R-squared score (test): 0.804


### Let's Try PCA n=[30,50,70]

In [62]:
pca_t=pd.DataFrame()

for n_c in [30,50,70]:
    pca=PCA(n_components=n_c, whiten=True)
    pca.fit(X_train)
    X_train_pca=pca.transform(X_train) #pca transform on training data
    X_test_pca=pca.transform(X_test) #you have to do the same transformation on test data
    print('{} number of PCA componets..................................................'.format(n_c))
    for this_C in [0.01,0.1, 1,10,50, 100]:
        clf = LogisticRegression(C=this_C).fit(X_train_pca, y_train)
        print('Logistic regression C={0} R-squared score (training): {1:.3f}'
         .format(this_C,clf.score(X_train_pca, y_train)))
        print('Logistic regression C={0} R-squared score (test): {1:.3f}'
         .format(this_C, clf.score(X_test_pca, y_test)))

30 number of PCA componets..................................................
Logistic regression C=0.01 R-squared score (training): 0.846
Logistic regression C=0.01 R-squared score (test): 0.856
Logistic regression C=0.1 R-squared score (training): 0.868
Logistic regression C=0.1 R-squared score (test): 0.869
Logistic regression C=1 R-squared score (training): 0.881
Logistic regression C=1 R-squared score (test): 0.879
Logistic regression C=10 R-squared score (training): 0.888
Logistic regression C=10 R-squared score (test): 0.884
Logistic regression C=50 R-squared score (training): 0.888
Logistic regression C=50 R-squared score (test): 0.882
Logistic regression C=100 R-squared score (training): 0.888
Logistic regression C=100 R-squared score (test): 0.882
50 number of PCA componets..................................................
Logistic regression C=0.01 R-squared score (training): 0.866
Logistic regression C=0.01 R-squared score (test): 0.869
Logistic regression C=0.1 R-squared sc

### Support Vector Machines
#### Linear Support Vector Machine

In [63]:
from sklearn.svm import SVC

clf = SVC(kernel = 'linear').fit(X_train, y_train)

print('Training score: {:.3f}'.format(clf.score(X_train, y_train)))
print('Test score: {:.3f}'.format(clf.score(X_test, y_test)))

Training score: 1.000
Test score: 0.909


#### Linear Support Vector Machine

In [64]:
from sklearn.svm import LinearSVC

clf = LinearSVC().fit(X_train, y_train)
print('Training score: {:.3f}'.format(clf.score(X_train, y_train)))
print('Test score: {:.3f}'.format(clf.score(X_test, y_test)))

Training score: 0.998
Test score: 0.812


#### Kernelized SVC

Default SVC is with kernel='rbf'

In [65]:
from sklearn.svm import SVC

clf = SVC().fit(X_train, y_train)

print('Training score: {:.3f}'.format(clf.score(X_train, y_train)))
print('Test score: {:.3f}'.format(clf.score(X_test, y_test)))

Training score: 1.000
Test score: 0.104


Use kernel='poly', degree=2

In [66]:
from sklearn.svm import SVC

clf = SVC(kernel='poly',degree=2).fit(X_train, y_train)

print('Training score: {:.3f}'.format(clf.score(X_train, y_train)))
print('Test score: {:.3f}'.format(clf.score(X_test, y_test)))

Training score: 1.000
Test score: 0.929


#### Let's try scalers

In [67]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [68]:
clf = SVC(kernel = 'linear').fit(X_train_scaled, y_train)

print('Linear SVC Training score: {:.3f}'.format(clf.score(X_train_scaled, y_train)))
print('Linear SVC Test score: {:.3f}'.format(clf.score(X_test_scaled, y_test)))

clf = SVC().fit(X_train_scaled, y_train)
print('RBF SVC Training score: {:.3f}'.format(clf.score(X_train_scaled, y_train)))
print('RBF SVC Test score: {:.3f}'.format(clf.score(X_test_scaled, y_test)))

clf = SVC(kernel='poly',degree=2).fit(X_train_scaled, y_train)
print('Poly SVC Training score: {:.3f}'.format(clf.score(X_train_scaled, y_train)))
print('Poly SVC Test score: {:.3f}'.format(clf.score(X_test_scaled, y_test)))


Linear SVC Training score: 1.000
Linear SVC Test score: 0.914
RBF SVC Training score: 0.980
RBF SVC Test score: 0.907
Poly SVC Training score: 0.961
Poly SVC Test score: 0.895


#### Let's try PCA

In [69]:
from sklearn.decomposition import PCA
pca=PCA(n_components=35, whiten=True)
pca.fit(X_train)
X_train_pca=pca.transform(X_train) #pca transform on training data
X_test_pca=pca.transform(X_test) #you have to do the same transformation on test data

In [70]:
clf = SVC(kernel = 'linear').fit(X_train_pca, y_train)

print('Linear SVC Training score: {:.3f}'.format(clf.score(X_train_pca, y_train)))
print('Linear SVC Test score: {:.3f}'.format(clf.score(X_test_pca, y_test)))

clf = SVC().fit(X_train_pca, y_train)
print('RBF SVC Training score: {:.3f}'.format(clf.score(X_train_pca, y_train)))
print('RBF SVC Test score: {:.3f}'.format(clf.score(X_test_pca, y_test)))

clf = SVC(kernel='poly',degree=2).fit(X_train_pca, y_train)
print('Poly SVC Training score: {:.3f}'.format(clf.score(X_train_pca, y_train)))
print('Poly SVC Test score: {:.3f}'.format(clf.score(X_test_pca, y_test)))

Linear SVC Training score: 0.956
Linear SVC Test score: 0.906
RBF SVC Training score: 0.990
RBF SVC Test score: 0.941
Poly SVC Training score: 0.991
Poly SVC Test score: 0.939


#### Support Vector Machine with RBF kernel: using both C and gamma parameter 

In [71]:
for this_gamma in [0.01,0.02,0.03,0.1,0.2, 1, 5]:
    
    for this_C in [0.01,0.1, 1,5,10, 15]:
        clf = SVC(kernel = 'rbf', gamma = this_gamma,
                 C = this_C).fit(X_train_pca, y_train)
        print('gamma is {0}, and c is {1}'.format(this_gamma,this_C))
        print('Training score: {:.3f}'.format(clf.score(X_train_pca, y_train)))
        print('Test score: {:.3f}'.format(clf.score(X_test_pca, y_test)))
        print('\n')

gamma is 0.01, and c is 0.01
Training score: 0.108
Test score: 0.104


gamma is 0.01, and c is 0.1
Training score: 0.888
Test score: 0.885


gamma is 0.01, and c is 1
Training score: 0.956
Test score: 0.926


gamma is 0.01, and c is 5
Training score: 0.990
Test score: 0.935


gamma is 0.01, and c is 10
Training score: 0.997
Test score: 0.927


gamma is 0.01, and c is 15
Training score: 0.998
Test score: 0.929


gamma is 0.02, and c is 0.01
Training score: 0.188
Test score: 0.201


gamma is 0.02, and c is 0.1
Training score: 0.920
Test score: 0.891


gamma is 0.02, and c is 1
Training score: 0.982
Test score: 0.940


gamma is 0.02, and c is 5
Training score: 0.998
Test score: 0.936


gamma is 0.02, and c is 10
Training score: 1.000
Test score: 0.940


gamma is 0.02, and c is 15
Training score: 1.000
Test score: 0.943


gamma is 0.03, and c is 0.01
Training score: 0.188
Test score: 0.201


gamma is 0.03, and c is 0.1
Training score: 0.928
Test score: 0.904


gamma is 0.03, and c is 1
Tra