This lab on Linear Discriminant Analysis, Quadratic Discriminant Analysis, and K-Nearest Neighbors is a Python adaptation of p. 161-163 of "Introduction to Statistical Learning with Applications in R" by Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani. Adapted by R. Jordan Crouser at Smith College for SDS293: Machine Learning (Spring 2016).

In [41]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn import neighbors

%matplotlib inline

# 4.6.3 Linear Discriminant Analysis

Let's return to the ${\tt Smarket}$ data from ${\tt ISLR}$. 

In [12]:
# Read CSV columns 1-9 (column 0 is skipped); the first column kept becomes
# the index and is parsed as dates so rows can be sliced by year later on.
df = pd.read_csv(
    'Smarket.csv',
    usecols=range(1, 10),
    index_col=0,
    parse_dates=True,
)
df.head()

Unnamed: 0_level_0,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2001-01-01,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
2001-01-01,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2001-01-01,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
2001-01-01,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
2001-01-01,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [33]:
# Train on every observation up to and including 2004; hold out 2005 onward.
predictors = ['Lag1', 'Lag2']

X_train = df.loc[:'2004', predictors]
y_train = df.loc[:'2004', 'Direction']

X_test = df.loc['2005':, predictors]
y_test = df.loc['2005':, 'Direction']

# Fit LDA on the two lag predictors and show the fitted class priors.
lda = LDA()
model = lda.fit(X_train, y_train)

print(model.priors_)

[0.49198397 0.50801603]


In [14]:
# Per-class mean of each predictor: one row per class, one column per feature.
print(model.means_)

[[ 0.04279022  0.03389409]
 [-0.03954635 -0.03132544]]


In [15]:
# Weights of the fitted linear decision function, one per predictor.
print(model.coef_)

[[-0.05544078 -0.0443452 ]]


In [16]:
# Classify the held-out observations, then tally how often each label is predicted.
pred = model.predict(X_test)
label_counts = np.unique(pred, return_counts=True)
print(label_counts)

(array(['Down', 'Up'], dtype='<U4'), array([ 70, 182], dtype=int64))


In [17]:
# confusion_matrix expects (y_true, y_pred). Passing them in that order and
# transposing keeps predicted labels on the rows and true labels on the
# columns, the same layout the KNN cells in this notebook print.
print(confusion_matrix(y_test, pred).T)
print(classification_report(y_test, pred, digits=3))

[[ 35  35]
 [ 76 106]]
              precision    recall  f1-score   support

        Down      0.500     0.315     0.387       111
          Up      0.582     0.752     0.656       141

    accuracy                          0.560       252
   macro avg      0.541     0.534     0.522       252
weighted avg      0.546     0.560     0.538       252



In [35]:
# Posterior class probabilities for each test row; columns are ordered as in model.classes_.
pred_p = model.predict_proba(X_test)

In [36]:
# Count the test rows whose column-1 posterior exceeds one half.
above_half = pred_p[:, 1] > 0.5
print(np.unique(above_half, return_counts=True))

(array([False,  True]), array([ 70, 182], dtype=int64))


In [37]:
# Show the column-1 posterior next to the predicted label for a sample of test rows.
# NOTE(review): the slice 1:20 starts at position 1, so row 0 is skipped and only
# 19 rows are shown; use [:20] if the first 20 rows were intended.
# NOTE(review): stacking floats with strings makes NumPy coerce the probabilities
# to strings, which is why they print quoted.
# NOTE(review): `pred` is rebound by the KNN cells further down; re-running this
# cell after them would pair LDA probabilities with KNN labels.
print(np.stack((pred_p[1:20,1], pred[1:20])).T)

[['0.5207815009003174' 'Up']
 ['0.5331815201479345' 'Up']
 ['0.5259989305447519' 'Up']
 ['0.5072123360325546' 'Up']
 ['0.5061438450024964' 'Up']
 ['0.5048984353537768' 'Up']
 ['0.5127139005781852' 'Up']
 ['0.5092986510395949' 'Up']
 ['0.5155973759281307' 'Up']
 ['0.5093037238790318' 'Up']
 ['0.4880011537380811' 'Down']
 ['0.510484773063352' 'Up']
 ['0.5293238777881214' 'Up']
 ['0.5255407143881711' 'Up']
 ['0.5200416608518921' 'Up']
 ['0.5064224705341396' 'Up']
 ['0.4969106228816935' 'Down']
 ['0.5021193878585957' 'Up']
 ['0.5113669134834818' 'Up']]


In [38]:
# Count the test rows whose column-1 posterior exceeds the stricter 0.9 cut-off.
above_ninety = pred_p[:, 1] > 0.9
print(np.unique(above_ninety, return_counts=True))

(array([False]), array([252], dtype=int64))


In [39]:
# Largest column-1 posterior over the whole test set.
# ndarray.max() performs the reduction inside NumPy instead of iterating the
# array element by element with the Python built-in max().
pred_p[:, 1].max()

0.5422132554518978

# 4.6.4 Quadratic Discriminant Analysis


In [25]:
# Fit QDA on the same training window and predictors used for LDA,
# then show the fitted class priors followed by the per-class predictor means.
qda = QDA()
model2 = qda.fit(X_train, y_train)

print(model2.priors_, model2.means_, sep='\n')

[0.49198397 0.50801603]
[[ 0.04279022  0.03389409]
 [-0.03954635 -0.03132544]]


In [26]:
# Classify the held-out observations with QDA and tally the predicted labels.
pred2 = model2.predict(X_test)
print(np.unique(pred2, return_counts=True))

# confusion_matrix expects (y_true, y_pred). Passing them in that order and
# transposing keeps predicted labels on the rows and true labels on the
# columns, the same layout the KNN cells in this notebook print.
print(confusion_matrix(y_test, pred2).T)
print(classification_report(y_test, pred2, digits=3))

(array(['Down', 'Up'], dtype=object), array([ 50, 202], dtype=int64))
[[ 30  20]
 [ 81 121]]
              precision    recall  f1-score   support

        Down      0.600     0.270     0.373       111
          Up      0.599     0.858     0.706       141

    accuracy                          0.599       252
   macro avg      0.600     0.564     0.539       252
weighted avg      0.599     0.599     0.559       252



In [27]:
# Load the Carseats data set and preview its first rows.
# NOTE(review): no other cell visible in this part of the notebook reads df2 —
# confirm it is still needed in this section.
df2 = pd.read_csv('Carseats.csv')
df2.head()

Unnamed: 0.1,Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,1,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,2,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,3,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,4,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,5,4.15,141,64,3,340,128,Bad,38,13,Yes,No


# 4.6.5 K-Nearest Neighbors

In [42]:
# KNN with K = 1: fit on the training window, then classify the held-out rows.
# NOTE: this rebinds `pred`, which the LDA cell earlier in the notebook also assigns.
knn = neighbors.KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

# Transposed so that rows are predicted labels and columns are true labels.
cm = confusion_matrix(y_test, pred)
print(cm.T)
print(classification_report(y_test, pred, digits=3))

[[43 58]
 [68 83]]
              precision    recall  f1-score   support

        Down      0.426     0.387     0.406       111
          Up      0.550     0.589     0.568       141

    accuracy                          0.500       252
   macro avg      0.488     0.488     0.487       252
weighted avg      0.495     0.500     0.497       252



In [43]:
# KNN with K = 3: same procedure as the K = 1 cell, with a larger neighbourhood.
# NOTE: this rebinds both `knn` and `pred`.
knn = neighbors.KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

# Transposed so that rows are predicted labels and columns are true labels.
cm = confusion_matrix(y_test, pred)
print(cm.T)
print(classification_report(y_test, pred, digits=3))

[[48 55]
 [63 86]]
              precision    recall  f1-score   support

        Down      0.466     0.432     0.449       111
          Up      0.577     0.610     0.593       141

    accuracy                          0.532       252
   macro avg      0.522     0.521     0.521       252
weighted avg      0.528     0.532     0.529       252

