In [8]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as st
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

## Problem 2

\begin{equation*}
    p_y(4) = \frac{0.8 \cdot N(4 | 10, 36)}{0.8 \cdot N(4 | 10, 36) + 0.2\cdot N(4 | 0, 36)}
\end{equation*}

In [2]:
# Bayes' rule for the mixture in the formula above: posterior weight at x = 4 of
# the component with mean 10 and prior weight 0.8.
# st.norm.pdf takes the standard deviation as its scale argument, so the
# variance 36 written in the formula is passed as sigma = 6.
x_obs, sigma = 4, 6
prior_a, prior_b = 0.8, 0.2
weighted_a = prior_a * st.norm.pdf(x_obs, 10, sigma)  # numerator term, computed once
weighted_b = prior_b * st.norm.pdf(x_obs, 0, sigma)
posterior = weighted_a / (weighted_a + weighted_b)
posterior

0.7518524532975261

## Problem 3

 Load the dataset

In [6]:
# Read the weekly table, using the first CSV column as the row index, and
# convert Direction to a categorical column in the same expression.
data = pd.read_csv('Weekly.csv', index_col=0).astype({'Direction': 'category'})
data.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
1,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.27,Down
2,1990,-0.27,0.816,1.572,-3.936,-0.229,0.148574,-2.576,Down
3,1990,-2.576,-0.27,0.816,1.572,-3.936,0.159837,3.514,Up
4,1990,3.514,-2.576,-0.27,0.816,1.572,0.16163,0.712,Up
5,1990,0.712,3.514,-2.576,-0.27,0.816,0.153728,1.178,Up


In [42]:
# Show the category labels of Direction and the order they are stored in.
data['Direction'].cat.categories

Index(['Down', 'Up'], dtype='object')

#### b.

In [43]:
# Binomial GLM (logistic regression) of Direction on Volume and the five lagged
# returns, fitted on the full data set.
model = smf.glm(
    formula='Direction ~ Volume + Lag1 + Lag2 + Lag3 + Lag4 + Lag5',
    data=data,
    family=sm.families.Binomial(),
).fit()

In [44]:
# Coefficient table for the full model. The dependent variable is reported as
# ['Direction[Down]', 'Direction[Up]']; statsmodels treats the first column as
# the "success" outcome, so the coefficients describe the log-odds of Down.
model.summary()

0,1,2,3
Dep. Variable:,"['Direction[Down]', 'Direction[Up]']",No. Observations:,1089
Model:,GLM,Df Residuals:,1082
Model Family:,Binomial,Df Model:,6
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-743.18
Date:,"Wed, 10 Apr 2019",Deviance:,1486.4
Time:,00:50:05,Pearson chi2:,1.09e+03
No. Iterations:,4,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.2669,0.086,-3.106,0.002,-0.435,-0.098
Volume,0.0227,0.037,0.616,0.538,-0.050,0.095
Lag1,0.0413,0.026,1.563,0.118,-0.010,0.093
Lag2,-0.0584,0.027,-2.175,0.030,-0.111,-0.006
Lag3,0.0161,0.027,0.602,0.547,-0.036,0.068
Lag4,0.0278,0.026,1.050,0.294,-0.024,0.080
Lag5,0.0145,0.026,0.549,0.583,-0.037,0.066


In [49]:
# The model summary lists the response as ['Direction[Down]', 'Direction[Up]'];
# statsmodels takes the first column as the "success" outcome, so the fitted
# values are P(Down), not P(Up). Hence a probability above 0.5 maps to 'Down'.
prob_down = model.predict()
predicted = ['Down' if is_down else 'Up' for is_down in (prob_down > 0.5)]

In [60]:
# In-sample confusion matrix for the full logistic model.
# Rows are the true classes and columns the predicted classes, both in sorted
# label order (Down, Up). confusion_matrix is imported in the top import cell.
cm = confusion_matrix(data['Direction'], predicted)
cm

array([[ 54, 430],
       [ 48, 557]])

In [61]:
# Overall accuracy: correctly classified weeks (the diagonal of the confusion
# matrix) divided by all weeks. numpy is imported in the top import cell.
np.trace(cm) / np.sum(cm)

0.5610651974288338

In [56]:
# Hold-out split by calendar year: 1990 through 2008 (both inclusive) is used
# for fitting, every remaining row for evaluation.
mask = data['Year'].between(1990, 2008)
train = data.loc[mask]
test = data.loc[~mask]

# Logistic regression with Lag2 as the only predictor, fitted on the training rows.
model2 = smf.glm(
    formula='Direction ~ Lag2',
    data=train,
    family=sm.families.Binomial(),
).fit()
model2.summary()

0,1,2,3
Dep. Variable:,"['Direction[Down]', 'Direction[Up]']",No. Observations:,985
Model:,GLM,Df Residuals:,983
Model Family:,Binomial,Df Model:,1
Link Function:,logit,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-675.27
Date:,"Wed, 10 Apr 2019",Deviance:,1350.5
Time:,00:56:12,Pearson chi2:,985.
No. Iterations:,4,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.2033,0.064,-3.162,0.002,-0.329,-0.077
Lag2,-0.0581,0.029,-2.024,0.043,-0.114,-0.002


In [62]:
# model2's response is also ['Direction[Down]', 'Direction[Up]'], so its
# predictions are P(Down); values above 0.5 are labelled 'Down'.
prob_down_test = model2.predict(test)
test_pred = ['Down' if is_down else 'Up' for is_down in (prob_down_test > 0.5)]

# Confusion matrix on the held-out rows (true classes in rows, predictions in columns).
cm2 = confusion_matrix(y_true=test['Direction'], y_pred=test_pred)
cm2

array([[ 9, 34],
       [ 5, 56]])

In [63]:
# Hold-out accuracy of the Lag2-only logistic model: diagonal count over total count.
cm2.diagonal().sum() / cm2.sum()

0.625

In [65]:
# Linear discriminant analysis on the training rows with Lag2 as the single
# feature; the double brackets keep the feature as a one-column DataFrame.
# The discriminant-analysis classes are imported in the top import cell.
lda = LinearDiscriminantAnalysis().fit(train[['Lag2']], train['Direction'])

In [69]:
# Classify the held-out rows with the fitted LDA model and tabulate the
# true classes (rows) against the predictions (columns).
lda_pred = lda.predict(X=test[['Lag2']])
cm_lda = confusion_matrix(y_true=test['Direction'], y_pred=lda_pred)
cm_lda

array([[ 8, 35],
       [ 4, 57]])

In [70]:
# Hold-out accuracy of LDA: diagonal count over total count.
cm_lda.diagonal().sum() / cm_lda.sum()

0.625

In [71]:
# Quadratic discriminant analysis on the same single-feature training data.
qda = QuadraticDiscriminantAnalysis().fit(X=train[['Lag2']], y=train['Direction'])

In [72]:
# Classify the held-out rows with the fitted QDA model and tabulate the
# true classes (rows) against the predictions (columns).
qda_pred = qda.predict(X=test[['Lag2']])
cm_qda = confusion_matrix(y_true=test['Direction'], y_pred=qda_pred)
cm_qda

array([[ 0, 43],
       [ 0, 61]])

In [73]:
# Hold-out accuracy of QDA: diagonal count over total count.
cm_qda.diagonal().sum() / cm_qda.sum()

0.5865384615384616

In [84]:
# Nearest-neighbour classifier with k = 1 on the training rows, Lag2 as the
# single feature. KNeighborsClassifier is imported in the top import cell.
knn = KNeighborsClassifier(n_neighbors=1).fit(train[['Lag2']], train['Direction'])

In [85]:
# Classify the held-out rows with the fitted 1-NN model and tabulate the
# true classes (rows) against the predictions (columns).
knn_pred = knn.predict(X=test[['Lag2']])
cm_knn = confusion_matrix(y_true=test['Direction'], y_pred=knn_pred)
cm_knn

array([[21, 22],
       [31, 30]])

In [86]:
# Hold-out accuracy of 1-NN: diagonal count over total count.
cm_knn.diagonal().sum() / cm_knn.sum()

0.49038461538461536