# Linear Discriminant Analysis

todo:
- factorize signal
- splits:
    - split in training and test split
    - split for Stratified-k-Fold Cross Validation
- Feed model with:
    - training set
    - standardized training set
    - balanced training set
    - balanced standardized training set
- checking the performance on test set

In [1]:
# import necessary modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
# import cleaned data frame
df_cleaned = pd.read_csv('cleaned_data/Cleaned Data.csv')

## Factorize signal

In [3]:
# code by Christian
SignalFac = []

for string in df_cleaned['Signal']:
    if string == 'Sell':
        SignalFac.append(0)
    elif string == 'Hold':
        SignalFac.append(1)    
    else:
        SignalFac.append(2)

In [4]:
df_cleaned['SignalFac'] = SignalFac
df_cleaned.drop('Signal', axis=1, inplace=True)

In [5]:
X = df_cleaned.drop('SignalFac',axis=1)
y = df_cleaned['SignalFac']

## Part 1: Split data in training and test split

In [6]:
# get training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0, stratify=y)

## Part 2: Setup for 5-fold CV on training set

In [7]:
# Create k-Fold CV
kFold = StratifiedKFold(n_splits = 5)

## Feed LDA with training data

In [8]:
# create LDA object and run classifier
lda = LDA(solver="lsqr")
lda = lda.fit(X_train, y_train)

In [9]:
# performance on training set
print('score: {0:.4f}'.format(lda.score(X_train, y_train)))
print('error-rate: {0:.4f}'.format(1-lda.score(X_train, y_train)))

score: 0.5430
error-rate: 0.4570


In [10]:
# performance on test set
y_pred = lda.predict(X_test)
print('score test set: {0:.4f}'.format(accuracy_score(y_test, y_pred)))

score test set: 0.5348


In [11]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.55      0.87      0.67      1846
           1       0.05      0.00      0.01       224
           2       0.47      0.18      0.26      1404

    accuracy                           0.53      3474
   macro avg       0.36      0.35      0.31      3474
weighted avg       0.49      0.53      0.46      3474



In [13]:
from sklearn import metrics
print('Confusion matrix: \n', 
      metrics.confusion_matrix(y_test, y_pred))

Confusion matrix: 
 [[1611   11  224]
 [ 175    1   48]
 [1148   10  246]]


## Feed LDA with standardized training data

In [11]:
# standardize features
# Apply StandardScaler on continuous columns only
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train) # fit & transform
X_test_std = stdsc.transform(X_test) # ONLY transform

In [12]:
# create LDA object and run classifier on standardized features
lda_std = LDA(solver="lsqr")
lda_std = lda_std.fit(X_train_std, y_train)

In [13]:
# performance on standardized training set
print('score: {0:.4f}'.format(lda.score(X_train_std, y_train)))
print('error-rate: {0:.4f}'.format(1-lda.score(X_train_std, y_train)))

score: 0.4043
error-rate: 0.5957


In [14]:
# performance on standardized test set
y_pred_std = lda.predict(X_test_std)
print('score standardized test set: {0:.4f}'.format(accuracy_score(y_test, y_pred_std)))

score standardized test set: 0.4041


## Feed LDA with balanaced training data

In [24]:
print(X.shape,
      y[y==0].shape,
y[y==1].shape,
y[y==2].shape)


(11579, 231) (6153,) (745,) (4681,)


In [36]:
from sklearn.utils import resample
X_upsampled, y_upsampled = resample(X[y==1], y[y==1],
                                   replace = True,
                                   n_samples = X[y==0].shape[0],
                                   random_state=1)
X_upsampled_2, y_upsampled_2 = resample(X[y==2], y[y==2],
                                   replace = True,
                                   n_samples = X[y==0].shape[0],
                                   random_state=1)


X_bal = np.vstack((X[y==0], X_upsampled, X_upsampled_2))
y_bal = np.hstack((y[y==0], y_upsampled, y_upsampled_2))
y_bal.shape

(18459,)

In [43]:
X_train_bal, X_test_bal, y_train_bal, y_test_bal = \
    train_test_split(X_bal, y_bal, 
                     test_size = 0.3, 
                     random_state = 0, 
                     stratify = y_bal)

X_train_bal_std = stdsc.fit_transform(X_train_bal)
X_test_bal_std =stdsc.transform(X_test_bal)


In [44]:
from sklearn import metrics
model = LDA(solver="lsqr")
model.fit(X_train_bal_std, y_train_bal)


y_pred_bal = model.predict(X_test_bal_std)
print(metrics.classification_report(y_test_bal, y_pred_bal))
print(metrics.confusion_matrix(y_test_bal, y_pred_bal))
print("Test score : {:.2f}".format(model.score(X_test_bal_std, y_test_bal)))

              precision    recall  f1-score   support

           0       0.46      0.46      0.46      1846
           1       0.48      0.50      0.49      1846
           2       0.42      0.39      0.40      1846

    accuracy                           0.45      5538
   macro avg       0.45      0.45      0.45      5538
weighted avg       0.45      0.45      0.45      5538

[[854 469 523]
 [438 929 479]
 [581 547 718]]
Test score : 0.45
