# Linear Discriminant Analysis

todo:
- factorize signal
- splits:
    - split in training and test split
    - split for Stratified-k-Fold Cross Validation
- Feed model with:
    - training set
    - standardized training set
    - balanced training set
    - balanced standardized training set
- checking the performance on test set

In [47]:
# import necessary modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [48]:
# import cleaned data frame
df_cleaned = pd.read_csv('cleaned_data/Cleaned Data.csv')

## Factorize signal

In [50]:
# code by Christian
SignalFac = []

for string in df_cleaned['Signal']:
    if string == 'Sell':
        SignalFac.append(0)
    elif string == 'Hold':
        SignalFac.append(1)    
    else:
        SignalFac.append(2)

In [51]:
df_cleaned['SignalFac'] = SignalFac
df_cleaned.drop('Signal', axis=1, inplace=True)

In [55]:
X = df_cleaned.drop('SignalFac',axis=1)
y = df_cleaned['SignalFac']

## Part 1: Split data in training and test split

In [56]:
# get training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0, stratify=y)

## Part 2: Setup for 5-fold CV on training set

In [57]:
# Create k-Fold CV
kFold = StratifiedKFold(n_splits = 5)

## Feed LDA with training data

In [58]:
# create LDA object and run classifier
lda = LDA(solver="lsqr")
lda = lda.fit(X_train, y_train)

In [59]:
# performance on training set
print('score: {0:.4f}'.format(lda.score(X_train, y_train)))
print('error-rate: {0:.4f}'.format(1-lda.score(X_train, y_train)))

score: 0.5421
error-rate: 0.4579


In [60]:
# performance on test set
y_pred = lda.predict(X_test)
print('score test set: {0:.4f}'.format(accuracy_score(y_test, y_pred)))

score test set: 0.5331


## Feed LDA with standardized training data

In [61]:
# standardize features
# Apply StandardScaler on continuous columns only
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train[cols[:-1]]) # fit & transform
X_test_std = stdsc.transform(X_test[cols[:-1]]) # ONLY transform

In [62]:
# create LDA object and run classifier on standardized features
lda_std = LDA(solver="lsqr")
lda_std = lda_std.fit(X_train_std, y_train)

In [63]:
# performance on standardized training set
print('score: {0:.4f}'.format(lda.score(X_train_std, y_train)))
print('error-rate: {0:.4f}'.format(1-lda.score(X_train_std, y_train)))

score: 0.4043
error-rate: 0.5957


In [64]:
# performance on standardized test set
y_pred_std = lda.predict(X_test_std)
print('score standardized test set: {0:.4f}'.format(accuracy_score(y_test, y_pred_std)))

score standardized test set: 0.4041


## Feed LDA with balanaced training data