# Linear Discriminant Analysis

- no need to factorize or creating dummy variables for categorical features because already done in data cleaning step
- 2 parts:
    - split in training and test split
    - split for Stratified-k-Fold Cross Validation
- standardize training set
- feed LDA model
- checking the performance on test set

In [48]:
# import necessary modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler

In [11]:
# import cleaned data frame
df_cleaned = pd.read_csv('cleaned_data/Cleaned Data.csv')
df_cleaned.head(3)

Unnamed: 0.1,Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,...,Consumer Cyclical,Consumer Defensive,Energy,Financial Services,Healthcare,Industrials,Real Estate,Technology,Utilities,Signal
0,2,3734148000.0,1.1737,2805625000.0,928522600.0,108330300.0,344141400.0,793926700.0,134595900.0,12148690.0,...,0,1,0,0,0,0,0,0,0,Sell
1,5,17909600000.0,0.0076,11539800000.0,6369800000.0,0.0,3474300000.0,3412400000.0,2957400000.0,302400000.0,...,0,1,0,0,0,0,0,0,0,Buy
2,15,5727000000.0,0.0214,3523600000.0,2203400000.0,0.0,1480500000.0,1598700000.0,604700000.0,60400000.0,...,0,1,0,0,0,0,0,0,0,Buy


## Part 1: Split data in training and test split

In [35]:
# get column names
cols = df_cleaned.columns.values

# seperate responses and features
y = df_cleaned[cols[-1]] # responses
X = df_cleaned[cols[:-1]] # features

# get training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0, stratify=y)

Unnamed: 0.1,Unnamed: 0,Revenue,Revenue Growth,Cost of Revenue,Gross Profit,R&D Expenses,SG&A Expense,Operating Expenses,Operating Income,Interest Expense,...,Communication Services,Consumer Cyclical,Consumer Defensive,Energy,Financial Services,Healthcare,Industrials,Real Estate,Technology,Utilities
6715,12614,25370000.0,-0.1235,17081000.0,8289000.0,0.0,6948000.0,6948000.0,1341000.0,0.0,...,0,0,0,0,0,0,0,0,1,0
10183,19529,3873800000.0,0.0648,3084900000.0,788900000.0,0.0,397900000.0,419700000.0,369200000.0,51000000.0,...,0,1,0,0,0,0,0,0,0,0
229,410,1153000.0,-0.1013,0.0,1153000.0,20707000.0,16758000.0,37465000.0,-36312000.0,82000.0,...,0,0,0,0,0,1,0,0,0,0


## Part 2: Setup for 5-fold CV on training set

In [33]:
# Create k-Fold CV
kFold = StratifiedKFold(n_splits = 5)

## Feed LDA with training data

In [None]:
# create LDA object and run classifier
lda = LDA(solver="lsqr")
lda = lda.fit(X_train, y_train)

In [46]:
# performance on training data
print('score: {0:.4f}'.format(lda.score(X_train, y_train)))
print('error-rate: {0:.4f}'.format(1-lda.score(X_train, y_train)))

score: 0.5395
error-rate: 0.4605


In [50]:
# standardize features
# Apply StandardScaler on continuous columns only
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train[cols[:-1]]) # fit & transform
X_test_std = stdsc.transform(X_test[cols[:-1]]) # ONLY transform

In [51]:
# create LDA object and run classifier on standardized features
lda_std = LDA(solver="lsqr")
lda_std = lda_std.fit(X_train_std, y_train)

In [52]:
# performance on standardized training data
print('score: {0:.4f}'.format(lda.score(X_train_std, y_train)))
print('error-rate: {0:.4f}'.format(1-lda.score(X_train_std, y_train)))

score: 0.4043
error-rate: 0.5957


In [57]:
X_train.var()

Unnamed: 0         4.186468e+07
Revenue            1.539322e+19
Revenue Growth     1.507879e+02
Cost of Revenue    8.632944e+18
Gross Profit       1.730393e+18
                       ...     
Healthcare         1.300292e-01
Industrials        1.220356e-01
Real Estate        6.047910e-02
Technology         1.296014e-01
Utilities          2.617676e-02
Length: 231, dtype: float64

In [58]:
X_train_std.var()

0.9956709956709953