# ML in Finance Group Project
### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch
## 2. Feature Selection

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier

### Import Attribute Matrix and Response Vector

In [2]:
# Response vector: read from the generated-datasets folder; the first row is the header.
y = pd.read_csv('Data/generated_datasets/response_1.csv', header=0, sep=',')

# Attribute matrix: exactly one of the two files below should be active.
# Move the comment marker to switch between the two attribute files.
# X = pd.read_csv('Data/generated_datasets/attributes_ratios_1.csv', sep=',', header=0)
X = pd.read_csv('Data/generated_datasets/attributes_additional_1.csv', header=0, sep=',')

### Train/Test Split and Mean Imputation of Missing Values

In [3]:
# Train/test split (80 % train / 20 % test, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Mean imputation: the column means are learned from the TRAINING set only.
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22;
# on newer versions use `sklearn.impute.SimpleImputer(strategy='mean')` instead
# (requires changing the import cell as well).
imp = Imputer(missing_values=np.nan, strategy='mean', axis=0)
imputed_dataset = pd.DataFrame(imp.fit_transform(X_train),
                               columns=X_train.columns,
                               index=X_train.index)
X_train = imputed_dataset

# Fill the test set with the means learned on the training set.
# Re-fitting the imputer on the test data (fit_transform) would leak test-set
# statistics into preprocessing and fill the two partitions with different values,
# so only `transform` is applied here.
imputed_dataset = pd.DataFrame(imp.transform(X_test),
                               columns=X_test.columns,
                               index=X_test.index)
X_test = imputed_dataset

# Extract the feature labels (column names, in column order)
feature_labels = list(X_train)

### Print out Shape and Form of Feature Matrix and Response Vector

In [4]:
# --- Training partition: shape, first feature rows, first response rows ---
print('Shape of Feature Matrix X_train = {0}'.format(X_train.shape), end='\n\n')
print('Feature Matrix X_train')
display(X_train.head())
print('\nResponse Vector y_train')
display(y_train.head())
print()

# --- Test partition: same three views ---
print('Shape of Feature Matrix X_test = {0}'.format(X_test.shape), end='\n\n')
print('Feature Matrix X_test')
display(X_test.head())
print('\nResponse Vector y_test')
display(y_test.head())
print()

# Type of the container holding the column names
print('Type of feature_labels = {0}'.format(type(feature_labels)), end='\n\n')


# Sanity check: the share of '1' labels should be roughly equal in the training and test responses
train_ones_ratio = y_train.sum() / y_train.size
test_ones_ratio = y_test.sum() / y_test.size
print('Ratio of "Ones" (Train) =  ' + str(train_ones_ratio))
print('Ratio of "Ones" (Test)  =  ' + str(test_ones_ratio))
del train_ones_ratio, test_ones_ratio


Shape of Feature Matrix X_train = (2836, 181)

Feature Matrix X_train


Unnamed: 0,SHRFLG,SHRENDDT,BIDLO,ASKHI,PRC,VOL,RET,BID,ASK,SHROUT,...,CUSIP_65410610,CUSIP_71708110,CUSIP_74271810,CUSIP_88579Y10,CUSIP_89417E10,CUSIP_91301710,CUSIP_91324P10,CUSIP_92343V10,CUSIP_92826C83,CUSIP_93114210
1530,0.0,20140929.0,28.04,29.49,29.39,4611190.0,0.024042,29.4,29.41,6340863.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1397,0.0,20130730.0,32.17,34.65,32.17,1296447.0,-0.057168,32.16,32.17,1209589.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2238,0.0,20140330.0,54.31,58.49,56.82,3473222.0,0.026373,56.83,56.84,3786825.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3170,0.0,20120426.0,19.8,20.43,19.88,10100617.0,0.011962,19.87,19.88,5385938.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2244,0.0,20140929.0,55.91,59.74,59.45,2415634.0,0.030865,59.44,59.45,3761281.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Response Vector y_train


Unnamed: 0,0
1530,1
1397,0
2238,1
3170,1
2244,1



Shape of Feature Matrix X_test = (710, 181)

Feature Matrix X_test


Unnamed: 0,SHRFLG,SHRENDDT,BIDLO,ASKHI,PRC,VOL,RET,BID,ASK,SHROUT,...,CUSIP_65410610,CUSIP_71708110,CUSIP_74271810,CUSIP_88579Y10,CUSIP_89417E10,CUSIP_91301710,CUSIP_91324P10,CUSIP_92343V10,CUSIP_92826C83,CUSIP_93114210
817,0.0,20141009.0,97.87,103.3,100.75,15283673.0,-0.017073,100.75,100.76,5866161.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2592,0.0,20131030.0,72.43,77.64,75.52,746229.0,0.050202,75.53,75.54,1078864.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1475,0.0,20100225.0,18.53,20.0,18.66,10148052.0,0.025838,18.66,18.67,8069536.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1826,0.0,20090830.0,26.45,30.99,30.01,4201669.0,0.073319,29.95,29.98,2108780.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
599,0.0,20060730.0,56.78,62.45,62.06,2212972.0,0.037966,62.06,62.07,2216537.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0



Response Vector y_test


Unnamed: 0,0
817,0
2592,1
1475,1
1826,1
599,1



Type of feature_labels = <class 'list'>

Ratio of "Ones" (Train) =  0    0.559238
dtype: float64
Ratio of "Ones" (Test)  =  0    0.567606
dtype: float64


### Feature Selection with Random Forest

In [5]:
# Fit a depth-limited random forest and rank the features by impurity-based importance.
# max_depth is passed to the constructor instead of being patched onto the instance afterwards.
forest = RandomForestClassifier(max_depth=6, random_state=0)

# y_train is a single-column DataFrame; scikit-learn expects a 1-D target and otherwise
# emits a DataConversionWarning, so it is flattened explicitly.
forest.fit(X_train, y_train.values.ravel())

# Check features for their importance for the prediction
feature_importances = forest.feature_importances_

# Sort feature positions by importance, most important first
indices = np.argsort(feature_importances)[::-1]

# Print the best n features (slicing keeps this safe if there are fewer than n features)
n = 15
for rank, idx in enumerate(indices[:n], start=1):
    print('{0:2d} {1:7s} {2:6.4f}'.format(rank, feature_labels[idx],
                                          feature_importances[idx]))
# Keep the notebook namespace free of loop temporaries
del rank, idx


 1 RET     0.4305
 2 RETX    0.1559
 3 vwretx  0.0864
 4 ewretx  0.0773
 5 sprtrn  0.0348
 6 vwretd  0.0341
 7 ewretd  0.0164
 8 pe_op_dil 0.0093
 9 cash_ratio 0.0091
10 pe_inc  0.0080
11 ALTPRC  0.0064
12 debt_ebitda 0.0063
13 ALTPRCDT 0.0055
14 fcf_ocf 0.0051
15 PRC     0.0048


  This is separate from the ipykernel package so we can avoid doing imports until


### Principal Component Analysis (PCA)