### Introduction to Machine Learning, UZH FS18, Group Project

### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch


#     
# II. Feature Selection (II.i & Classification with Random Forest)

In [1]:
# hide unnecessary warnings ("deprecation" of packages etc.)
import warnings
warnings.filterwarnings('ignore')

# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier

# 1. Import Standardized Feature Matrix / Response Vector / Feature Labels    
####     
### Dataset Version 1: only ratios as predictive features
### Dataset Version 2: ratios + seasonality dummy variables + other market data
####     
### => All Datasets modified with sklearn Imputer (All Nan filled with "median")
###    

#### Version 1: only ratios as predictive features

In [None]:
# Import feature matrix X1, response vector y1 and the extracted feature labels of dataset 1
# from the pre-generated train / test split files.
# The pandas CSV reader is `pd.read_csv`; `pd.read.csv` raises an AttributeError.

X1_train = pd.read_csv('Data/generated_splits/X1_train.csv')
y1_train = pd.read_csv('Data/generated_splits/y1_train.csv')

X1_test = pd.read_csv('Data/generated_splits/X1_test.csv')
y1_test = pd.read_csv('Data/generated_splits/y1_test.csv')

feature_labels1 = pd.read_csv('Data/generated_splits/feature_labels1.csv')

print('Shape of Feature Matrix X1_train = ' + str(X1_train.shape))
print("")
print('Feature Matrix X1_train')
display(X1_train.head())
print("")
print('Response Vector y1_train')
display(y1_train.head())
print("")

print('Shape of Feature Matrix X1_test = ' + str(X1_test.shape))
print("")
print('Feature Matrix X1_test')
display(X1_test.head())
print("")
print('Response Vector y1_test')
display(y1_test.head())
print("")

print('Type of feature_labels1 = ' + str(type(feature_labels1)))
print("")

# Check if there is approximately the same percentage of '1' in both training and test response vector.
# NOTE(review): read_csv returns DataFrames, so `.sum()` is computed per column while `.size`
# counts all cells; the ratio is the share of ones only if each y-file holds exactly one
# response column (no saved index column) - TODO confirm against the files.
print('Ratio of "Ones" (Train) =  ' + str(y1_train.sum() / y1_train.size))
print('Ratio of "Ones" (Test)  =  ' + str(y1_test.sum() / y1_test.size))


#### Version 2: ratios + seasonality + other market data

In [None]:
# Import feature matrix X2, response vector y2 and the extracted feature labels of dataset 2
# from the pre-generated train / test split files.
# The pandas CSV reader is `pd.read_csv`; `pd.read.csv` raises an AttributeError.

# Train
X2_train = pd.read_csv('Data/generated_splits/X2_train.csv')
y2_train = pd.read_csv('Data/generated_splits/y2_train.csv')

# Test
X2_test = pd.read_csv('Data/generated_splits/X2_test.csv')
y2_test = pd.read_csv('Data/generated_splits/y2_test.csv')

feature_labels2 = pd.read_csv('Data/generated_splits/feature_labels2.csv')


print('Shape of Feature Matrix X2_train = ' + str(X2_train.shape))
print("")
print('Feature Matrix X2_train')
display(X2_train.head())
print("")
print('Response Vector y2_train')
display(y2_train.head())
print("")

print('Shape of Feature Matrix X2_test = ' + str(X2_test.shape))
print("")
print('Feature Matrix X2_test')
display(X2_test.head())
print("")
print('Response Vector y2_test')
display(y2_test.head())
print("")

print('Type of feature_labels2 = ' + str(type(feature_labels2)))
print("")

# Check if there is approximately the same percentage of '1' in both training and test response vector.
# NOTE(review): read_csv returns DataFrames, so `.sum()` is computed per column while `.size`
# counts all cells; the ratio is the share of ones only if each y-file holds exactly one
# response column (no saved index column) - TODO confirm against the files.
print('Ratio of "Ones" (Train) =  ' + str(y2_train.sum() / y2_train.size))
print('Ratio of "Ones" (Test)  =  ' + str(y2_test.sum() / y2_test.size))


###   
# 2. Feature Selection with Random Forest
###   

## Select Features according to their explanatory value / importance for the response

# --- OLD CODE ---

## Feature Matrix and Response Vector 

## Train - / Test - Split

#### Version 1: Imputed Dataset

In [None]:
# Stratified 80 / 20 train-test split of version 1; the fixed random_state makes it reproducible.
# NOTE(review): X1 / y1 are only assigned in a later cell of this notebook.
split_1 = train_test_split(X1, y1, test_size = 0.2, random_state = 0, stratify = y1)
X1_train, X1_test, y1_train, y1_test = split_1

# The share of '1' should be approximately the same in the training and the test response vector
for response_1 in (y1_train, y1_test):
    display(response_1.sum() / response_1.size)
del response_1


#### Version 2: Dataset with rows dropped where Nan

In [None]:
# Stratified 80 / 20 train-test split of version 2; the fixed random_state makes it reproducible.
# NOTE(review): X2 / y2 are only assigned in a later cell of this notebook.
split_2 = train_test_split(X2, y2, test_size = 0.2, random_state = 0, stratify = y2)
X2_train, X2_test, y2_train, y2_test = split_2

# The share of '1' should be approximately the same in the training and the test response vector
for response_2 in (y2_train, y2_test):
    display(response_2.sum() / response_2.size)
del response_2

## Standardize Variables

#### Version 1: Imputed Dataset

In [None]:
# Standardize version 1 with sklearn's StandardScaler: the scaler is fitted on the
# training features only and the same fitted scaling is then applied to the test features.
standard_scaler_1 = preprocessing.StandardScaler()
X1_train = standard_scaler_1.fit_transform(X1_train)
X1_test = standard_scaler_1.transform(X1_test)


#### Version 2: Dataset with rows dropped where Nan

In [None]:
# Standardize version 2 with sklearn's StandardScaler: the scaler is fitted on the
# training features only and the same fitted scaling is then applied to the test features.
standard_scaler_2 = preprocessing.StandardScaler()
X2_train = standard_scaler_2.fit_transform(X2_train)
X2_test = standard_scaler_2.transform(X2_test)


## Select Features according to their explanatory value / importance for the response

#### Version 1: Imputed Dataset

In [None]:
# Fit a random forest on the version 1 training split and rank the features by importance.
# max_depth is passed to the constructor (instead of being set on the instance afterwards)
# so that the whole model configuration is visible in one place.
my_forest_1 = RandomForestClassifier(max_depth = 6, random_state = 1)
my_forest_1.fit(X1_train, y1_train)

# Importance of every feature for the prediction, as estimated by the fitted forest
features_importances_1 = my_forest_1.feature_importances_

# Feature indices sorted from most to least important
indices_1 = np.argsort(features_importances_1)[::-1]

# Print the best n features. Slicing (instead of indexing over range(n)) prints fewer
# rows when the feature matrix has fewer than n columns rather than raising an IndexError.
# NOTE(review): labels_of_features_1 is only assigned in a later cell of this notebook,
# so this cell raises a NameError on a fresh kernel - confirm whether feature_labels1 was meant.
n = 15
for rank, idx in enumerate(indices_1[:n], start = 1):
    print('{0:2d} {1:7s} {2:6.4f}'.format(rank, labels_of_features_1[idx],
                                          features_importances_1[idx]))
del rank, idx


#### Version 2: Dataset with rows dropped where Nan

In [None]:
# Fit a random forest on the version 2 training split and rank the features by importance.
# max_depth is passed to the constructor (instead of being set on the instance afterwards)
# so that the whole model configuration is visible in one place.
my_forest_2 = RandomForestClassifier(max_depth = 8, random_state = 1)
my_forest_2.fit(X2_train, y2_train)

# Importance of every feature for the prediction, as estimated by the fitted forest
features_importances_2 = my_forest_2.feature_importances_

# Feature indices sorted from most to least important
indices_2 = np.argsort(features_importances_2)[::-1]

# Print the best m features. Slicing (instead of indexing over range(m)) prints fewer
# rows when the feature matrix has fewer than m columns rather than raising an IndexError.
# NOTE(review): labels_of_features_2 is only assigned in a later cell of this notebook,
# so this cell raises a NameError on a fresh kernel - confirm whether feature_labels2 was meant.
m = 15
for rank, idx in enumerate(indices_2[:m], start = 1):
    print('{0:2d} {1:7s} {2:6.4f}'.format(rank, labels_of_features_2[idx],
                                          features_importances_2[idx]))
del rank, idx


#### Version 1: Imputed Dataset

In [None]:
# Number of features that can actually be shown (at most n, fewer if the model has fewer features)
top_1 = min(n, len(indices_1))

# Cumulative importance of the most important features.
# The cumulative sum must run over the importances sorted in descending order;
# summing the unsorted vector would not correspond to the bars drawn below.
feat_imp_1 = np.sort(features_importances_1)[::-1]
sum_feat_imp_1 = np.cumsum(feat_imp_1)[:top_1]

# Plot Feature Importance (both cumulative and individual)
plt.figure(figsize = (12, 8))
plt.title('Feature Importances: Version 1 - Imputed Dataset')
plt.bar(range(top_1), features_importances_1[indices_1[:top_1]], align = 'center')
plt.xticks(range(top_1), labels_of_features_1[indices_1[:top_1]], rotation = 90)
plt.xlim([-1, top_1])
plt.xlabel('Feature')
plt.ylabel('Relative Feature Importance')
plt.step(range(top_1), sum_feat_imp_1, where = 'mid', label = 'Cumulative Feature Importance')
# Without a legend the label given to plt.step is never displayed
plt.legend(loc = 'best')
plt.tight_layout();

#### Version 2: Dataset with rows dropped where Nan

In [None]:
# Number of features that can actually be shown (at most m, fewer if the model has fewer features)
top_2 = min(m, len(indices_2))

# Cumulative importance of the most important features.
# The cumulative sum must run over the importances sorted in descending order;
# summing the unsorted vector would not correspond to the bars drawn below.
feat_imp_2 = np.sort(features_importances_2)[::-1]
sum_feat_imp_2 = np.cumsum(feat_imp_2)[:top_2]

# Plot Feature Importance (both cumulative and individual)
plt.figure(figsize = (12, 8))
plt.title('Feature Importances: Version 2 - Dropnan Dataset')
plt.bar(range(top_2), features_importances_2[indices_2[:top_2]], align = 'center')
plt.xticks(range(top_2), labels_of_features_2[indices_2[:top_2]], rotation = 90)
plt.xlim([-1, top_2])
plt.xlabel('Feature')
plt.ylabel('Relative Feature Importance')
plt.step(range(top_2), sum_feat_imp_2, where = 'mid', label = 'Cumulative Feature Importance')
# Without a legend the label given to plt.step is never displayed
plt.legend(loc = 'best')
plt.tight_layout();

## Extract best 'n' / 'm' Variables

In [None]:
#from sklearn.feature_selection import SelectFromModel

#selected_1 = SelectFromModel(RandomForestClassifier(), threshold = 'median')
#selected_1.fit(X1, y1)
#n_features = selected_1.transform(X1).shape[1]
#X1.head()

# =====> AUTOMATED FEATURE EXTRACTION WOULD BE EXTREMELY COOL
# =====> SELECTFROMMODEL SELECTS FEATURES ACCORDING TO A THRESHOLD; FOR THE "BEST N FEATURES" SEE ITS max_features PARAMETER (sklearn >= 0.20), E.G. threshold = -np.inf, max_features = n

#### Version 1: Imputed Dataset

In [None]:
# Keep only the chosen best variables plus the response variable (NEXT_DAY_PREDICTION).
# The response stays in the last position because later cells take the last column as y.
selected_columns_1 = [
    'sprtrn', 'divyield', 'BID', 'PEG_trailing', 'pe_op_basic',
    'cash_lt', 'PEG_1yrforward', 'ASKHI', 'ptb', 'PEG_ltgforward',
    'NEXT_DAY_PREDICTION',
]
imputed_dataset_f = imputed_dataset[selected_columns_1]

# Store the reduced dataset
imputed_dataset_f.to_csv('Data/generated/imputed_dataset_ml_f.csv')

print("")
print('Imputed Dataset with chosen n features')
display(imputed_dataset_f.head())




#### Version 2: Dataset with rows dropped where Nan

In [None]:
# Extract only the chosen best variables + the response variable (NEXT_DAY_PREDICTION).
# The response stays in the last position because later cells take the last column as y.
# NOTE(review): the previous list named 'PEG_1yrforward' and 'ptb' twice, which produced
# duplicated columns; the duplicates are removed here. The tail of the list
# ('pe_op_basic' ... 'PEG_ltgforward') looks copied from the version 1 selection -
# confirm the intended feature set.
selected_columns_2 = [
    'pretret_earnat', 'divyield', 'roe', 'pcf', 'pe_op_dil', 'CAPEI',
    'PEG_1yrforward', 'ptb', 'BID', 'accrual', 'ps',
    'pe_op_basic', 'cash_lt', 'ASKHI', 'PEG_ltgforward',
    'NEXT_DAY_PREDICTION',
]
dropnan_dataset_f = dropnan_dataset[selected_columns_2]

# Written to its own file: the previous target 'imputed_dataset_ml_f.csv' is the
# version 1 output and was silently overwritten by this cell.
dropnan_dataset_f.to_csv('Data/generated/dropnan_dataset_ml_f.csv')

display(dropnan_dataset_f.head())


###   
# Prediction with Random Forest
###   
### Prediction
### for Version 1: Imputed Dataset
### for Version 2: Dataset with rows dropped where Nan
###   



## Feature Matrix and Response Vector with Selected Features


#### Version 1: Imputed Dataset

In [None]:
# X1 is the feature matrix: every column except the last one
X1 = imputed_dataset_f.iloc[:, :-1]

# y1 is the response vector: the last column
y1 = imputed_dataset_f.iloc[:, -1]

# Labels of the features, i.e. the column names of the feature matrix
labels_of_features_1 = X1.columns


#### Version 2: Dataset with rows dropped where Nan

In [None]:
# X2 is the feature matrix: every column except the last one
X2 = dropnan_dataset_f.iloc[:, :-1]

# y2 is the response vector: the last column
y2 = dropnan_dataset_f.iloc[:, -1]

# Labels of the features, i.e. the column names of the feature matrix
labels_of_features_2 = X2.columns

## Train - / Test - Split with Selected Features

#### Version 1: Imputed Dataset

In [None]:
# Stratified 80 / 20 train-test split on the selected features of version 1;
# the fixed random_state makes the split reproducible.
split_1 = train_test_split(X1, y1, test_size = 0.2, random_state = 0, stratify = y1)
X1_train, X1_test, y1_train, y1_test = split_1

# The share of '1' should be approximately the same in the training and the test response vector
for response_1 in (y1_train, y1_test):
    display(response_1.sum() / response_1.size)
del response_1

#### Version 2: Dataset with rows dropped where Nan

In [None]:
# Stratified 80 / 20 train-test split on the selected features of version 2;
# the fixed random_state makes the split reproducible.
split_2 = train_test_split(X2, y2, test_size = 0.2, random_state = 0, stratify = y2)
X2_train, X2_test, y2_train, y2_test = split_2

# The share of '1' should be approximately the same in the training and the test response vector
for response_2 in (y2_train, y2_test):
    display(response_2.sum() / response_2.size)
del response_2


## Standardize Variables

In [None]:
# Standardization with sklearn's StandardScaler. For each version the scaler is fitted on
# the training features only; the same fitted scaling is then applied to the test features.

# Version 1
standard_scaler_1 = preprocessing.StandardScaler()
X1_train = standard_scaler_1.fit_transform(X1_train)
X1_test = standard_scaler_1.transform(X1_test)

# Version 2
standard_scaler_2 = preprocessing.StandardScaler()
X2_train = standard_scaler_2.fit_transform(X2_train)
X2_test = standard_scaler_2.transform(X2_test)

## Prediction of Response Vector with Random Forest

#### Version 1: Imputed Dataset

In [None]:
# Fit the forest on the version 1 training split and predict the response for the
# test feature matrix; the result is the prediction vector.
my_forest_1 = RandomForestClassifier(max_depth = 8, random_state = 1)
my_forest_1.fit(X1_train, y1_train)
prediction_1 = my_forest_1.predict(X1_test)

# Show the first predictions next to the true test responses
print("")
print('Predicted response Vector (First 10)')
display(prediction_1[:10])

print("")
print('For comparison: Test-Response-Vector (First 10)')
display(np.array(y1_test[:10]))

#### Version 2: Dataset with rows dropped where Nan

In [None]:
# Fit the forest on the version 2 training split and predict the response for the
# test feature matrix; the result is the prediction vector.
my_forest_2 = RandomForestClassifier(max_depth = 8, random_state = 1)
my_forest_2.fit(X2_train, y2_train)
prediction_2 = my_forest_2.predict(X2_test)

# Show the first predictions next to the true test responses
print("")
print('Predicted response Vector (First 10)')
display(prediction_2[:10])

print("")
print('For comparison: Test-Response-Vector (First 10)')
display(np.array(y2_test[:10]))

# Cross-Validation of Prediction-Results

#### Version 1: Imputed Dataset

In [None]:
# Share of ones in the test response vector
ratio_ones_test_1 = y1_test.sum() / y1_test.size
print('Ratio of Ones (Test 1)   =  ' + str(ratio_ones_test_1))

# Just to be sure the ones are distributed roughly the same in test and train response vector, check this:
print('Ratio of Ones (Train 1)  =  ' + str(y1_train.sum() / y1_train.size))

# Share of ones among the predictions. Despite the label this is NOT an accuracy:
# it only tells how often the model predicts '1'.
ratio_ones_pred_1 = prediction_1.sum() / prediction_1.size
print('Score (Prediction 1)     =  ' + str(ratio_ones_pred_1))

# Difference between the predicted and the actual share of ones
print('Score Surplus Prediction over Guess = ' + str(ratio_ones_pred_1 - ratio_ones_test_1))

# Actual predictive performance: share of test observations predicted correctly
accuracy_1 = np.mean(prediction_1 == np.asarray(y1_test).ravel())
print('Accuracy (Prediction 1)  =  ' + str(accuracy_1))

# Baseline: always guessing the more frequent class.
# Assumes a binary 0 / 1 response, as the "ratio of ones" above already does - TODO confirm.
baseline_1 = max(ratio_ones_test_1, 1 - ratio_ones_test_1)
print('Accuracy Surplus over Majority-Class Guess = ' + str(accuracy_1 - baseline_1))


#### Version 2: Dataset with rows dropped where Nan

In [None]:
# Share of ones in the test response vector
ratio_ones_test_2 = y2_test.sum() / y2_test.size
print('Ratio of Ones (Test 2)   =  ' + str(ratio_ones_test_2))

# Just to be sure the ones are distributed roughly the same in test and train response vector, check this:
print('Ratio of Ones (Train 2)  =  ' + str(y2_train.sum() / y2_train.size))

# Share of ones among the predictions. Despite the label this is NOT an accuracy:
# it only tells how often the model predicts '1'.
ratio_ones_pred_2 = prediction_2.sum() / prediction_2.size
print('Score (Prediction 2)     =  ' + str(ratio_ones_pred_2))

# Difference between the predicted and the actual share of ones
print('Score Surplus Prediction over Guess = ' + str(ratio_ones_pred_2 - ratio_ones_test_2))

# Actual predictive performance: share of test observations predicted correctly
accuracy_2 = np.mean(prediction_2 == np.asarray(y2_test).ravel())
print('Accuracy (Prediction 2)  =  ' + str(accuracy_2))

# Baseline: always guessing the more frequent class.
# Assumes a binary 0 / 1 response, as the "ratio of ones" above already does - TODO confirm.
baseline_2 = max(ratio_ones_test_2, 1 - ratio_ones_test_2)
print('Accuracy Surplus over Majority-Class Guess = ' + str(accuracy_2 - baseline_2))


# Cross Validation: tbc