### Introduction to Machine Learning, UZH FS18, Group Project

### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch


#     
# II. Feature Selection with RandomForestClassifier

In [1]:
# hide unnecessary warnings (e.g. deprecation warnings of packages)
import warnings
warnings.filterwarnings('ignore')

# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.ensemble import RandomForestClassifier

# 1. Import Standardized Feature Matrix / Response Vector / Feature Labels    
#### -------------------------------------------------------------------------------------------------------------------------------------------------
### Dataset Version 1: only ratios as predictive features
### Dataset Version 2: ratios + seasonality dummy variables + other market data
####     
### => All datasets were preprocessed with the sklearn Imputer (all NaN values filled with the "median")
#### -------------------------------------------------------------------------------------------------------------------------------------------------


#### Version 1: only ratios as predictive features

In [None]:
# Load the train / test splits (feature matrix X1, response vector y1) and the
# extracted feature labels of dataset 1 (ratios only), then print a short overview.
# NOTE(review): the files are read with pandas defaults. If they were written with
# `index=True`, an extra 'Unnamed: 0' column will appear in the previews below -- confirm upstream.

# Train
X1_train = pd.read_csv('Data/generated_splits/X1_train.csv')
y1_train = pd.read_csv('Data/generated_splits/y1_train.csv')

# Test
X1_test = pd.read_csv('Data/generated_splits/X1_test.csv')
y1_test = pd.read_csv('Data/generated_splits/y1_test.csv')

# Bound to `feature_labels_1` (with underscore): this is the name every later cell refers to.
feature_labels_1 = pd.read_csv('Data/generated_splits/feature_labels_1.csv')

print('Shape of Feature Matrix X1_train = ' + str(X1_train.shape))
print("")
print('Feature Matrix X1_train')
display(X1_train.head())
print("")
print('Response Vector y1_train')
display(y1_train.head())
print("")

print('Shape of Feature Matrix X1_test = ' + str(X1_test.shape))
print("")
print('Feature Matrix X1_test')
display(X1_test.head())
print("")
print('Response Vector y1_test')
display(y1_test.head())
print("")

print('Type of feature_labels_1 = ' + str(type(feature_labels_1)))
print("")

# Check that the share of '1' is approximately the same in the training and test response vector
print('Ratio of "Ones" (Train) =  ' + str(y1_train.sum() / y1_train.size))
print('Ratio of "Ones" (Test)  =  ' + str(y1_test.sum() / y1_test.size))


#### Version 2: ratios + seasonality + other market data

In [None]:
# Load the train / test splits (feature matrix X2, response vector y2) and the
# extracted feature labels of dataset 2 (ratios + seasonality + market data),
# then print a short overview.
# NOTE(review): the files are read with pandas defaults. If they were written with
# `index=True`, an extra 'Unnamed: 0' column will appear in the previews below -- confirm upstream.

# Train
X2_train = pd.read_csv('Data/generated_splits/X2_train.csv')
y2_train = pd.read_csv('Data/generated_splits/y2_train.csv')

# Test
X2_test = pd.read_csv('Data/generated_splits/X2_test.csv')
y2_test = pd.read_csv('Data/generated_splits/y2_test.csv')

# NOTE(review): dataset 1 reads 'feature_labels_1.csv' (with underscore) while this
# path has no underscore -- confirm which spelling the split-generating notebook writes.
feature_labels_2 = pd.read_csv('Data/generated_splits/feature_labels2.csv')


print('Shape of Feature Matrix X2_train = ' + str(X2_train.shape))
print("")
print('Feature Matrix X2_train')
display(X2_train.head())
print("")
print('Response Vector y2_train')
display(y2_train.head())
print("")

print('Shape of Feature Matrix X2_test = ' + str(X2_test.shape))
print("")
print('Feature Matrix X2_test')
display(X2_test.head())
print("")
print('Response Vector y2_test')
display(y2_test.head())
print("")

print('Type of feature_labels_2 = ' + str(type(feature_labels_2)))
print("")

# Check that the share of '1' is approximately the same in the training and test response vector
print('Ratio of "Ones" (Train) =  ' + str(y2_train.sum() / y2_train.size))
print('Ratio of "Ones" (Test)  =  ' + str(y2_test.sum() / y2_test.size))



# 2. Feature Selection with Random Forest  

## 2.1.  Fit forest and show list of best 'n' / 'm' features according to explanatory value

#### Version 1: only ratios as predictive features

In [None]:
# Fit a depth-limited random forest on dataset 1 and rank its features by importance.
# max_depth is passed to the constructor (rather than patched onto the instance
# afterwards) so the complete model configuration is visible in one place.
forest_1 = RandomForestClassifier(max_depth = 6, random_state = 1)
forest_1.fit(X1_train, y1_train)

# Importance of each feature for the prediction (same order as the columns of X1_train)
feature_importances_1 = forest_1.feature_importances_

# Column positions sorted from most to least important
indices_1 = np.argsort(feature_importances_1)[::-1]

# Print the best n features. n is clamped so the loop cannot run past the number
# of available features.
n = min(15, len(indices_1))

# The labels are taken from the columns the forest was fitted on: they are aligned
# with feature_importances_ by construction, whereas `feature_labels_1` is a DataFrame
# read from CSV and cannot be indexed by integer position with plain `[...]`.
for rank, col_pos in enumerate(indices_1[:n], start = 1):
    print('{0:2d} {1:7s} {2:6.4f}'.format(rank, str(X1_train.columns[col_pos]),
                                          feature_importances_1[col_pos]))
# Only delete the loop variables if the loop actually ran (n may be 0)
if n > 0:
    del rank, col_pos


#### Version 2: ratios + seasonality + other market data

In [None]:
# Fit a depth-limited random forest on dataset 2 and rank its features by importance.
# max_depth is passed to the constructor (rather than patched onto the instance
# afterwards) so the complete model configuration is visible in one place.
forest_2 = RandomForestClassifier(max_depth = 6, random_state = 1)
forest_2.fit(X2_train, y2_train)

# Importance of each feature for the prediction (same order as the columns of X2_train)
feature_importances_2 = forest_2.feature_importances_

# Column positions sorted from most to least important
indices_2 = np.argsort(feature_importances_2)[::-1]

# Print the best m features. m is clamped so the loop cannot run past the number
# of available features.
m = min(15, len(indices_2))

# The labels are taken from the columns the forest was fitted on: they are aligned
# with feature_importances_ by construction, whereas `feature_labels_2` is a DataFrame
# read from CSV and cannot be indexed by integer position with plain `[...]`.
for rank, col_pos in enumerate(indices_2[:m], start = 1):
    print('{0:2d} {1:7s} {2:6.4f}'.format(rank, str(X2_train.columns[col_pos]),
                                          feature_importances_2[col_pos]))
# Only delete the loop variables if the loop actually ran (m may be 0)
if m > 0:
    del rank, col_pos


## 2.2.  Plots

#### Version 1: only ratios as predictive features

In [None]:
# Plot the n most important features of dataset 1: individual importance as bars,
# cumulative importance as a step line.

# Column positions of the n most important features (may be fewer than n if the
# dataset has fewer columns)
top_1 = indices_1[:n]
positions_1 = range(len(top_1))

# Cumulative sum of the n most important features. The cumsum must run over the
# importances sorted in descending order, otherwise the step line does not match the bars.
feat_imp_1 = np.sort(feature_importances_1)[::-1]
sum_feat_imp_1 = np.cumsum(feat_imp_1)[:n]

# Plot Feature Importance (both cumulative and individual)
plt.figure(figsize = (12, 8))
plt.title('Feature Importances: Version 1 (only ratios)')
plt.bar(positions_1, feature_importances_1[top_1], align = 'center')
# Tick labels come from the columns the forest was fitted on, so they are aligned
# with feature_importances_1 by construction.
plt.xticks(positions_1, X1_train.columns[top_1], rotation = 90)
plt.xlim([-1, len(top_1)])
plt.xlabel('Feature')
plt.ylabel('Relative Feature Importance')
plt.step(positions_1, sum_feat_imp_1, where = 'mid', label = 'Cumulative Feature Importance')
# Without a legend the label of the step line is never displayed
plt.legend(loc = 'best')
plt.tight_layout();

#### Version 2: ratios + seasonality + other market data

In [None]:
# Plot the m most important features of dataset 2: individual importance as bars,
# cumulative importance as a step line.

# Column positions of the m most important features (may be fewer than m if the
# dataset has fewer columns)
top_2 = indices_2[:m]
positions_2 = range(len(top_2))

# Cumulative sum of the m most important features. The cumsum must run over the
# importances sorted in descending order, otherwise the step line does not match the bars.
feat_imp_2 = np.sort(feature_importances_2)[::-1]
sum_feat_imp_2 = np.cumsum(feat_imp_2)[:m]

# Plot Feature Importance (both cumulative and individual)
plt.figure(figsize = (12, 8))
plt.title('Feature Importances: Version 2 (ratios + seasonality + market)')
plt.bar(positions_2, feature_importances_2[top_2], align = 'center')
# Tick labels come from the columns the forest was fitted on, so they are aligned
# with feature_importances_2 by construction.
plt.xticks(positions_2, X2_train.columns[top_2], rotation = 90)
plt.xlim([-1, len(top_2)])
plt.xlabel('Feature')
plt.ylabel('Relative Feature Importance')
plt.step(positions_2, sum_feat_imp_2, where = 'mid', label = 'Cumulative Feature Importance')
# Without a legend the label of the step line is never displayed
plt.legend(loc = 'best')
plt.tight_layout();

## 2.3.  Extract best 'n' / 'm' Variables of datasets and save


In [None]:
#from sklearn.feature_selection import SelectFromModel

#selected_1 = SelectFromModel(RandomForestClassifier(), threshold = 'median')
#selected_1.fit(X1, y1)
#n_features = selected_1.transform(X1).shape[1]
#X1.head()

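# =====> AUTOMATED FEATURE EXTRACTION WOULD BE EXTREMELY COOL
# =====> SELECTFROMMODEL SELECTS FEATURES ACCORDING TO A THRESHOLD BY DEFAULT, NOT THE "BEST N FEATURES"
# =====> NOTE: newer scikit-learn versions (>= 0.20) offer SelectFromModel(..., max_features = n, threshold = -np.inf)
# =====>       to keep exactly the n most important features -- check the installed version.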

#### Version 1: only ratios as predictive features

In [None]:
# Extract only the n best variables directly from the feature matrix into a new one.
# The columns are picked automatically from the importance ranking (indices_1 holds the
# column positions of X1_train sorted from most to least important) instead of a
# hand-maintained list of column names.
selected_columns_1 = X1_train.columns[indices_1[:n]]

X1_train_s = X1_train[selected_columns_1]
X1_test_s = X1_test[selected_columns_1]

# All columns of X1_train_s are features (there is no response column to strip off here)
feature_labels_1_s = X1_train_s.columns

# Save to csv
X1_train_s.to_csv('Data/generated_splits/X1_train_s.csv')
X1_test_s.to_csv('Data/generated_splits/X1_test_s.csv')

# A pandas Index has no to_csv method, so the labels are wrapped in a Series first.
# NOTE(review): written as an index column plus a 'feature' column -- confirm this
# matches the layout the downstream notebooks expect.
pd.Series(feature_labels_1_s, name = 'feature').to_csv('Data/generated_splits/feature_labels_1_s.csv',
                                                       header = True)

print('Feature Matrix with selected Features (Train)')
display(X1_train_s.head())
print("")
print('Feature Matrix with selected Features (Test)')
display(X1_test_s.head())
print("")

In [None]:
### Maybe unnecessary, but maybe necessary for more randomness (if split again after feature selection, IDK)
###

# Load necessary Dataset
#df_1 = pd.read_csv('Data/generated_datasets/df_1.csv', sep = ',')

# Extract only the n best variables + the response variable (NEXT_DAY_PREDICTION)
#df_1_selected = df_1[['_', '_', 'NEXT_DAY_PREDICTION']]

# Save to csv
#df_1_selected.to_csv('Data/generated_datasets/df_1_selected.csv')

#print("")
#print('Dataset 1 (only ratios) with chosen n features')
#display(df_1_selected.head())

In [None]:
# Extract labels of features
#feature_labels_1_s = df_1_selected.columns[:-1]

# X1_s is the feature matrix with selected features
#X1_s = df_1_selected.iloc[:, :-1]
#display(X1_s.head())

# y1_s is the response vector
#y1_s = df_1_selected.iloc[:, -1]

In [None]:
# Do the train - test- split
#X1_train_s, X1_test_s, y1_train_s, y1_test_s = train_test_split(X1_s, y1_s, test_size = 0.2, random_state = 0, 
                                                         #       stratify = y1_s)

# Check that the share of '1' is approximately the same in both the training and test response vector
#display(y1_train_s.sum() / y1_train_s.size)
#display(y1_test_s.sum() / y1_test_s.size)

#### Version 2: ratios + seasonality + other market data

In [None]:
# Extract only the m best variables directly from the feature matrix into a new one.
# The columns are picked automatically from the importance ranking (indices_2 holds the
# column positions of X2_train sorted from most to least important) instead of a
# hand-maintained list of column names.
selected_columns_2 = X2_train.columns[indices_2[:m]]

X2_train_s = X2_train[selected_columns_2]
X2_test_s = X2_test[selected_columns_2]

# All columns of X2_train_s are features (there is no response column to strip off here)
feature_labels_2_s = X2_train_s.columns

# Save to csv
X2_train_s.to_csv('Data/generated_splits/X2_train_s.csv')
X2_test_s.to_csv('Data/generated_splits/X2_test_s.csv')

# A pandas Index has no to_csv method, so the labels are wrapped in a Series first.
# NOTE(review): written as an index column plus a 'feature' column -- confirm this
# matches the layout the downstream notebooks expect.
pd.Series(feature_labels_2_s, name = 'feature').to_csv('Data/generated_splits/feature_labels_2_s.csv',
                                                       header = True)


print('Feature Matrix with selected Features (Train)')
display(X2_train_s.head())
print("")
print('Feature Matrix with selected Features (Test)')
display(X2_test_s.head())
print("")

In [None]:
### Maybe unnecessary, but maybe necessary for more randomness (if split again after feature selection, IDK)
###

# Load necessary Dataset
#df_2 = pd.read_csv('Data/generated_datasets/df_2.csv', sep = ',')

# Extract only the m best variables + the response variable (NEXT_DAY_PREDICTION)
#df_2_selected = df_2[['_', '_', 'NEXT_DAY_PREDICTION']]

# Save to csv
#df_2_selected.to_csv('Data/generated_datasets/df_2_selected.csv')

#print("")
#print('Dataset 2 (ratios + seasonality + market) with chosen m features')
#display(df_2_selected.head())

In [None]:
# Extract labels of features
#feature_labels_2_s = df_2_selected.columns[:-1]

# X2_s is the feature matrix with selected features
#X2_s = df_2_selected.iloc[:, :-1]
#display(X2_s.head())

# y2_s is the response vector
#y2_s = df_2_selected.iloc[:, -1]

In [None]:
# Do the train - test- split
#X2_train_s, X2_test_s, y2_train_s, y2_test_s = train_test_split(X2_s, y2_s, test_size = 0.2, random_state = 0, 
                                                         #       stratify = y2_s)

# Check that the share of '1' is approximately the same in both the training and test response vector
#display(y2_train_s.sum() / y2_train_s.size)
#display(y2_test_s.sum() / y2_test_s.size)