# Introduction to Machine Learning, UZH 2018, Group Project
### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch
##       
##    
# 4. Collection of all Pipes

###        
In this section we use the data we prepared in chapter 1.  



In [1]:
# hide unnecessary warnings ("depreciation" of packages etc.)
import warnings
warnings.filterwarnings('ignore')

# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import metrics
from sklearn import preprocessing
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
plt.style.use('seaborn-whitegrid')

###    
###    
# 4.1. PIPE 1

#### Feature Selection: In-Pipe with RandomForestClassifier or PCA
#### Scaling : In-Pipe with StandardScaler
#### Classification: SVM
#### Additional Classification: RandomForestClassifier, LogisticRegression
###      

## 4.1.1. DATA SETTINGS (P1)


### Choose the Dataset Version you want
##### Whole Feature Matrices (Features not pre-selected)
VERSION = 1; Feature Matrix with only ratios                                  
VERSION = 2;  Feature Matrix with ratios + saisonality + other market data



In [4]:
### Chose which dataset version you want the selection of features and the prediction to be based on 
VERSION = 1
"""
INSERT NUMBER 1 or 2
"""

# Defining sel_state variable for easier printing out    
if VERSION == 1: sel_version = 'Whole original Dataset with only the Ratios as predicive Features.'
elif VERSION == 2: sel_version = 'Whole original Dataset with Ratios + Seasonality + other Market Data as predictive Features.'
else: raise ValueError('VERSION must be either 1 or 2')

In [5]:
# print status
briefing_data='SUMMARY DATA SETTINGS:'+'\n'+'Selected Version ' + str(VERSION) + ': ' + str(sel_version)+'\n'
print(briefing_data)

SUMMARY DATA SETTINGS:
Selected Version 1: Whole original Dataset with only the Ratios as predicive Features.



##### Import necessary Data and impute

In [6]:
# Import Data
if VERSION == 1: 
# features not pre-selected, only ratios
    X = pd.read_csv('Data/generated_datasets/features_ratios_1.csv', sep=',', header=0)
    y = pd.read_csv('Data/generated_datasets/response_1.csv', sep=',', header=0)
elif VERSION == 2: 
# features not pre-selected, ratios + seasonality + market data
    X = pd.read_csv('Data/generated_datasets/features_additional_1.csv', sep=',', header=0)
    y = pd.read_csv('Data/generated_datasets/response_1.csv', sep=',', header=0)
else: raise ValueError('VERSION value must be either 1 or 2')   

In [7]:
# Train/test split, into 20% test size and 80% train size because it is a relatively small dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Use a median fill for train
imp = Imputer(missing_values=np.nan, strategy = 'median' , axis=0)
imputed_dataset = pd.DataFrame(imp.fit_transform(X_train))
imputed_dataset.columns = X_train.columns
imputed_dataset.index = X_train.index
X_train = imputed_dataset

# Use a median fill for the test set
imputed_dataset = pd.DataFrame(imp.fit_transform(X_test))
imputed_dataset.columns = X_test.columns
imputed_dataset.index = X_test.index
X_test = imputed_dataset

display(X_train.head(3), X_train.shape)
display(X_test.head(3), X_test.shape)
display(y_train.head(3), y_train.shape)
display(y_test.head(3), y_test.shape)

Unnamed: 0,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,ps,pcf,dpr,...,sale_nwc,rd_sale,adv_sale,staff_sale,accrual,ptb,PEG_trailing,divyield,PEG_1yrforward,PEG_ltgforward
1530,20.773,0.547,10.644,15.468,15.633,18.484,18.484,3.703,9.957,0.623,...,1.424,0.136,0.058,0.0,0.048,1.811,1.181,0.0354,13.678,5.963
1397,25.089,0.461,9.246,17.203,17.296,39.232,39.232,0.689,8.509,2.175,...,4.991,0.031,0.0,0.0,0.047,2.189,0.668,0.0398,1.521,5.702
2238,12.337,0.911,12.474,9.135,9.209,13.062,13.062,2.034,1.993,0.337,...,5.8105,0.0,0.024,0.291,0.038,1.076,3.048,0.0268,0.38,2.248


(2836, 71)

Unnamed: 0,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,ps,pcf,dpr,...,sale_nwc,rd_sale,adv_sale,staff_sale,accrual,ptb,PEG_trailing,divyield,PEG_1yrforward,PEG_ltgforward
817,20.983,0.254,8.585,16.127,16.224,16.224,16.224,3.318,10.484,0.287,...,6.768,0.031,0.006,0.0,0.084,4.189,1.022,0.0187,1.414,1.33
2592,21.663,0.235,13.619,16.311,16.489,18.555,18.555,2.379,7.709,0.199,...,7.169,0.0,0.0,0.195,0.039,4.281,1.147,0.0122,1.737,1.572
1475,16.327,0.513,6.716,9.52,9.52,15.681,15.55,3.293,8.49,0.792,...,1.286,0.162,0.054,0.0,0.077,2.198,1.022,0.0343,-0.973,6.149


(710, 71)

Unnamed: 0,0
1530,0
1397,1
2238,0


(2836, 1)

Unnamed: 0,0
817,1
2592,0
1475,1


(710, 1)

## 4.1.2. PIPING SETTINGS (P1)

In [8]:
# Set Kernel
set_kernel = 'linear'
"""
Select 'poly', 'linear' or 'rbf'
"""

# Choose In-Pipe Feature Selection Method
set_feature_selecter = 'RF'
"""
Select 'RF' for RandomForestClassifier 
Select 'PCA' for Principal Component Analysis
"""

# Choose Classifier
set_classifier = 'RF'
"""
Select 'LR' (for LogisticRegression) or 'RF' (for RandomForestClassifier)
"""

# Print result of settings
briefing_pipe='SUMMARY PIPE SETTINGS:'+'\n'+'Selected kernel: '+str(set_kernel)+'.'+'\n'+'Selected In-Pipe Feature Selecter: '+str(set_feature_selecter)+'.'+'\n'+'Additional Classifier: '+str(set_classifier)+'.'+'\n'
print(briefing_pipe)

SUMMARY PIPE SETTINGS:
Selected kernel: linear.
Selected In-Pipe Feature Selecter: RF.
Additional Classifier: RF.



## 4.1.3. Piping (P1)

In [9]:
# set conditions so according to choices under "settings", the methods are going to be used automatically

# For Loop defining paremeters for Feature Selecter
if set_feature_selecter == 'PCA': 
    feat_sel_function = PCA()
    fs_attr1  ='fs__n_components'
    fs_attr1_par = [10, 20, 30]
    fs_attr2 = 'fs__random_state'
    fs_attr2_par = [0, 10, 20]  

elif set_feature_selecter == 'RF': 
    feat_sel_function = SelectFromModel(estimator=RandomForestClassifier(), threshold = 'median')
    fs_attr1 = 'fs__estimator__n_estimators'
    fs_attr1_par = [10, 20, 30]
    fs_attr2 = 'fs__estimator__random_state'
    fs_attr2_par = [0, 10, 20]  
else: raise ValueError('feature_selecter must be either PCA,or RF.')

# For Loop defining paremeters for additional Classifier
if set_classifier == 'RF': 
    classifier_function = RandomForestClassifier()
    cl_attr1 = 'cl__n_estimators'
    cl_attr1_par = [10, 20, 30]
    cl_attr2 = 'cl__random_state'
    cl_attr2_par =  [0, 10, 20]
elif set_classifier == 'LR': 
    classifier_function = LogisticRegression()
    cl_attr1 = 'cl__C'
    cl_attr1_par = [1, 50, 100]
    cl_attr2 = 'cl__random_state'
    cl_attr2_par =  [0, 10, 20]
else: raise ValueError('feature_selecter must be either PCA or RandomForestClassifier')

#display(fs_attr1, fs_attr1_par)
#display(fs_attr2, fs_attr2_par)
#display(cl_attr1, cl_attr1_par)
#display(cl_attr2, cl_attr2_par)

# print settings
print(briefing_data+'\n'+briefing_pipe)

SUMMARY DATA SETTINGS:
Selected Version 1: Whole original Dataset with only the Ratios as predicive Features.

SUMMARY PIPE SETTINGS:
Selected kernel: linear.
Selected In-Pipe Feature Selecter: RF.
Additional Classifier: RF.



In [10]:
# Create pipeline object with standard scaler, chosen Feature Selecter and SVC estimator
# Standardscaler standardizes the input variables
pipe1_a = Pipeline([('scaler', StandardScaler()), 
                    ('fs', feat_sel_function), 
                  ('cl', SVC(kernel=set_kernel, random_state=0))])
pipe1_b = Pipeline([('scaler', StandardScaler()), 
                  ('fs', feat_sel_function), 
                  ('cl', classifier_function)])


# print used parameters
display(feat_sel_function, classifier_function)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        norm_order=1, prefit=False, threshold='median')

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [11]:
# Display scores from resulting pipes
score1_a = pipe1_a.fit(X_train, y_train).score(X_test, y_test)
score1_b = pipe1_b.fit(X_train, y_train).score(X_test, y_test)

print('Score Pipe 1a = '+str(score1_a)+'   vs.   Ratio of "UP" (Test) = '+str(y_test['0'].sum()/len(y_test['0'])))
print('Score Pipe 1b = '+str(score1_b)+'   vs.   Ratio of "UP" (Test) = '+str(y_test['0'].sum()/len(y_test['0'])))

Score Pipe 1a = 0.6281690140845071   vs.   Ratio of "UP" (Test) = 0.5704225352112676
Score Pipe 1b = 0.5140845070422535   vs.   Ratio of "UP" (Test) = 0.5704225352112676


## 4.1.4. Grid Search (P1)

In [12]:
# Define parameter grid

# First part: scaling with StandardScaler, 
# Feature Selection with selected method, 
# Classification with SVC with selected kernel

# Second part:  scaling with StandardScaler,
# Feature Selection with selected method, 
# Classification with selected 'additional classifier' method

param_grid1 = [{'scaler': [StandardScaler()],
               'fs': [feat_sel_function],
                fs_attr1: fs_attr1_par,
                fs_attr2: fs_attr2_par,                
               'cl': [SVC(kernel=set_kernel)],
               'cl__C': [10, 100]},
               {'scaler': [StandardScaler()],
                'fs': [feat_sel_function],
                fs_attr1: fs_attr1_par,
                fs_attr2: fs_attr2_par, 
                'cl': [classifier_function],
                cl_attr1: cl_attr1_par,
                cl_attr2: cl_attr2_par}]

# display chosen attributes
show_attributes = 'yes' # if not 'yes', assumed 'no'
if show_attributes == 'yes':
    print(str('Chosen Feature Selection Method: '+str(set_feature_selecter)+'\n'+fs_attr1+': '+str(fs_attr1_par)+'\n'+fs_attr2+': '+str(fs_attr2_par))+'\n')
    print(str('Chosen additional Classification Method (beside SVM): '+str(set_classifier)+'\n'+cl_attr1+': '+str(cl_attr1_par)+'\n'+cl_attr2+': '+str(cl_attr2_par)))

Chosen Feature Selection Method: RF
fs__estimator__n_estimators: [10, 20, 30]
fs__estimator__random_state: [0, 10, 20]

Chosen additional Classification Method (beside SVM): RF
cl__n_estimators: [10, 20, 30]
cl__random_state: [0, 10, 20]


In [None]:
# Grid Search for pipe1_a
### Paatieeence (this can take some minutes)
grid1_a = GridSearchCV(pipe1_a, param_grid1, cv=5, n_jobs=-1)
grid1_a.fit(X_train, y_train)

In [None]:
# Print results
print('Best CV accuracy: {:.2f}'.format(grid1_a.best_score_)+'\n')
print('Test score:       {:.2f}'.format(grid1_a.score(X_test, y_test))+'\n')
print('Best parameters: {}'.format(grid1_a.best_params_)+'\n')

In [14]:
# Grid Search for pipe1_b
### Paatieeence (this can take some minutes)
grid1_b = GridSearchCV(pipe1_b, param_grid1, cv=5, n_jobs=-1)
grid1_b.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('fs', SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_i...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'fs': [SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0,...1,
          verbose=0, warm_start=False)], 'cl__C': [1, 50, 100], 'cl__random_state': [0, 10, 20]}],
       pre_dispatch='2*n_jobs', refit=True, return_tr

In [15]:
# Print results
print('Best CV accuracy: {:.2f}'.format(grid1_b.best_score_)+'\n')
print('Test score:       {:.2f}'.format(grid1_b.score(X_test, y_test))+'\n')
print('Best parameters: {}'.format(grid1_b.best_params_))

Best CV accuracy: 0.63

Test score:       0.63

Best parameters: {'cl': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'cl__C': 100, 'cl__random_state': 0, 'fs': SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=20, verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold='median'), 'fs__estimator__n_estimators': 10, 'fs__estimator__random_state': 20, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=

###    
###     
###     
# 4.2. PIPE 2
#### Feature Selection:  before Pipe with RandomForestClassifier or PCA (Chapter 2.A and 2.B)
#### Scaling : In-Pipe with StandardScaler
#### Classification: SVM
#### additional Classification: RandomForestClassifier, LogisticRegression
###      

## 4.2.1. DATA SETTINGS (P2)


### Choose the Dataset Version you want
##### Reduced Feature Matrix (Features pre-selected)
VERSION = 1.1; Reduced Feature Matrix with only ratios                                  
VERSION = 2.1;  Reduced Feature Matrix with ratios + saisonality + other market data

In [16]:
### Chose which dataset version you want the selection of features and the prediction to be based on 
VERSION = 2.1
"""
INSERT NUMBER 1 or 2
"""

# Defining sel_state variable for easier printing out    
if VERSION == 1.1: sel_version = 'Reduced Dataset with only the Ratios as predicive Features.'
elif VERSION == 2.1: sel_version = 'Reduced Dataset with Ratios + Seasonality + other Market Data as predictive Features.'
else: raise ValueError('VERSION must be either 1.1 or 2.1')

### Choose with which method you want to have the features been pre-selected /reduced

##### You have the choice between:
SELECTION  = RF ; Features pre-selected with Random Forest Classifier                                                           
SELECTION = PCA; Features pre-selected with Principal Component Analysis (PCA)                                         

In [17]:
### Choose whether you want the datasets with features selected with RF or PCA or the original file
SELECTION = 'RF'
"""
INSERT WISHED METHOD 'RF', 'PCA'
"""



# This is the control loop. If something has been chosen wrong, it returns an error with explanation.
if SELECTION is not 'RF' and SELECTION is not 'PCA': raise ValueError('SELECTION must be set as either RF or PCA.')
# Defining of sel_feat (Selected Feature Selection Method) variable and briefing for later.   
if SELECTION == 'RF':
    sel_feat = 'Random Forest (RF)'
    briefing = ('You chose dataset VERSION '+str(VERSION)+' and SELECTION method '+str(SELECTION)+'.'+'\n'+'Features therefore pre-selected with '+str(sel_feat)+'.')
elif SELECTION == 'PCA':
    sel_feat = 'Principal Component Analysis (PCA)'
    briefing = ('You chose dataset VERSION '+str(VERSION)+' and SELECTION method '+str(SELECTION)+'.'+'\n'+'Features therefore pre-selected with '+str(sel_feat)+'.')
else: raise ValueError('SELECTION must be chosen as either RF or PCA')
print('You chose SELECTION method '+str(sel_feat)+'.'+'\n')
#print(sel_feat)

You chose SELECTION method Random Forest (RF).



In [18]:
# print status
briefing_data='SUMMARY DATA SETTINGS:'+'\n'+'Selected Version ' + str(VERSION) + ': ' + str(sel_version)+'\n'+'Pre-Selected with '+str(sel_feat)+'.'+'\n'    
print(briefing_data)

SUMMARY DATA SETTINGS:
Selected Version 2.1: Reduced Dataset with Ratios + Seasonality + other Market Data as predictive Features.
Pre-Selected with Random Forest (RF).



##### Import necessary Data. Imputing not necessary because already done (Data Importet as Results from Chapter 2.A and 2.B)

In [19]:
# Import Data
if VERSION == 1.1: 
    if SELECTION == 'RF':
        X_train = pd.read_csv('Data/generated_splits/features_selected_randomforest/X1_train_f.csv', sep=',', header=0)
        y_train = pd.read_csv('Data/generated_splits/features_selected_randomforest/y1_train_f.csv', sep=',', header=0)
        X_test = pd.read_csv('Data/generated_splits/features_selected_randomforest/X1_test_f.csv', sep=',', header=0)
        y_test = pd.read_csv('Data/generated_splits/features_selected_randomforest/y1_test_f.csv', sep=',', header=0)
    if SELECTION == 'PCA':
        X_train = pd.read_csv('Data/generated_splits/features_selected_pca/X1_train_p.csv', sep=',', header=0)
        y_train = pd.read_csv('Data/generated_splits/features_selected_pca/y1_train_p.csv', sep=',', header=0)
        X_test = pd.read_csv('Data/generated_splits/features_selected_pca/X1_test_p.csv', sep=',', header=0)
        y_test = pd.read_csv('Data/generated_splits/features_selected_pca/y1_test_p.csv', sep=',', header=0)
elif VERSION == 2.1: 
    if SELECTION == 'RF':
        X_train = pd.read_csv('Data/generated_splits/features_selected_randomforest/X2_train_f.csv', sep=',', header=0)
        y_train = pd.read_csv('Data/generated_splits/features_selected_randomforest/y2_train_f.csv', sep=',', header=0)
        X_test = pd.read_csv('Data/generated_splits/features_selected_randomforest/X2_test_f.csv', sep=',', header=0)
        y_test = pd.read_csv('Data/generated_splits/features_selected_randomforest/y2_test_f.csv', sep=',', header=0)
    if SELECTION == 'PCA':
        X_train = pd.read_csv('Data/generated_splits/features_selected_pca/X2_train_p.csv', sep=',', header=0)
        y_train = pd.read_csv('Data/generated_splits/features_selected_pca/y2_train_p.csv', sep=',', header=0)
        X_test = pd.read_csv('Data/generated_splits/features_selected_pca/X2_test_p.csv', sep=',', header=0)
        y_test = pd.read_csv('Data/generated_splits/features_selected_pca/y2_test_p.csv', sep=',', header=0)
else: raise ValueError('VERSION value must be either 1.1 or 2.1, SELECTION must be either "RF" or "PCA".')   

In [21]:
display(X_train.head(3), X_train.shape)
display(X_test.head(3), X_test.shape)
display(y_train.head(3), y_train.shape)
display(y_test.head(3), y_test.shape)

Unnamed: 0,ewretd,ewretx,vwretx,vwretd,sprtrn,SHRENDDT,ALTPRCDT,pe_op_dil,CAPEI,pe_op_basic,VOL,RETX,RET,divyield,pe_inc,PEG_ltgforward,ps,SEASON_07,pcf,PEG_1yrforward
0,0.033697,0.032156,0.038183,0.040186,0.037655,20140929.0,20140829.0,15.633,20.773,15.468,4611190.0,0.024042,0.024042,0.0354,18.484,5.963,3.703,0.0,9.957,13.678
1,-0.013012,-0.015631,-0.017055,-0.015039,-0.014999,20130730.0,20130628.0,17.296,25.089,17.203,1296447.0,-0.066454,-0.057168,0.0398,39.232,5.702,0.689,0.0,8.509,1.521
2,0.044257,0.042687,0.043937,0.046173,0.043117,20140330.0,20140228.0,9.209,12.337,9.135,3473222.0,0.026373,0.026373,0.0268,13.062,2.248,2.034,0.0,1.993,0.38


(2836, 20)

Unnamed: 0,ewretd,ewretx,vwretx,vwretd,sprtrn,SHRENDDT,ALTPRCDT,pe_op_dil,CAPEI,pe_op_basic,VOL,RETX,RET,divyield,pe_inc,PEG_ltgforward,ps,SEASON_07,pcf,PEG_1yrforward
0,-0.044988,-0.046891,-0.026823,-0.025129,-0.015514,20141009.0,20140930.0,16.224,20.983,16.127,15283673.0,-0.017073,-0.017073,0.0187,16.224,1.33,3.318,0.0,10.484,1.414
1,0.051828,0.049754,0.035682,0.037477,0.029749,20131030.0,20130930.0,16.489,21.663,16.311,746229.0,0.050202,0.050202,0.0122,18.555,1.572,2.379,0.0,7.709,1.737
2,-0.011388,-0.012292,-0.038109,-0.037096,-0.036974,20100225.0,20100129.0,9.52,16.327,9.52,10148052.0,0.025838,0.025838,0.0343,15.55,6.149,3.293,0.0,8.49,-0.973


(710, 20)

Unnamed: 0,0
0,0
1,1
2,0


(2836, 1)

Unnamed: 0,0
0,1
1,0
2,1


(710, 1)

## 4.2.2. PIPING SETTINGS (P2)

In [22]:
# Set Kernel
set_kernel = 'rbf'
"""
Select 'poly', 'linear' or 'rbf'
"""

# Choose In-Pipe Feature Selection Method
set_feature_selecter = 'RF'
"""
Select 'RF' (for RandomForestClassifier) or 'PCA' (for Principal Component Analysis)
"""

# Choose Classifier
set_classifier = 'LR'
"""
Select 'LR' (for LogisticRegression) or 'RF' (for RandomForestClassifier)
"""

# Print result of settings
briefing_pipe='SUMMARY PIPE SETTINGS:'+'\n'+'Selected kernel: '+str(set_kernel)+'.'+'\n'+'Selected In-Pipe Feature Selecter: '+str(set_feature_selecter)+'.'+'\n'+'Additional Classifier: '+str(set_classifier)+'.'+'\n'
print(briefing_pipe)

SUMMARY PIPE SETTINGS:
Selected kernel: rbf.
Selected In-Pipe Feature Selecter: RF.
Additional Classifier: LR.



## 4.2.3. Piping (P2)

In [23]:
# set conditions so according to choices under "settings", the methods are going to be used automatically


# For Loop defining paremeters for additional Classifier
if set_classifier == 'RF': 
    classifier_function = RandomForestClassifier()
    cl_attr1 = 'cl__n_estimators'
    cl_attr1_par = [10, 20, 30]
    cl_attr2 = 'cl__random_state'
    cl_attr2_par =  [0, 10, 20]
elif set_classifier == 'LR': 
    classifier_function = LogisticRegression()
    cl_attr1 = 'cl__C'
    cl_attr1_par = [1, 50, 100]
    cl_attr2 = 'cl__random_state'
    cl_attr2_par =  [0, 10, 20]
else: raise ValueError('feature_selecter must be either PCA or RandomForestClassifier')

#display(fs_attr1, fs_attr1_par)
#display(fs_attr2, fs_attr2_par)
#display(cl_attr1, cl_attr1_par)
#display(cl_attr2, cl_attr2_par)

# print settings
print(briefing_data+'\n'+briefing_pipe)

SUMMARY DATA SETTINGS:
Selected Version 2.1: Reduced Dataset with Ratios + Seasonality + other Market Data as predictive Features.
Pre-Selected with Random Forest (RF).

SUMMARY PIPE SETTINGS:
Selected kernel: rbf.
Selected In-Pipe Feature Selecter: RF.
Additional Classifier: LR.



In [27]:
# Create pipeline object with standard scaler, chosen Feature Selecter and SVC estimator
# Standardscaler standardizes the input variables
pipe2_a = Pipeline([('scaler', StandardScaler()), ('cl', SVC(kernel=set_kernel, random_state=0))])
pipe2_b = Pipeline([('scaler', StandardScaler()), ('cl', classifier_function)])


# print used parameters
display(feat_sel_function, classifier_function)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=20, verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold='median')

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
# Display scores from resulting pipes
score2_a = pipe2_a.fit(X_train, y_train).score(X_test, y_test)
score2_b = pipe2_b.fit(X_train, y_train).score(X_test, y_test)

print('Score Pipe 2a = '+str(score2_a)+'   vs.   Ratio of "UP" (Test) = '+str(y_test['0'].sum()/len(y_test['0'])))
print('Score Pipe 2b = '+str(score2_b)+'   vs.   Ratio of "UP" (Test) = '+str(y_test['0'].sum()/len(y_test['0'])))

Score Pipe 2a = 0.6676056338028169   vs.   Ratio of "UP" (Test) = 0.5704225352112676
Score Pipe 2b = 0.6197183098591549   vs.   Ratio of "UP" (Test) = 0.5704225352112676


## 4.2.4. Grid Search (P2)

In [29]:
# Define parameter grid

# First part: scaling with StandardScaler, 
# Feature Selection with selected method, 
# Classification with SVC with selected kernel

# Second part:  scaling with StandardScaler,
# Feature Selection with selected method, 
# Classification with selected 'additional classifier' method

param_grid2 = [{'scaler': [StandardScaler()],                
               'cl': [SVC(kernel=set_kernel)],
               'cl__C': [10, 100]},
               {'scaler': [StandardScaler()],
                'cl': [classifier_function],
                cl_attr1: cl_attr1_par,
                cl_attr2: cl_attr2_par}]

# display chosen attributes
show_attributes = 'yes' # if not 'yes', assumed 'no'
if show_attributes == 'yes':
    print(str('Chosen Feature Selection Method: '+str(set_feature_selecter)+'\n'+fs_attr1+': '+str(fs_attr1_par)+'\n'+fs_attr2+': '+str(fs_attr2_par))+'\n')
    print(str('Chosen additional Classification Method (beside SVM): '+str(set_classifier)+'\n'+cl_attr1+': '+str(cl_attr1_par)+'\n'+cl_attr2+': '+str(cl_attr2_par)))

Chosen Feature Selection Method: RF
fs__estimator__n_estimators: [10, 20, 30]
fs__estimator__random_state: [0, 10, 20]

Chosen additional Classification Method (beside SVM): LR
cl__C: [1, 50, 100]
cl__random_state: [0, 10, 20]


In [30]:
# Grid Search for pipe_2a
### Paatieeence (this can take some minutes)
grid2_a = GridSearchCV(pipe2_a, param_grid2, cv=5, n_jobs=-1)
grid2_a.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('cl', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'cl': [SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, v...1,
          verbose=0, warm_start=False)], 'cl__C': [1, 50, 100], 'cl__random_state': [0, 10, 20]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [31]:
# Print results
print('Best CV accuracy: {:.2f}'.format(grid2_a.best_score_))
print('Test score:       {:.2f}'.format(grid2_a.score(X_test, y_test)))
print('Best parameters: {}'.format(grid2_a.best_params_))

Best CV accuracy: 0.64
Test score:       0.67
Best parameters: {'cl': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), 'cl__C': 10, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [33]:
# Grid Search for pipe2_b
### Paatieeence (this can take some minutes)
grid2_b = GridSearchCV(pipe2_b, param_grid2, cv=5, n_jobs=-1)
grid2_b.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('cl', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'cl': [SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, v...1,
          verbose=0, warm_start=False)], 'cl__C': [1, 50, 100], 'cl__random_state': [0, 10, 20]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [34]:
# Print results
print('Best CV accuracy: {:.2f}'.format(grid2_b.best_score_)+'\n')
print('Test score:       {:.2f}'.format(grid2_b.score(X_test, y_test))+'\n')
print('Best parameters: {}'.format(grid2_b.best_params_))

Best CV accuracy: 0.64

Test score:       0.67

Best parameters: {'cl': SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), 'cl__C': 10, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


###    
###     
###     
# 4.3. PIPE 3
#### Feature Selection:  NONE
#### Scaling : In-Pipe with StandardScaler
#### Classification: SVM
#### additional Classification: RandomForestClassifier, LogisticRegression
###      

## 4.3.1. DATA SETTINGS (P3)


### Choose the Dataset Version you want
##### Whole Feature Matrices (Features not pre-selected)
VERSION = 1; Feature Matrix with only ratios                                  
VERSION = 2;  Feature Matrix with ratios + saisonality + other market data



In [43]:
### Chose which dataset version you want the selection of features and the prediction to be based on 
VERSION = 2
"""
INSERT NUMBER 1 or 2
"""

# Defining sel_state variable for easier printing out    
if VERSION == 1: sel_version = 'Whole original Dataset with only the Ratios as predicive Features.'
elif VERSION == 2: sel_version = 'Whole original Dataset with Ratios + Seasonality + other Market Data as predictive Features.'
else: raise ValueError('VERSION must be either 1 or 2')

In [44]:
# print status
briefing_data='SUMMARY DATA SETTINGS:'+'\n'+'Selected Version ' + str(VERSION) + ': ' + str(sel_version)+'\n'
print(briefing_data)

SUMMARY DATA SETTINGS:
Selected Version 2: Whole original Dataset with Ratios + Seasonality + other Market Data as predictive Features.



##### Import necessary Data and impute

In [45]:
# Import Data
if VERSION == 1: 
# features not pre-selected, only ratios
    X = pd.read_csv('Data/generated_datasets/features_ratios_1.csv', sep=',', header=0)
    y = pd.read_csv('Data/generated_datasets/response_1.csv', sep=',', header=0)
elif VERSION == 2: 
# features not pre-selected, ratios + seasonality + market data
    X = pd.read_csv('Data/generated_datasets/features_additional_1.csv', sep=',', header=0)
    y = pd.read_csv('Data/generated_datasets/response_1.csv', sep=',', header=0)
else: raise ValueError('VERSION value must be either 1 or 2')   

In [46]:
# Train/test split, into 20% test size and 80% train size because it is a relatively small dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Use a median fill for train
imp = Imputer(missing_values=np.nan, strategy = 'median' , axis=0)
imputed_dataset = pd.DataFrame(imp.fit_transform(X_train))
imputed_dataset.columns = X_train.columns
imputed_dataset.index = X_train.index
X_train = imputed_dataset

# Use a median fill for the test set
imputed_dataset = pd.DataFrame(imp.fit_transform(X_test))
imputed_dataset.columns = X_test.columns
imputed_dataset.index = X_test.index
X_test = imputed_dataset

display(X_train.head(3),X_train.shape)
display(X_test.head(3), X_test.shape)
display(y_train.head(3), y_train.shape)
display(y_test.head(3), y_test.shape)

Unnamed: 0,SHRFLG,SHRENDDT,BIDLO,ASKHI,PRC,VOL,RET,BID,ASK,SHROUT,...,CUSIP_65410610,CUSIP_71708110,CUSIP_74271810,CUSIP_88579Y10,CUSIP_89417E10,CUSIP_91301710,CUSIP_91324P10,CUSIP_92343V10,CUSIP_92826C83,CUSIP_93114210
1530,0.0,20140929.0,28.04,29.49,29.39,4611190.0,0.024042,29.4,29.41,6340863.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1397,0.0,20130730.0,32.17,34.65,32.17,1296447.0,-0.057168,32.16,32.17,1209589.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2238,0.0,20140330.0,54.31,58.49,56.82,3473222.0,0.026373,56.83,56.84,3786825.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(2836, 181)

Unnamed: 0,SHRFLG,SHRENDDT,BIDLO,ASKHI,PRC,VOL,RET,BID,ASK,SHROUT,...,CUSIP_65410610,CUSIP_71708110,CUSIP_74271810,CUSIP_88579Y10,CUSIP_89417E10,CUSIP_91301710,CUSIP_91324P10,CUSIP_92343V10,CUSIP_92826C83,CUSIP_93114210
817,0.0,20141009.0,97.87,103.3,100.75,15283673.0,-0.017073,100.75,100.76,5866161.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2592,0.0,20131030.0,72.43,77.64,75.52,746229.0,0.050202,75.53,75.54,1078864.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1475,0.0,20100225.0,18.53,20.0,18.66,10148052.0,0.025838,18.66,18.67,8069536.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(710, 181)

Unnamed: 0,0
1530,0
1397,1
2238,0


(2836, 1)

Unnamed: 0,0
817,1
2592,0
1475,1


(710, 1)

## 4.3.2. PIPING SETTINGS (P3)

In [47]:
# Set Kernel
set_kernel = 'linear'
"""
Select 'poly', 'linear' or 'rbf'
"""


# Choose Classifier
set_classifier = 'LR'
"""
Select 'LR' (for LogisticRegression) or 'RF' (for RandomForestClassifier)
"""

# Print result of settings
briefing_pipe='SUMMARY PIPE SETTINGS:'+'\n'+'Selected kernel: '+str(set_kernel)+'.'+'\n'+'Additional Classifier: '+str(set_classifier)+'.'+'\n'
print(briefing_pipe)

SUMMARY PIPE SETTINGS:
Selected kernel: linear.
Additional Classifier: LR.



## 4.3.3. Piping (P3)

In [48]:
# set conditions so according to choices under "settings", the methods are going to be used automatically

# For Loop defining paremeters for additional Classifier
if set_classifier == 'RF': 
    classifier_function = RandomForestClassifier()
    cl_attr1 = 'cl__n_estimators'
    cl_attr1_par = [10, 20, 30]
    cl_attr2 = 'cl__random_state'
    cl_attr2_par =  [0, 10, 20]
elif set_classifier == 'LR': 
    classifier_function = LogisticRegression()
    cl_attr1 = 'cl__C'
    cl_attr1_par = [1, 50, 100]
    cl_attr2 = 'cl__random_state'
    cl_attr2_par =  [0, 10, 20]
else: raise ValueError('feature_selecter must be either PCA or RandomForestClassifier')

#display(fs_attr1, fs_attr1_par)
#display(fs_attr2, fs_attr2_par)
#display(cl_attr1, cl_attr1_par)
#display(cl_attr2, cl_attr2_par)

# print settings
print(briefing_data+'\n'+briefing_pipe)

SUMMARY DATA SETTINGS:
Selected Version 2: Whole original Dataset with Ratios + Seasonality + other Market Data as predictive Features.

SUMMARY PIPE SETTINGS:
Selected kernel: linear.
Additional Classifier: LR.



In [49]:
# Create pipeline object with standard scaler, chosen Feature Selecter and SVC estimator
# Standardscaler standardizes the input variables
pipe3_a = Pipeline([('scaler', StandardScaler()), ('cl', SVC(kernel=set_kernel, random_state=0))])
pipe3_b = Pipeline([('scaler', StandardScaler()), ('cl', classifier_function)])


# print used parameters
display(feat_sel_function, classifier_function)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=20, verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold='median')

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
# Display scores from resulting pipes
score3_a = pipe3_a.fit(X_train, y_train).score(X_test, y_test)
score3_b = pipe3_b.fit(X_train, y_train).score(X_test, y_test)

print('Score Pipe 3a = '+str(score2_a)+'   vs.   Ratio of "UP" (Test) = '+str(y_test['0'].sum()/len(y_test['0'])))
print('Score Pipe 3b = '+str(score2_b)+'   vs.   Ratio of "UP" (Test) = '+str(y_test['0'].sum()/len(y_test['0'])))

Score Pipe 3a = 0.6676056338028169   vs.   Ratio of "UP" (Test) = 0.5704225352112676
Score Pipe 3b = 0.6197183098591549   vs.   Ratio of "UP" (Test) = 0.5704225352112676


## 4.3.4. Grid Search (P3)

In [None]:
# Define parameter grid

# First part: scaling with StandardScaler, 
# Feature Selection with selected method, 
# Classification with SVC with selected kernel

# Second part:  scaling with StandardScaler,
# Feature Selection with selected method, 
# Classification with selected 'additional classifier' method

param_grid3 = [{'scaler': [StandardScaler()],                
               'cl': [SVC(kernel=set_kernel)],
               'cl__C': [10, 100]},
               {'scaler': [StandardScaler()], 
                'cl': [classifier_function],
                cl_attr1: cl_attr1_par,
                cl_attr2: cl_attr2_par}]

# display chosen attributes
show_attributes = 'yes' # if not 'yes', assumed 'no'
if show_attributes == 'yes':
    print(str('Chosen additional Classification Method (beside SVM): '+str(set_classifier)+'\n'+cl_attr1+': '+str(cl_attr1_par)+'\n'+cl_attr2+': '+str(cl_attr2_par)))

In [54]:
# Grid Search for pipe3_a
### Paatieeence (this can take some minutes)
grid3_a = GridSearchCV(pipe3_a, param_grid3, cv=5, n_jobs=-1)
grid3_a.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Print results
print('Best CV accuracy: {:.2f}'.format(grid3_a.best_score_)+'\n')
print('Test score:       {:.2f}'.format(grid3_a.score(X_test, y_test))+'\n')
print('Best parameters: {}'.format(grid3_a.best_params_))

In [None]:
# Grid Search for pipe3_b
### Paatieeence (this can take some minutes)
grid3_b = GridSearchCV(pipe3_b, param_grid3, cv=5, n_jobs=-1)
grid3_b.fit(X_train, y_train)

In [None]:
# Print results
print('Best CV accuracy: {:.2f}'.format(grid3_b.best_score_)+'\n')
print('Test score:       {:.2f}'.format(grid3_b.score(X_test, y_test))+'\n')
print('Best parameters: {}'.format(grid3_b.best_params_))