# Introduction to Machine Learning, UZH 2018, Group Project
### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch
##       
##    
# 4. Collection of all Pipes

###        
In this section we use the data we prepared in chapter 1.  



In [1]:
# hide unnecessary warnings ("deprecation" of packages etc.)
import warnings
warnings.filterwarnings('ignore')

# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import metrics
from sklearn import preprocessing
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.svm import SVC
plt.style.use('seaborn-whitegrid')

###    
###    
## 4.1. Pipe 1 (P1)

#### Feature Selection: In-Pipe with RandomForestClassifier or PCA
#### Scaling : In-Pipe with StandardScaler
#### Classification: SVM
#### Additional Classification: RandomForestClassifier, LogisticRegression
###      

### 4.1.0.  (i) DATA SETTINGS (P1)


### Choose the Dataset Version you want
##### Whole Feature Matrices (Features not pre-selected)
VERSION = 1: Feature Matrix with only ratios  
VERSION = 2: Feature Matrix with ratios + seasonality + other market data



In [2]:
# Choose which dataset version the feature selection and the prediction are based on.
# Valid values: 1 or 2.
VERSION = 1

# Human-readable description of the selected version, used when printing summaries.
# Any value other than 1 or 2 is rejected before the description is set.
if VERSION not in (1, 2):
    raise ValueError('VERSION must be either 1 or 2')
sel_version = (
    'Whole original Dataset with only the Ratios as predicive Features.'
    if VERSION == 1
    else 'Whole original Dataset with Ratios + Seasonality + other Market Data as predictive Features.'
)

In [3]:
# Assemble the summary of the data settings once (it is reused in later cells) and print it
briefing_data = 'SUMMARY DATA SETTINGS:\nSelected Version {}: {}\n'.format(VERSION, sel_version)
print(briefing_data)

SUMMARY DATA SETTINGS:
Selected Version 1: Whole original Dataset with only the Ratios as predicive Features.



##### Import necessary Data and impute

In [4]:
# Import Data
# Only the feature matrix depends on VERSION; the response file is the same for both versions.
if VERSION not in (1, 2):
    raise ValueError('VERSION value must be either 1 or 2')

X = pd.read_csv(
    # VERSION 1: features not pre-selected, only ratios
    'Data/generated_datasets/features_ratios_1.csv' if VERSION == 1
    # VERSION 2: features not pre-selected, ratios + seasonality + market data
    else 'Data/generated_datasets/features_additional_1.csv',
    sep=',',
    header=0,
)
y = pd.read_csv('Data/generated_datasets/response_1.csv', sep=',', header=0)

In [5]:
# Train/test split, into 20% test size and 80% train size because it is a relatively small dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Median imputation: the column medians are learned from the training set only.
imp = Imputer(missing_values=np.nan, strategy='median', axis=0)
X_train = pd.DataFrame(
    imp.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

# Fill the test set with the medians learned on the training set.
# Only transform() is called here: re-fitting the imputer on the test set would
# leak test-set statistics into the preprocessing and make train and test
# inconsistently imputed.
X_test = pd.DataFrame(
    imp.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Quick visual check of the first rows of every split
display(X_train.head(3))
display(X_test.head(3))
display(y_train.head(3))
display(y_test.head(3))

Unnamed: 0,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,ps,pcf,dpr,...,sale_nwc,rd_sale,adv_sale,staff_sale,accrual,ptb,PEG_trailing,divyield,PEG_1yrforward,PEG_ltgforward
1530,20.773,0.547,10.644,15.468,15.633,18.484,18.484,3.703,9.957,0.623,...,1.424,0.136,0.058,0.0,0.048,1.811,1.181,0.0354,13.678,5.963
1397,25.089,0.461,9.246,17.203,17.296,39.232,39.232,0.689,8.509,2.175,...,4.991,0.031,0.0,0.0,0.047,2.189,0.668,0.0398,1.521,5.702
2238,12.337,0.911,12.474,9.135,9.209,13.062,13.062,2.034,1.993,0.337,...,5.8105,0.0,0.024,0.291,0.038,1.076,3.048,0.0268,0.38,2.248


Unnamed: 0,CAPEI,bm,evm,pe_op_basic,pe_op_dil,pe_exi,pe_inc,ps,pcf,dpr,...,sale_nwc,rd_sale,adv_sale,staff_sale,accrual,ptb,PEG_trailing,divyield,PEG_1yrforward,PEG_ltgforward
817,20.983,0.254,8.585,16.127,16.224,16.224,16.224,3.318,10.484,0.287,...,6.768,0.031,0.006,0.0,0.084,4.189,1.022,0.0187,1.414,1.33
2592,21.663,0.235,13.619,16.311,16.489,18.555,18.555,2.379,7.709,0.199,...,7.169,0.0,0.0,0.195,0.039,4.281,1.147,0.0122,1.737,1.572
1475,16.327,0.513,6.716,9.52,9.52,15.681,15.55,3.293,8.49,0.792,...,1.286,0.162,0.054,0.0,0.077,2.198,1.022,0.0343,-0.973,6.149


Unnamed: 0,0
1530,0
1397,1
2238,0


Unnamed: 0,0
817,1
2592,0
1475,1


### 4.1.0. (ii) PIPING SETTINGS (P1)

In [6]:
# Kernel of the SVM: select 'poly', 'linear' or 'rbf'
set_kernel = 'linear'

# In-pipe feature selection method:
# select 'RF' (for RandomForestClassifier) or 'PCA' (for Principal Component Analysis)
set_feature_selecter = 'RF'

# Additional classifier:
# select 'LR' (for LogisticRegression) or 'RF' (for RandomForestClassifier)
set_classifier = 'LR'

# Assemble the summary of the pipe settings (reused in later cells) and print it.
# The trailing empty entry produces the final newline of the summary.
briefing_pipe = '\n'.join([
    'SUMMARY PIPE SETTINGS:',
    'Selected kernel: {}.'.format(set_kernel),
    'Selected In-Pipe Feature Selecter: {}.'.format(set_feature_selecter),
    'Additional Classifier: {}.'.format(set_classifier),
    '',
])
print(briefing_pipe)

SUMMARY PIPE SETTINGS:
Selected kernel: linear.
Selected In-Pipe Feature Selecter: RF.
Additional Classifier: LR.



### 4.1.1. Piping (P1)

In [7]:
# Translate the choices made under "settings" into estimator objects and into the
# hyper-parameter names/values that are later put into the grid-search parameter grid.
# Parameter names use the '<step>__<parameter>' convention of the pipeline steps
# 'fs' (feature selection) and 'cl' (classifier).

# Feature selecter: estimator plus two tunable parameters
if set_feature_selecter == 'PCA':
    feat_sel_function = PCA()
    fs_attr1 = 'fs__n_components'
    fs_attr1_par = [10, 20, 30]
    fs_attr2 = 'fs__random_state'
    fs_attr2_par = [0, 10, 20]
elif set_feature_selecter == 'RF':
    # keep the features whose random-forest importance is at least the median importance
    feat_sel_function = SelectFromModel(estimator=RandomForestClassifier(), threshold='median')
    fs_attr1 = 'fs__estimator__n_estimators'
    fs_attr1_par = [10, 20, 30]
    fs_attr2 = 'fs__estimator__random_state'
    fs_attr2_par = [0, 10, 20]
else:
    raise ValueError('feature_selecter must be either PCA or RandomForestClassifier')

# Additional classifier: estimator plus two tunable parameters
if set_classifier == 'RF':
    classifier_function = RandomForestClassifier()
    cl_attr1 = 'cl__n_estimators'
    cl_attr1_par = [10, 20, 30]
    cl_attr2 = 'cl__random_state'
    cl_attr2_par = [0, 10, 20]
elif set_classifier == 'LR':
    classifier_function = LogisticRegression()
    cl_attr1 = 'cl__C'
    cl_attr1_par = [1, 50, 100]
    cl_attr2 = 'cl__random_state'
    cl_attr2_par = [0, 10, 20]
else:
    # report the setting that is actually wrong (the classifier, not the feature selecter)
    raise ValueError('set_classifier must be either RF (RandomForestClassifier) or LR (LogisticRegression)')

# print settings
print(briefing_data+'\n'+briefing_pipe)

SUMMARY DATA SETTINGS:
Selected Version 1: Whole original Dataset with only the Ratios as predicive Features.

SUMMARY PIPE SETTINGS:
Selected kernel: linear.
Selected In-Pipe Feature Selecter: RF.
Additional Classifier: LR.



In [8]:
# Create two pipelines that differ only in the final classifier:
#   pipe1_a: StandardScaler -> chosen feature selecter -> SVC with the chosen kernel
#   pipe1_b: StandardScaler -> chosen feature selecter -> chosen additional classifier
# StandardScaler standardizes the input variables.
#
# Each pipeline gets its own unfitted copy of the feature selecter. A Pipeline does not
# copy its steps, so passing the same feat_sel_function object to both would make
# fitting pipe1_b silently re-fit the selector inside the already fitted pipe1_a.
pipe1_a = Pipeline([('scaler', StandardScaler()),
                    ('fs', clone(feat_sel_function)),
                    ('cl', SVC(kernel=set_kernel, random_state=0))])
pipe1_b = Pipeline([('scaler', StandardScaler()),
                    ('fs', clone(feat_sel_function)),
                    ('cl', classifier_function)])


# print used parameters
display(feat_sel_function, classifier_function)


SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
        norm_order=1, prefit=False, threshold='median')

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
# Fit each pipe on the training split, then show its score on the test split
# (mean accuracy for classifiers). Pipe a is fitted and scored before pipe b.
pipe1_a.fit(X_train, y_train)
display(pipe1_a.score(X_test, y_test))

pipe1_b.fit(X_train, y_train)
display(pipe1_b.score(X_test, y_test))


0.6169014084507042

0.6507042253521127

In [10]:
# Define parameter grid (a list of two sub-grids that are searched one after the other)

# First part: scaling with StandardScaler,
# Feature Selection with selected method (its two parameters are tuned),
# Classification with SVC with selected kernel (C is tuned)

# Second part: scaling with StandardScaler,
# Feature Selection with selected method (default parameters),
# Classification with selected 'additional classifier' method (its two parameters are tuned)

# NOTE(review): both sub-grids replace the 'cl' step, so searching this grid from pipe1_a
# or from pipe1_b covers the same candidates -- confirm that this is intended.
param_grid1 = [{'scaler': [StandardScaler()],
                'fs': [feat_sel_function],
                fs_attr1: fs_attr1_par,
                fs_attr2: fs_attr2_par,
                'cl': [SVC(kernel=set_kernel)],
                'cl__C': [10, 100]},
               {'scaler': [StandardScaler()],
                'fs': [feat_sel_function],
                'cl': [classifier_function],
                # each classifier parameter is listed exactly once
                cl_attr1: cl_attr1_par,
                cl_attr2: cl_attr2_par}]

# display chosen attributes
show_attributes = 'yes'  # if not 'yes', assumed 'no'
if show_attributes == 'yes':
    print('Chosen Feature Selection Method: {}\n{}: {}\n{}: {}\n'.format(
        set_feature_selecter, fs_attr1, fs_attr1_par, fs_attr2, fs_attr2_par))
    print('Chosen additional Classification Method (beside SVM): {}\n{}: {}\n{}: {}'.format(
        set_classifier, cl_attr1, cl_attr1_par, cl_attr2, cl_attr2_par))

Chosen Feature Selection Method: RF
fs__estimator__n_estimators: [10, 20, 30]
fs__estimator__random_state: [0, 10, 20]

Chosen additional Classification Method (beside SVM): LR
cl__C: [1, 50, 100]
cl__random_state: [0, 10, 20]


In [None]:
# Grid Search
# 5-fold cross-validated search over param_grid1, starting from pipe1_a (n_jobs=-1)
grid1_a = GridSearchCV(estimator=pipe1_a, param_grid=param_grid1, cv=5, n_jobs=-1)
grid1_a.fit(X_train, y_train)




In [None]:
# Same 5-fold cross-validated search over param_grid1, starting from pipe1_b
grid1_b = GridSearchCV(estimator=pipe1_b, param_grid=param_grid1, cv=5, n_jobs=-1)
grid1_b.fit(X_train, y_train)

In [None]:
# Print results of both grid searches.
# The test score is computed on the held-out split (X_test, y_test) created by the
# train/test split above; the names previously used here (grid, X_test_bal, y_test_bal)
# are not defined anywhere in this notebook.
for grid_name, grid in (('grid1_a', grid1_a), ('grid1_b', grid1_b)):
    print('Results for {}:'.format(grid_name))
    print('Best CV accuracy: {:.2f}'.format(grid.best_score_))
    print('Test score:       {:.2f}'.format(grid.score(X_test, y_test)))
    print('Best parameters: {}'.format(grid.best_params_))
    print()

###    
###    
## 4.2. Pipe 2: 
#### Feature Selection:  before Pipe with RandomForestClassifier or PCA (Chapter 2.A and 2.B)
#### Scaling : In-Pipe with StandardScaler
#### Classification: SVM
#### additional Classification: RandomForestClassifier, LogisticRegression
###      

###    
###    
## 4.3. Pipe 3: 
### Feature Selection:  none
### Classification: SVM
### additional Classification: RandomForestClassifier, LogisticRegression