# Introduction to Machine Learning, UZH 2018, Group Project
### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch
##       
# 3. Support Vector Machines (SVM)

In this section we use the feature matrices and response vectors with features selected in chapter 2.  

#### We use two different versions (created in chapter 1):
Version 1: Feature Matrix consists only of the Ratios                                                                        
Version 2: Feature Matrix consists of Ratios + dummy variables for seasonality + other market data
####   

In [22]:
# hide unnecessary warnings (e.g. package deprecation notices)
import warnings
warnings.filterwarnings('ignore')

# Load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
plt.style.use('seaborn-whitegrid')
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## 3.0. Choose which Feature Matrix (Version 1 or 2) you want to load in by choosing the Case
### Available are: Case 1 and Case 2

In [23]:
# Selects which feature matrix the import cell below loads:
#   _CASE_ = 1 -> Feature Matrix Version 1 (ratios only)
#   _CASE_ = 2 -> Feature Matrix Version 2 (ratios + seasonality dummies + other market data)
# Any other value makes the import cell raise a ValueError.

_CASE_ = 1

## 3.1. Preparation

### 3.1.1. Import the Response Vector and the Feature Matrix

In [24]:
# Load the data (already split into train/test sets and reduced to the selected
# features -> bc_randomforest_feature_selection) for the chosen case.
if not (_CASE_ == 1 or _CASE_ == 2):
    raise ValueError('_CASE_ value must be either 1 or 2')

# The four files of a case differ only in the X/y prefix and the train/test part,
# e.g. .../X1_train_s.csv or .../y2_test_s.csv.
case_tag = '1' if _CASE_ == 1 else '2'
csv_template = ('Data/generated_splits/features_selected_randomforest/'
                '{}' + case_tag + '_{}_s.csv')

# Read order: X train, X test, y train, y test.
X_train_s, X_test_s, y_train_s, y_test_s = (
    pd.read_csv(csv_template.format(kind, part), sep=',', header=0)
    for kind in ('X', 'y')
    for part in ('train', 'test')
)

# Preview the first rows of the training features and responses.
display(X_train_s.head(3), y_train_s.head(3))

Unnamed: 0,CAPEI,pcf,divyield,pe_inc,evm,bm,pe_op_dil,PEG_ltgforward,pe_op_basic,ptb,aftret_equity,accrual,pe_exi,PEG_1yrforward,fcf_ocf
0,20.773,9.957,0.0354,18.484,10.644,0.547,15.633,5.963,15.468,1.811,0.134,0.048,18.484,13.678,0.932
1,25.089,8.509,0.0398,39.232,9.246,0.461,17.296,5.702,17.203,2.189,0.058,0.047,39.232,1.521,0.441
2,12.337,1.993,0.0268,13.062,12.474,0.911,9.209,2.248,9.135,1.076,0.086,0.038,13.062,0.38,1.0


Unnamed: 0,0
0,1
1,0
2,1


### 3.1.2. Print out Shape and Form of Feature Matrix and Response Vector

### Train Set

In [25]:
# Summarise the training split: dimensions first, then a preview of X and y.
case_label = str(_CASE_)
train_shape = str(X_train_s.shape)

print()
print('Shape (rows, columns) of Feature Matrix X (Train), Case ' + case_label,
      ' = ' + train_shape)
print("\n")  # two blank lines before the preview
print('Feature Matrix X (Train) with selected Features, Case ' + case_label)
display(X_train_s.head(3))
print()
print('Response Vector y (Train) after Feature Selection, Case ' + case_label)
display(y_train_s.head(3))
print()


Shape (rows, columns) of Feature Matrix X (Train), Case 1  = (2836, 15)


Feature Matrix X (Train) with selected Features, Case 1


Unnamed: 0,CAPEI,pcf,divyield,pe_inc,evm,bm,pe_op_dil,PEG_ltgforward,pe_op_basic,ptb,aftret_equity,accrual,pe_exi,PEG_1yrforward,fcf_ocf
0,20.773,9.957,0.0354,18.484,10.644,0.547,15.633,5.963,15.468,1.811,0.134,0.048,18.484,13.678,0.932
1,25.089,8.509,0.0398,39.232,9.246,0.461,17.296,5.702,17.203,2.189,0.058,0.047,39.232,1.521,0.441
2,12.337,1.993,0.0268,13.062,12.474,0.911,9.209,2.248,9.135,1.076,0.086,0.038,13.062,0.38,1.0



Response Vector y (Train) after Feature Selection, Case 1


Unnamed: 0,0
0,1
1,0
2,1





### Test Set

In [26]:
# Summarise the test split: dimensions first, then a preview of X and y.
case_label = str(_CASE_)
test_shape = str(X_test_s.shape)

print()
print('Shape (rows, columns) of Feature Matrix X (Test), Case ' + case_label,
      ' = ' + test_shape)
print()
print('Feature Matrix X (Test) with selected Features, Case ' + case_label)
display(X_test_s.head(3))
print()
print('Response Vector y (Test) after Feature Selection, Case ' + case_label)
display(y_test_s.head(3))


Shape (rows, columns) of Feature Matrix X (Test), Case 1  = (710, 15)

Feature Matrix X (Test) with selected Features, Case 1


Unnamed: 0,CAPEI,pcf,divyield,pe_inc,evm,bm,pe_op_dil,PEG_ltgforward,pe_op_basic,ptb,aftret_equity,accrual,pe_exi,PEG_1yrforward,fcf_ocf
0,20.983,10.484,0.0187,16.224,8.585,0.254,16.224,1.33,16.127,4.189,0.31,0.084,16.224,1.414,0.863
1,21.663,7.709,0.0122,18.555,13.619,0.235,16.489,1.572,16.311,4.281,0.238,0.039,18.555,1.737,0.902
2,16.327,8.49,0.0343,15.55,6.716,0.513,9.52,6.149,9.52,2.198,0.13,0.077,15.681,-0.973,0.934



Response Vector y (Test) after Feature Selection, Case 1


Unnamed: 0,0
0,0
1,1
2,1


## 3.2. SVM

### Two different SVM tests are applied:
#### => SVM1 = SVM with random parameters
#### => SVM2 = SVM with other parameters

### 3.2.1. SVM1 : SVM with random parameters

In [27]:
# Two-step pipeline: standardise the input variables, then classify.
# The step names 'scaler' and 'classifier' are the handles the parameter grid
# uses to replace or tune each step.
pipe1 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=0)),
])

In [28]:
# Parameter grid, made of two independent sub-grids:
#   1) RBF-kernel SVC on standardised inputs, varying gamma and C
#   2) logistic regression with and without standardisation, varying C
rbf_svc_grid1 = {
    'scaler': [StandardScaler()],
    'classifier': [SVC(kernel='rbf')],
    'classifier__gamma': [1, 10],
    'classifier__C': [10, 100],
}
logreg_grid1 = {
    'scaler': [StandardScaler(), None],
    'classifier': [LogisticRegression()],
    'classifier__C': [10, 100],
}
param_grid1 = [rbf_svc_grid1, logreg_grid1]

In [29]:
# Run the 5-fold cross-validated grid search (n_jobs=-1: use all processors).
# y_train_s is read from CSV as a one-column DataFrame, but scikit-learn expects
# a 1-d target of shape (n_samples,). Flatten it explicitly instead of relying on
# the implicit conversion, whose DataConversionWarning is hidden by the blanket
# warnings filter at the top of this notebook.
grid1 = GridSearchCV(pipe1, param_grid1, cv=5, n_jobs=-1)
grid1.fit(X_train_s, y_train_s.values.ravel())

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'classifier': [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol...olver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)], 'classifier__C': [10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [30]:
# Report the best cross-validation score, the score on the held-out test set
# and the parameter combination that won the search.
cv_score1 = grid1.best_score_
test_score1 = grid1.score(X_test_s, y_test_s)

print()
print('Best CV accuracy: {:.2f}'.format(cv_score1))
print('Test score:       {:.2f}'.format(test_score1))
print()
print('Best parameters: {}'.format(grid1.best_params_))


Best CV accuracy: 0.59
Test score:       0.62

Best parameters: {'classifier': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'classifier__C': 100, 'scaler': None}


In [31]:
# Predict the test-set classes with the fitted grid search and preview the first 20.
y_pred1 = grid1.predict(X_test_s)
display(y_pred1[:20])

array([1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0])

In [32]:
# Test-set classification report and confusion matrix (rows: true class,
# columns: predicted class).
# param_grid1 also contains LogisticRegression candidates, so the refitted winner
# is not necessarily the RBF SVC named in the headline; print which classifier
# these metrics actually belong to.
selected_clf1 = type(grid1.best_estimator_.named_steps['classifier']).__name__

print("")
print('Metrics of Classification with SVM1 (random parameters), kernel rbf:')
print('Selected classifier: ' + selected_clf1)
print("")
print(metrics.classification_report(y_test_s, y_pred1))
print(metrics.confusion_matrix(y_test_s, y_pred1))


Metrics of Classification with SVM1 (random parameters), kernel rbf:

             precision    recall  f1-score   support

          0       0.65      0.28      0.39       307
          1       0.62      0.89      0.73       403

avg / total       0.63      0.62      0.58       710

[[ 86 221]
 [ 46 357]]


### 3.2.2. SVM2 : SVM with other parameters

### Kernel: linear

In [36]:
# Two-step pipeline: standardise the input variables, then classify.
# The step names 'scaler' and 'classifier' are the handles the parameter grid
# uses to replace or tune each step.
pipe2 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=0)),
])

In [37]:
# Parameter grid, made of two independent sub-grids:
#   1) linear-kernel SVC on standardised inputs, varying C
#   2) logistic regression with and without standardisation, varying C
# gamma is deliberately not part of the SVC sub-grid: it is a coefficient of the
# 'rbf', 'poly' and 'sigmoid' kernels only and is ignored by the linear kernel,
# so sweeping it would only repeat identical fits.
param_grid2 = [{'scaler': [StandardScaler()],
               'classifier': [SVC(kernel='linear')],
               'classifier__C': [10, 100]},
              {'scaler': [StandardScaler(), None],
               'classifier': [LogisticRegression()],
               'classifier__C': [10, 100]}]

In [38]:
# Run the 5-fold cross-validated grid search (n_jobs=-1: use all processors).
# y_train_s is read from CSV as a one-column DataFrame, but scikit-learn expects
# a 1-d target of shape (n_samples,). Flatten it explicitly instead of relying on
# the implicit conversion, whose DataConversionWarning is hidden by the blanket
# warnings filter at the top of this notebook.
grid2 = GridSearchCV(pipe2, param_grid2, cv=5, n_jobs=-1)
grid2.fit(X_train_s, y_train_s.values.ravel())

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'classifier': [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  ...olver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)], 'classifier__C': [10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
# Report the best cross-validation score, the score on the held-out test set
# and the parameter combination that won the search.
cv_score2 = grid2.best_score_
test_score2 = grid2.score(X_test_s, y_test_s)

print()
print('Best CV accuracy: {:.2f}'.format(cv_score2))
print('Test score:       {:.2f}'.format(test_score2))
print()
print('Best parameters: {}'.format(grid2.best_params_))


Best CV accuracy: 0.59
Test score:       0.62

Best parameters: {'classifier': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'classifier__C': 100, 'scaler': None}


In [16]:
# Predict the test-set classes with the fitted grid search and preview the first 20.
y_pred2 = grid2.predict(X_test_s)
display(y_pred2[:20])

array([1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0])

In [17]:
# Test-set classification report and confusion matrix (rows: true class,
# columns: predicted class).
# The headline says "other parameters" to match the SVM2 section title (the
# previous "random parameters" wording belongs to SVM1).
# param_grid2 also contains LogisticRegression candidates, so the refitted winner
# is not necessarily the linear SVC named in the headline; print which classifier
# these metrics actually belong to.
selected_clf2 = type(grid2.best_estimator_.named_steps['classifier']).__name__

print("")
print('Metrics of Classification with SVM2 (other parameters), kernel linear:')
print('Selected classifier: ' + selected_clf2)
print("")
print(metrics.classification_report(y_test_s, y_pred2))
print(metrics.confusion_matrix(y_test_s, y_pred2))


Metrics of Classification with SVM2 (random parameters), kernel linear:

             precision    recall  f1-score   support

          0       0.65      0.28      0.39       307
          1       0.62      0.89      0.73       403

avg / total       0.63      0.62      0.58       710

[[ 86 221]
 [ 46 357]]


### Kernel: poly

In [18]:
# Two-step pipeline: standardise the input variables, then classify.
# The step names 'scaler' and 'classifier' are the handles the parameter grid
# uses to replace or tune each step.
pipe3 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=0)),
])

In [20]:
# Parameter grid, made of two independent sub-grids:
#   1) polynomial-kernel SVC (default degree 3) on standardised inputs,
#      varying gamma and C
#   2) logistic regression with and without standardisation, varying C
# The polynomial kernel is (gamma * <x, x'> + coef0) ** degree, so gamma is raised
# to the third power. With gamma in {1, 10} the search did not finish in
# reasonable time (the run had to be interrupted), so gamma is kept well below 1.
# NOTE(review): the gamma values are a judgement call - widen the range if the
# search completes quickly on your machine.
param_grid3 = [{'scaler': [StandardScaler()],
               'classifier': [SVC(kernel='poly')],
               'classifier__gamma': [0.01, 0.1],
               'classifier__C': [10, 100]},
              {'scaler': [StandardScaler(), None],
               'classifier': [LogisticRegression()],
               'classifier__C': [10, 100]}]

In [21]:
# Run the 5-fold cross-validated grid search (n_jobs=-1: use all processors).
# y_train_s is read from CSV as a one-column DataFrame, but scikit-learn expects
# a 1-d target of shape (n_samples,). Flatten it explicitly instead of relying on
# the implicit conversion, whose DataConversionWarning is hidden by the blanket
# warnings filter at the top of this notebook.
grid3 = GridSearchCV(pipe3, param_grid3, cv=5, n_jobs=-1)
grid3.fit(X_train_s, y_train_s.values.ravel())

KeyboardInterrupt: 

In [None]:
# Report the best cross-validation score, the score on the held-out test set
# and the parameter combination that won the search.
cv_score3 = grid3.best_score_
test_score3 = grid3.score(X_test_s, y_test_s)

print()
print('Best CV accuracy: {:.2f}'.format(cv_score3))
print('Test score:       {:.2f}'.format(test_score3))
print()
print('Best parameters: {}'.format(grid3.best_params_))

In [None]:
# Predict the test-set classes with the fitted grid search and preview the first 20.
y_pred3 = grid3.predict(X_test_s)
display(y_pred3[:20])

In [None]:
# Test-set classification report and confusion matrix (rows: true class,
# columns: predicted class).
# The headline says "other parameters" to match the SVM2 section title (the
# previous "random parameters" wording belongs to SVM1).
# param_grid3 also contains LogisticRegression candidates, so the refitted winner
# is not necessarily the polynomial SVC named in the headline; print which
# classifier these metrics actually belong to.
selected_clf3 = type(grid3.best_estimator_.named_steps['classifier']).__name__

print("")
print('Metrics of Classification with SVM2 (other parameters), kernel poly:')
print('Selected classifier: ' + selected_clf3)
print("")
print(metrics.classification_report(y_test_s, y_pred3))
print(metrics.confusion_matrix(y_test_s, y_pred3))