# Introduction to Machine Learning, UZH 2018, Group Project
### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch
##       
# 3. Support Vector Machines (SVM)


In this section we use the feature matrices and response vectors with features selected in chapter 2.  

#### We use two different versions (created in chapter 1, features-selected in chapter 2):
Version 1: Feature Matrix consists only of the Ratios                                                                        
Version 2: Feature Matrix consists of Ratios + dummy variables for seasonality + other market data
####  

In [1]:
# hide unnecessary warnings ("deprecation" of packages etc.)
import warnings
warnings.filterwarnings('ignore')

# Load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
plt.style.use('seaborn-whitegrid')
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## 3.0. Choose which Feature Matrix (Version 1 or 2) you want to load in by choosing the Case

#### 1) Choose the Dataset Version you want
VERSION = 1; Feature Matrix with only ratios                                  
VERSION = 2;  Feature Matrix with ratios + seasonality + other market data


In [2]:
### Choose which dataset version the selection of features and the prediction are based on
# INSERT NUMBER 1 or 2
VERSION = 2

# Human-readable description of the chosen version, used by later print-outs
_version_descriptions = {
    1: 'Based on Dataset with only the Ratios Dataset as predicive Features',
    2: 'Based on Dataset with Ratios + Seasonality + other Market Data as predictive Features',
}
if VERSION not in _version_descriptions:
    raise ValueError('VERSION must be either 1 or 2')
sel_version = _version_descriptions[VERSION]

#### 2) Choose the method with which the features have been pre-selected
SELECTION  = RF ; Features pre-selected with Random Forest Classifier                                                           
SELECTION = PCA; Features pre-selected with Principal Component Analysis (PCA)

In [3]:
### Choose whether the datasets with features selected by RF or by PCA are used
# INSERT 'RF' OR 'PCA'
SELECTION = 'RF'

# Human-readable name of the chosen selection method, used by later print-outs
_selection_names = {
    'RF': 'Random Forest (RF)',
    'PCA': 'Principal Component Analysis (PCA)',
}
if SELECTION not in _selection_names:
    raise ValueError('SELECTION must be either RF or PCA')
sel_feat = _selection_names[SELECTION]

## 3.1. Preparation

### 3.1.1. Import the Response Vector and the Feature Matrix

In [4]:
# Import the data: already split into train/test sets and reduced to the
# features pre-selected in chapter 2 (Random Forest or PCA).

# Sub-folder and file-name suffix used by each feature-selection method
_selection_files = {
    'RF': ('features_selected_randomforest', 'f'),
    'PCA': ('features_selected_pca', 'p'),
}

# Validate BOTH switches before touching the disk, so that an invalid SELECTION
# cannot fall through silently and leave the data frames undefined or stale.
if VERSION not in (1, 2) or SELECTION not in _selection_files:
    raise ValueError('VERSION value must be either 1 or 2, SELECTION must be either RF or PCA')

_folder, _suffix = _selection_files[SELECTION]
# Resolves to e.g. Data/generated_splits/features_selected_randomforest/X2_train_f.csv
_path_template = 'Data/generated_splits/{folder}/{name}{version}_{split}_{suffix}.csv'

X_train_s, X_test_s, y_train_s, y_test_s = [
    pd.read_csv(
        _path_template.format(folder=_folder, name=_name, version=int(VERSION),
                              split=_split, suffix=_suffix),
        sep=',', header=0)
    for _name, _split in [('X', 'train'), ('X', 'test'), ('y', 'train'), ('y', 'test')]
]

# Quick visual sanity check of what was loaded
display(X_train_s.head(3))
display(y_train_s.head(3))

Unnamed: 0,RET,RETX,sprtrn,ewretx,vwretx,ewretd,pe_inc,pe_op_dil,ALTPRC,debt_ebitda,cash_ratio,ps,SHRENDDT,VOL,int_debt
0,0.024042,0.024042,0.037655,0.032156,0.038183,0.033697,18.484,15.633,29.39,1.704,1.481,3.703,20140929.0,4611190.0,0.045
1,-0.057168,-0.066454,-0.014999,-0.015631,-0.017055,-0.013012,39.232,17.296,32.17,3.055,0.321,0.689,20130730.0,1296447.0,0.07
2,0.026373,0.026373,0.043117,0.042687,0.043937,0.044257,13.062,9.209,56.82,8.714,0.4785,2.034,20140330.0,3473222.0,0.052


Unnamed: 0,0
0,1
1,0
2,1


### 3.1.2. Print out Shape and Form of Feature Matrix and Response Vector

### Train Set

In [5]:
# Summary of the training split: chosen configuration, shape, and first rows
print('Features Selected with {}'.format(sel_feat))
print('Version {}; {}'.format(VERSION, sel_version))

print()
print('Shape (rows, columns) of Feature Matrix X (Train) ={}'.format(X_train_s.shape))
print()

print('Feature Matrix X (Train) with Selected Features')
display(X_train_s.iloc[:3])
print()

print('Response Vector y (Train) after Feature Selection')
display(y_train_s.iloc[:3])
print()

Features Selected with Random Forest (RF)
Version 2; Based on Dataset with Ratios + Seasonality + other Market Data as predictive Features

Shape (rows, columns) of Feature Matrix X (Train) =(2836, 15)

Feature Matrix X (Train) with Selected Features


Unnamed: 0,RET,RETX,sprtrn,ewretx,vwretx,ewretd,pe_inc,pe_op_dil,ALTPRC,debt_ebitda,cash_ratio,ps,SHRENDDT,VOL,int_debt
0,0.024042,0.024042,0.037655,0.032156,0.038183,0.033697,18.484,15.633,29.39,1.704,1.481,3.703,20140929.0,4611190.0,0.045
1,-0.057168,-0.066454,-0.014999,-0.015631,-0.017055,-0.013012,39.232,17.296,32.17,3.055,0.321,0.689,20130730.0,1296447.0,0.07
2,0.026373,0.026373,0.043117,0.042687,0.043937,0.044257,13.062,9.209,56.82,8.714,0.4785,2.034,20140330.0,3473222.0,0.052



Response Vector y (Train) after Feature Selection


Unnamed: 0,0
0,1
1,0
2,1





### Test Set

In [6]:
# Summary of the test split: chosen configuration, shape, and first rows
print('Features Selected with ' + str(sel_feat))
print('Version ' + str(VERSION) + '; ' + str(sel_version))

print("")
print('Shape (rows, columns) of Feature Matrix X (Test) ' + '=' + str(X_test_s.shape))
print("")

# Label fixed: this cell shows the TEST feature matrix (it previously said "Train")
print('Feature Matrix X (Test) with Selected Features')
display(X_test_s[0:3])
print("")

print('Response Vector y (Test) after Feature Selection')
display(y_test_s[0:3])
print("")

Features Selected with Random Forest (RF)
Version 2; Based on Dataset with Ratios + Seasonality + other Market Data as predictive Features

Shape (rows, columns) of Feature Matrix X (Test) =(710, 15)

Feature Matrix X (Train) with Selected Features


Unnamed: 0,RET,RETX,sprtrn,ewretx,vwretx,ewretd,pe_inc,pe_op_dil,ALTPRC,debt_ebitda,cash_ratio,ps,SHRENDDT,VOL,int_debt
0,-0.017073,-0.017073,-0.015514,-0.046891,-0.026823,-0.044988,16.224,16.224,100.75,0.346,0.859,3.318,20141009.0,15283673.0,0.018
1,0.050202,0.050202,0.029749,0.049754,0.035682,0.051828,18.555,16.489,75.52,6.228,0.399,2.379,20131030.0,746229.0,0.018
2,0.025838,0.025838,-0.036974,-0.012292,-0.038109,-0.011388,15.55,9.52,18.66,1.467,1.592,3.293,20100225.0,10148052.0,0.038



Response Vector y (Test) after Feature Selection


Unnamed: 0,0
0,0
1,1
2,1





## 3.2. SVM

### Two different SVM tests are applied:
#### => SVM1 = SVM with random parameters
#### => SVM2 = SVM with other parameters

### 3.2.1. SVM1 : SVM with random parameters

### Kernel: rbf

In [7]:
# Two-step pipeline: standardize the input variables, then classify.
# Both steps are placeholders that the grid search overrides via the
# 'scaler' and 'classifier' entries of param_grid1.
pipe1 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=0)),
])

In [8]:
# Define parameter grid
param_grid1 = [{'scaler': [StandardScaler()],
               'classifier': [SVC(kernel='rbf')],
               'classifier__gamma': [1, 10],
               'classifier__C': [10, 100]},
              {'scaler': [StandardScaler(), None],
               'classifier': [LogisticRegression()],
               'classifier__C': [10, 100]}]

In [9]:
# Run grid search
grid1 = GridSearchCV(pipe1, param_grid1, cv=5, n_jobs=-1)
grid1.fit(X_train_s, y_train_s)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'classifier': [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol...olver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)], 'classifier__C': [10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [10]:
# Print results
print("")
print('Best CV accuracy: {:.2f}'.format(grid1.best_score_))
print('Test score:       {:.2f}'.format(grid1.score(X_test_s, y_test_s)))
print("")
print('Best parameters: {}'.format(grid1.best_params_))


Best CV accuracy: 0.99
Test score:       0.99

Best parameters: {'classifier': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'classifier__C': 100, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [11]:
# Predict classes
y_pred1 = grid1.predict(X_test_s)
display(y_pred1[0:20])

array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [12]:
# param_grid1 contains LogisticRegression candidates next to the rbf-kernel SVC,
# so the refit model is not necessarily an SVM. Look up the classifier that was
# actually selected and name it in the headings instead of hard-coding "kernel rbf".
best_clf1 = grid1.best_estimator_.named_steps['classifier']
label1 = ('SVM1 (random parameters), kernel rbf grid search'
          ' -- selected classifier: ' + type(best_clf1).__name__)

print("")
print('Metrics of Classification with ' + label1 + ':')
print("")
print(metrics.classification_report(y_test_s, y_pred1))
print("")
print('Confusion Matrix with ' + label1 + ':')
print("")
print(metrics.confusion_matrix(y_test_s, y_pred1))


Metrics of Classification with SVM1 (random parameters), kernel rbf:

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       307
          1       1.00      0.99      0.99       403

avg / total       0.99      0.99      0.99       710


Confusion Matrix with SVM1 (random parameters), kernel rbf:

[[307   0]
 [  5 398]]


### 3.2.2. SVM2 : SVM with other parameters

### Kernel: linear

In [13]:
# Two-step pipeline: standardize the input variables, then classify.
# Both steps are placeholders that the grid search overrides via the
# 'scaler' and 'classifier' entries of param_grid2.
pipe2 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=0)),
])

In [14]:
# Define parameter grid
param_grid2 = [{'scaler': [StandardScaler()],
               'classifier': [SVC(kernel='linear')],
               'classifier__gamma': [1, 10],
               'classifier__C': [10, 100]},
              {'scaler': [StandardScaler(), None],
               'classifier': [LogisticRegression()],
               'classifier__C': [10, 100]}]

In [15]:
# Run grid search
grid2 = GridSearchCV(pipe2, param_grid2, cv=5, n_jobs=-1)
grid2.fit(X_train_s, y_train_s)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'classifier': [SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0...olver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)], 'classifier__C': [10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [16]:
# Print results
print("")
print('Best CV accuracy: {:.2f}'.format(grid2.best_score_))
print('Test score:       {:.2f}'.format(grid2.score(X_test_s, y_test_s)))
print("")
print('Best parameters: {}'.format(grid2.best_params_))


Best CV accuracy: 0.99
Test score:       0.99

Best parameters: {'classifier': SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), 'classifier__C': 100, 'classifier__gamma': 1, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [17]:
# Predict the test-set classes with the refit best estimator; show the first 20
y_pred2 = grid2.predict(X_test_s)
display(y_pred2[:20])

array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [18]:
# param_grid2 contains LogisticRegression candidates next to the linear-kernel
# SVC, so the refit model is not necessarily an SVM. Look up the classifier that
# was actually selected and name it in the headings.
best_clf2 = grid2.best_estimator_.named_steps['classifier']
label2 = ('SVM2 (other parameters), kernel linear grid search'
          ' -- selected classifier: ' + type(best_clf2).__name__)

print("")
print('Metrics of Classification with ' + label2 + ':')
print("")
print(metrics.classification_report(y_test_s, y_pred2))
print("")
# Heading added for consistency with the SVM1 report
print('Confusion Matrix with ' + label2 + ':')
print("")
print(metrics.confusion_matrix(y_test_s, y_pred2))


Metrics of Classification with SVM2 (random parameters), kernel linear:

             precision    recall  f1-score   support

          0       0.99      1.00      0.99       307
          1       1.00      0.99      0.99       403

avg / total       0.99      0.99      0.99       710

[[306   1]
 [  4 399]]


### Kernel: poly

In [19]:
# Two-step pipeline: standardize the input variables, then classify.
# Both steps are placeholders that the grid search overrides via the
# 'scaler' and 'classifier' entries of param_grid3.
pipe3 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=0)),
])

In [20]:
# Define parameter grid
param_grid3 = [{'scaler': [StandardScaler()],
               'classifier': [SVC(kernel= "poly")],
               'classifier__gamma': [1, 10],
               'classifier__C': [10, 100]},
              {'scaler': [StandardScaler(), None],
               'classifier': [LogisticRegression()],
               'classifier__C': [10, 100]}]

In [21]:
# Run grid search
grid3 = GridSearchCV(pipe3, param_grid3, cv=5, n_jobs=-1)
grid3.fit(X_train_s, y_train_s)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'classifier': [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  to...olver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)], 'classifier__C': [10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
# Print results
print("")
print('Best CV accuracy: {:.2f}'.format(grid3.best_score_))
print('Test score:       {:.2f}'.format(grid3.score(X_test_s, y_test_s)))
print("")
print('Best parameters: {}'.format(grid3.best_params_))


Best CV accuracy: 0.99
Test score:       0.99

Best parameters: {'classifier': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'classifier__C': 100, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [23]:
# Predict the test-set classes with the refit best estimator; show the first 20
y_pred3 = grid3.predict(X_test_s)
display(y_pred3[:20])

array([0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0])

In [24]:
# param_grid3 contains LogisticRegression candidates next to the poly-kernel
# SVC, so the refit model is not necessarily an SVM. Look up the classifier that
# was actually selected and name it in the headings.
best_clf3 = grid3.best_estimator_.named_steps['classifier']
label3 = ('SVM2 (other parameters), kernel poly grid search'
          ' -- selected classifier: ' + type(best_clf3).__name__)

print("")
print('Metrics of Classification with ' + label3 + ':')
print("")
print(metrics.classification_report(y_test_s, y_pred3))
print("")
# Heading added for consistency with the SVM1 report
print('Confusion Matrix with ' + label3 + ':')
print("")
print(metrics.confusion_matrix(y_test_s, y_pred3))


Metrics of Classification with SVM2 (random parameters), kernel poly:

             precision    recall  f1-score   support

          0       0.98      1.00      0.99       307
          1       1.00      0.99      0.99       403

avg / total       0.99      0.99      0.99       710

[[307   0]
 [  5 398]]
