# Introduction to Machine Learning, UZH 2018, Group Project
### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch
##       
# 3. Support Vector Machines (SVM)

###   
In this section we use the feature matrices and response vectors with features selected in chapter 2. 


In [20]:
# hide unnecessary warnings ("depreciation" of packages etc.)
import warnings
warnings.filterwarnings('ignore')

# Load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
plt.style.use('seaborn-whitegrid')

## 3.0. SETTINGS

### (1) Choose the Dataset Version you want

##### Whole Feature Matrix (Features not pre-selected)
VERSION = 1; Feature Matrix with only ratios                                  
VERSION = 2;  Feature Matrix with ratios + saisonality + other market data

##### Reduced Feature Matrix (Features pre-selected)
VERSION = 1.1; Reduced Feature Matrix with only ratios                                  
VERSION = 2.1;  Reduced Feature Matrix with ratios + saisonality + other market data

In [21]:
### Chose which dataset version you want the selection of features and the prediction to be based on 
VERSION = 2
"""
INSERT NUMBER 1, 2, 1.1 or 2.1
"""



# Defining sel_state variable for easier printing out    
if VERSION == 1:
    sel_version = 'Based on whole original Dataset with only the Ratios Dataset as predicive Features.'
elif VERSION == 2:
    sel_version = 'Based on whole original Dataset with Ratios + Seasonality + other Market Data as predictive Features.'
elif VERSION == 1.1:
    sel_version = 'Based on reduced Dataset with only the Ratios Dataset as predicive Features.'
elif VERSION == 2.1:
    sel_version = 'Based on reduced Dataset with Ratios + Seasonality + other Market Data as predictive Features.'
else: raise ValueError('VERSION must be either 1, 2, 1.1 or 2.1')

### (2) If you chose VERSION 1.1. or VERSION 2.1:  (Reduced Feature Matrix)                                                                  
### => Choose with which method you want to have the features been pre-selected /reduced

##### You have the choice between:
mySELECTION  = RF ; Features pre-selected with Random Forest Classifier                                                           
mySELECTION = PCA; Features pre-selected with Principal Component Analysis (PCA)                                         

##### By Default;
If VERSION 1 or VERSION 2 was chosen above: SELECTION = none by Default; no features pre-selected. You don't need to define variable mySELECTION.

In [22]:
### Choose whether you want the datasets with features selected with RF or PCA or the original file
mySELECTION = 'PCA'
"""
INSERT WISHED METHOD 'RF', 'PCA'
"""



# This is the control loop. If something has been chosen wrong, it returns an error with explanation.
if VERSION == 1 or VERSION == 2:
    SELECTION = 'none'
elif VERSION == 1.1 or VERSION == 2.1:
    SELECTION = mySELECTION
    if mySELECTION is not 'RF' and mySELECTION is not 'PCA':
        raise ValueError('Because VERSION '+str(VERSION)+' is chosen, mySELECTION must be set as either RF or PCA.')
else: raise ValueError('VERSION must be either 1, 2, 1.1 or 2.1. mySELECTION must be chosen as either RF or PCA.')

# Defining of sel_feat (Selected Feature Selection Method) variable and briefing for later.   
if SELECTION == 'RF':
    sel_feat = 'Random Forest (RF)'
    briefing = ('You chose dataset VERSION '+str(VERSION)+' and SELECTION method '+str(SELECTION)+'.'+'\n'+'Features therefore pre-selected with '+str(sel_feat)+'.')
elif SELECTION == 'PCA':
    sel_feat = 'Principal Component Analysis (PCA)'
    briefing = ('You chose dataset VERSION '+str(VERSION)+' and SELECTION method '+str(SELECTION)+'.'+'\n'+'Features therefore pre-selected with '+str(sel_feat)+'.')
elif SELECTION == 'none':
    sel_feat = 'No Feature Selection Method available.'
    briefing = ('You chose VERSION '+str(VERSION)+'. This Version has no Feature Selection Method because Feature Matrix is whole, not reduced.'+'\n'+'SELECTION is therefore "none" by Default.')
else: raise ValueError('mySELECTION must be chosen as either RF or PCA')
print('You chose SELECTION method '+str(sel_feat)+'.')
#print(sel_feat)

You chose SELECTION method No Feature Selection Method available..


### (3) SUMMARY OF SETTINGS

In [23]:
print(briefing, '\n')
print('VERSION '+str(VERSION)+' is '+str(sel_version),'\n')
print('You are now done with the Settings. You can run the whole Code now by Default.')

You chose VERSION 2. This Version has no Feature Selection Method because Feature Matrix is whole, not reduced.
SELECTION is therefore "none" by Default. 

VERSION 2 is Based on whole original Dataset with Ratios + Seasonality + other Market Data as predictive Features. 

You are now done with the Settings. You can run the whole Code now by Default.


## 3.1. Preparation

### 3.1.1. Import the Response Vector and the Feature Matrix

In [24]:
####################### NEW COMMENT
# In version 1 und 2; ganze Feature matrix und ganzen Response vector rein? ODER gesplittet?
# dann müsste in DataPrep split gemacht werden für datenset ohne feature pre-selection
# abgespeichert in generated_splits ohne Unterordner
# hier eingelesen als X_train und y_train und X_test und y_test
#######################


### import Data (already splitted to train/test-data and selected features-> bc_randomforest_feature_selection)
if VERSION == 1: 
# features not pre-selected, only ratios
    X = pd.read_csv('Data/generated_datasets/features_ratios_1.csv', sep=',', header=0)
    y = pd.read_csv('Data/generated_datasets/response_1.csv', sep=',', header=0)
elif VERSION == 2: 
# features not pre-selected, ratios + seasonality + market data
    X = pd.read_csv('Data/generated_datasets/features_additional_1.csv', sep=',', header=0)
    y = pd.read_csv('Data/generated_datasets/response_1.csv', sep=',', header=0)
elif VERSION == 1.1: 
# features pre-selected, only ratios
    if SELECTION == 'RF':
        X_train_s = pd.read_csv('Data/generated_splits/features_selected_randomforest/X1_train_f.csv', sep=',', header=0)
        X_test_s = pd.read_csv('Data/generated_splits/features_selected_randomforest/X1_test_f.csv', sep=',', header=0)
        y_train_s = pd.read_csv('Data/generated_splits/features_selected_randomforest/y1_train_f.csv', sep=',', header=0)
        y_test_s = pd.read_csv('Data/generated_splits/features_selected_randomforest/y1_test_f.csv', sep=',', header=0)
    elif SELECTION == 'PCA':
        X_train_s = pd.read_csv('Data/generated_splits/features_selected_pca/X1_train_p.csv', sep=',', header=0)
        X_test_s = pd.read_csv('Data/generated_splits/features_selected_pca/X1_test_p.csv', sep=',', header=0)
        y_train_s = pd.read_csv('Data/generated_splits/features_selected_pca/y1_train_p.csv', sep=',', header=0)
        y_test_s = pd.read_csv('Data/generated_splits/features_selected_pca/y1_test_p.csv', sep=',', header=0)
elif VERSION == 2.1: 
# features pre-selected, ratios + seasonality + market data
    if SELECTION == 'RF':
        X_train_s = pd.read_csv('Data/generated_splits/features_selected_randomforest/X2_train_f.csv', sep=',', header=0)
        X_test_s = pd.read_csv('Data/generated_splits/features_selected_randomforest/X2_test_f.csv', sep=',', header=0)
        y_train_s = pd.read_csv('Data/generated_splits/features_selected_randomforest/y2_train_f.csv', sep=',', header=0)
        y_test_s = pd.read_csv('Data/generated_splits/features_selected_randomforest/y2_test_f.csv', sep=',', header=0)
    elif SELECTION == 'PCA':
        X_train_s = pd.read_csv('Data/generated_splits/features_selected_pca/X2_train_p.csv', sep=',', header=0)
        X_test_s = pd.read_csv('Data/generated_splits/features_selected_pca/X2_test_p.csv', sep=',', header=0)
        y_train_s = pd.read_csv('Data/generated_splits/features_selected_pca/y2_train_p.csv', sep=',', header=0)
        y_test_s = pd.read_csv('Data/generated_splits/features_selected_pca/y2_test_p.csv', sep=',', header=0)
else: raise ValueError('VERSION value must be either 1, 2, 1.1 or 2.1, mySELECTION must be chosen as either RF or PCA.')   

#### Train-Test-split for whole original files. Automatically executed only if VERSION = 1 or 2.

In [25]:
####################### NEW COMMENT
# Split could also be already done in Datapreparation file because there we have more space etc.^ maybe
# but it can also be made here doesnt matterrr
#######################

# For VERSION == 1 or 2 -> train-test-split for the importet sets must be done
if VERSION == 1 or VERSION == 2:
    # import package imputer
    from sklearn.preprocessing import Imputer
    # # Train/test split, into 20% test size and 80% train size because it is a relatively small dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Use a median fill for train
    imp = Imputer(missing_values=np.nan, strategy = 'median' , axis=0)
    imputed_dataset = pd.DataFrame(imp.fit_transform(X_train))
    imputed_dataset.columns = X_train.columns
    imputed_dataset.index = X_train.index
    X_train = imputed_dataset

    # Use a median fill for the test set
    imputed_dataset = pd.DataFrame(imp.fit_transform(X_test))
    imputed_dataset.columns = X_test.columns
    imputed_dataset.index = X_test.index
    X_test = imputed_dataset

    from sklearn.preprocessing import StandardScaler

    # Create StandardScaler object
    sc = StandardScaler()
    # Standardize features; equal results as if done in two
    X_train = sc.fit_transform(X_train)
    # Transform test set
    X_test = sc.transform(X_test)
    # Extract the feature labels
    feature_labels = list(X)
    print('Type of feature_labels = ' + str(type(feature_labels)), '\n')

else: print('No Train/Test split, no Imputing, no Standardization needed for chosen VERSION '+str(VERSION)+
            '. '+'\n'+'Loaded Datasets were already pre-splitted and imputed in Feature Selection (Chapter 2).')

Type of feature_labels = <class 'list'> 



### 3.1.2. Print out Shape and Form of Feature Matrix and Response Vector

### Train Set

In [26]:
# print status
print('Features Selected with ' + str(sel_feat)+'.')
print('Version ' + str(VERSION) + '; ' + str(sel_version), '\n')

# print properties and head
if VERSION == 1 or VERSION == 2:
    print('Shape (rows, columns) of Feature Matrix X (Train) ' + '= ' + str(X_train.shape)+'\n')
    print('Feature Matrix X (Train) with no Feature pre-Selection:')
    display(X_train[0:3])
    print("")
    print('Response Vector y (Train) after no Feature pre-Selection:')
    display(y_train[0:3])
    print("")
else:
    print('Shape (rows, columns) of Feature Matrix X (Train) ' + '= ' + str(X_train_s.shape), '\n')
    print('Feature Matrix X (Train) with Selected Features:'+'\n')
    display(X_train_s[0:3])
    print("")
    print('Response Vector y (Train) after Feature Selection:')
    display(y_train_s[0:3])

Features Selected with No Feature Selection Method available..
Version 2; Based on whole original Dataset with Ratios + Seasonality + other Market Data as predictive Features. 

Shape (rows, columns) of Feature Matrix X (Train) = (2836, 181)

Feature Matrix X (Train) with no Feature pre-Selection:


array([[-1.88e-02,  1.22e+00, -6.71e-01, -6.76e-01, -6.64e-01,  2.05e-01,
         2.08e-01, -6.63e-01, -6.64e-01,  1.41e+00, -2.98e-01, -2.97e-01,
        -6.64e-01,  2.81e-01,  1.22e+00,  2.38e-01,  7.54e-01,  7.50e-01,
         5.43e-01,  5.49e-01,  7.56e-01,  2.77e-02,  5.67e-01, -8.05e-02,
         1.24e-02, -4.05e-03,  1.18e-01,  1.56e-01,  1.08e+00, -8.67e-02,
         3.55e-01,  9.59e-01,  1.28e+00,  8.44e-01,  1.72e+00,  7.88e-01,
         1.26e+00, -5.32e-01, -8.40e-01, -7.02e-01,  4.68e-02, -4.13e-01,
        -7.65e-01, -4.13e-01,  1.72e-02, -2.75e-01, -4.31e-01,  2.11e-01,
        -1.36e-01, -2.66e-01, -1.63e-01, -6.02e-02, -5.85e-02,  6.96e-02,
        -5.02e-01, -7.97e-01, -4.69e-02, -2.27e-01, -3.65e-01, -1.73e+00,
         3.85e-01,  1.56e-01,  3.51e-01, -3.84e-01,  1.04e+00, -1.72e-01,
        -2.73e-01, -1.96e-01, -4.45e-01, -3.06e-01, -1.34e-01, -1.09e-01,
         1.43e+00,  1.54e+00,  1.55e+00,  4.13e-02, -5.20e-01, -9.66e-01,
        -3.69e-01, -7.55e-01, -8.83e-0


Response Vector y (Train) after no Feature pre-Selection:


Unnamed: 0,0
1530,0
1397,1
2238,0





### Test Set

In [27]:
# print status
print('Features Selected with ' + str(sel_feat))
print('Version ' + str(VERSION) + '; ' + str(sel_version),'\n')

# print properties and head
if VERSION == 1 or VERSION == 2:
    print('Shape (rows, columns) of Feature Matrix X (Test) ' + '= ' + str(X_test.shape)+'\n')
    print('Feature Matrix X (Test) with no Feature pre-Selection:')
    display(X_test[0:3])
    print("")
    print('Response Vector y (Test) after no Feature pre-Selection:')
    display(y_test[0:3])
    print("")
else:
    print('Shape (rows, columns) of Feature Matrix X (Test) ' + '= ' + str(X_test_s.shape)+'\n')
    print('Feature Matrix X (Test) with Selected Features:')
    display(X_test_s[0:3])
    print("")
    print('Response Vector y (Test) after Feature Selection:')
    display(y_test_s[0:3])

Features Selected with No Feature Selection Method available.
Version 2; Based on whole original Dataset with Ratios + Seasonality + other Market Data as predictive Features. 

Shape (rows, columns) of Feature Matrix X (Test) = (710, 181)

Feature Matrix X (Test) with no Feature pre-Selection:


array([[-1.88e-02,  1.22e+00,  4.88e-01,  4.22e-01,  4.52e-01,  2.72e+00,
        -3.79e-01,  4.52e-01,  4.52e-01,  1.22e+00, -2.98e-01, -2.97e-01,
         4.52e-01,  2.81e-01,  1.23e+00, -3.49e-01, -6.92e-01, -6.89e-01,
        -9.73e-01, -9.75e-01, -4.66e-01,  3.75e-02, -6.29e-01, -4.71e-01,
         6.56e-02,  3.30e-02,  2.49e-02,  6.28e-02,  8.07e-01, -3.81e-02,
        -2.94e-01,  1.09e+00,  5.23e-01,  6.06e-01, -2.18e-01,  1.03e+00,
         6.28e-01,  1.37e+00,  3.02e-01,  1.28e+00, -6.60e-02,  2.16e-01,
         7.97e-01,  2.17e-01,  8.14e-01,  2.81e+00,  1.61e-01,  8.21e-01,
        -7.62e-01, -6.86e-01, -7.83e-01, -6.25e-02, -6.01e-02,  3.19e-01,
        -9.49e-01, -1.06e-01, -7.99e-01, -5.00e-01, -1.18e+00,  2.47e-01,
        -2.79e-01,  7.87e-01,  1.31e+00,  1.78e+00,  7.28e-01, -2.38e-01,
        -3.61e-01, -8.61e-01, -7.71e-01, -3.99e-01,  2.97e-01,  1.63e-01,
         3.36e-01,  3.77e-01, -1.41e-02, -2.37e-01,  1.62e+00,  5.74e-02,
        -1.67e-01, -3.86e-01, -1.78e-0


Response Vector y (Test) after no Feature pre-Selection:


Unnamed: 0,0
817,1
2592,0
1475,1





### Define Variables for further use

In [28]:
if VERSION == 1 or VERSION == 2:
    feature_train = X_train
    feature_test = X_test
    response_train = y_train
    response_test = y_test
if VERSION == 1.1 or VERSION == 2.1:
    feature_train = X_train_s
    feature_test = X_test_s
    response_train = y_train_s
    response_test = y_test_s

## 3.2. SVC

### Two different SVC tests are applied:
#### => SVC1 = rbf
#### => SVC2 = linear

### 3.2.1. Kernel: rbf

In [29]:
# Import necessary functions
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.linear_model import LogisticRegression

# Create k-Fold CV and rbf object
kFold = StratifiedKFold(n_splits =5, random_state =0)
rbf = SVC(C=10, cache_size=200, class_weight=None, coef0=0.0, 
          decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
          max_iter=-1, probability=False, random_state=None, shrinking=True,
          tol=0.001, verbose=False)
# Run CV and print results
scores = cross_val_score(rbf, feature_train, response_train, cv= kFold)
print(scores)
print('CV accuracy on train set: {0: .3f} +/- {1: .3f}'.format(np.mean(scores), np.std(scores)))

[0.58 0.58 0.58 0.58 0.58]
CV accuracy on train set:  0.583 +/-  0.001


In [30]:
# ROC score
scores = cross_val_score(rbf, feature_train, response_train, cv=kFold , scoring = 'roc_auc')
print ('CV  AUC on train set: {0: .3f} +/- {1: .3f}'.format(np.mean(scores),   np.std(scores)))

CV  AUC on train set:  0.529 +/-  0.027


In [31]:
from sklearn.model_selection import cross_validate
# Calculate return
measures = ['accuracy', 'recall', 'roc_auc']
scores = cross_validate(rbf, feature_train, response_train, cv=kFold, scoring = measures, n_jobs =2)
scores

{'fit_time': array([3.22, 3.18, 3.2 , 3.21, 3.14]),
 'score_time': array([2.1 , 2.11, 2.21, 2.2 , 2.08]),
 'test_accuracy': array([0.58, 0.58, 0.58, 0.58, 0.58]),
 'test_recall': array([1.  , 1.  , 0.99, 1.  , 1.  ]),
 'test_roc_auc': array([0.54, 0.55, 0.48, 0.54, 0.53]),
 'train_accuracy': array([1., 1., 1., 1., 1.]),
 'train_recall': array([1., 1., 1., 1., 1.]),
 'train_roc_auc': array([1., 1., 1., 1., 1.])}

In [32]:
print('Train set accuracy (CV=5): ' ,scores ['train_accuracy'].mean())
print('Validation set scores (CV=5): ',scores ['test_accuracy'].mean())
print('Test set accuracy : ',rbf.fit(feature_test , response_test).score(feature_test, response_test))

Train set accuracy (CV=5):  1.0
Validation set scores (CV=5):  0.582862710881175
Test set accuracy :  1.0


#### Prediction

In [33]:
y_pred1 = rbf.fit(feature_train, response_train).predict(feature_test)
display(y_pred1[0:20])

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      dtype=int64)

In [34]:
print("")
print('Metrics of Classification with SVM1 (random parameters), kernel rbf:')
print("")
print(metrics.classification_report(response_test, y_pred1))
print("")
print('Confusion Matrix with SVM1 (random parameters), kernel rbf:')
print("")
print(metrics.confusion_matrix(response_test, y_pred1))


Metrics of Classification with SVM1 (random parameters), kernel rbf:

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       305
          1       0.57      1.00      0.72       405

avg / total       0.32      0.57      0.41       710


Confusion Matrix with SVM1 (random parameters), kernel rbf:

[[  0 305]
 [  2 403]]


In [35]:
# Precision
from sklearn.metrics import precision_score
precision_score(response_test, y_pred1, labels=None, pos_label=1, average= 'binary', sample_weight=None)

0.5692090395480226

In [36]:
# Recall
from sklearn.metrics import recall_score
recall_score(response_test, y_pred1, labels=None, pos_label=1, average= 'binary', sample_weight=None)

0.9950617283950617

### 3.2.2. Kernel: linear

In [37]:
# Import necessary functions
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.linear_model import LogisticRegression
# Create k-Fold CV and SVM (linear) object
kFold = StratifiedKFold(n_splits =5, random_state =0)
linear = SVC(C=10, cache_size=200, class_weight=None, coef0=0.0, 
          decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf',
          max_iter=-1, probability=False, random_state=None, shrinking=True,
          tol=0.001, verbose=False)
# Run CV and print results
scores = cross_val_score(linear, feature_train ,  response_train , cv= kFold )
print(scores)
print('CV accuracy on train set: {0: .3f} +/- {1: .3f}'.format(np.mean(scores), np.std(scores)))

[0.58 0.58 0.58 0.58 0.58]
CV accuracy on train set:  0.583 +/-  0.001


In [38]:
# ROC score
scores = cross_val_score(linear, feature_train, response_train, cv=kFold , scoring = 'roc_auc')
print ('CV  AUC on train set: {0: .3f} +/- {1: .3f}'.format(np.mean(scores),   np.std(scores)))

CV  AUC on train set:  0.529 +/-  0.027


In [39]:
from sklearn.model_selection import cross_validate
# Calculate return
measures = ['accuracy', 'recall', 'roc_auc']
scores = cross_validate(linear, feature_train, response_train, cv=kFold, scoring = measures, n_jobs =2)
scores

{'fit_time': array([3.24, 3.22, 3.2 , 3.21, 3.2 ]),
 'score_time': array([2.09, 2.09, 2.21, 2.23, 2.07]),
 'test_accuracy': array([0.58, 0.58, 0.58, 0.58, 0.58]),
 'test_recall': array([1.  , 1.  , 0.99, 1.  , 1.  ]),
 'test_roc_auc': array([0.54, 0.55, 0.48, 0.54, 0.53]),
 'train_accuracy': array([1., 1., 1., 1., 1.]),
 'train_recall': array([1., 1., 1., 1., 1.]),
 'train_roc_auc': array([1., 1., 1., 1., 1.])}

In [40]:
print('Train set accuracy (CV=5): ' ,scores ['train_accuracy'].mean())
print('Validation set scores (CV=5): ',scores ['test_accuracy'].mean())
print('Test set accuracy : ',linear.fit(feature_test , response_test).score(feature_test, response_test))

Train set accuracy (CV=5):  1.0
Validation set scores (CV=5):  0.582862710881175
Test set accuracy :  1.0


#### Prediction

In [41]:
y_pred2 = linear.fit(feature_train, response_train).predict(feature_test)
display(y_pred1[0:20])

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      dtype=int64)

In [42]:
print("")
print('Metrics of Classification with SVM1 (random parameters), kernel rbf:')
print("")
print(metrics.classification_report(response_test, y_pred2))
print("")
print('Confusion Matrix with SVM1 (random parameters), kernel rbf:')
print("")
print(metrics.confusion_matrix(response_test, y_pred2))


Metrics of Classification with SVM1 (random parameters), kernel rbf:

             precision    recall  f1-score   support

          0       0.00      0.00      0.00       305
          1       0.57      1.00      0.72       405

avg / total       0.32      0.57      0.41       710


Confusion Matrix with SVM1 (random parameters), kernel rbf:

[[  0 305]
 [  2 403]]


In [43]:
# Precision
from sklearn.metrics import precision_score
precision_score(response_test, y_pred2, labels=None, pos_label=1, average= 'binary', sample_weight=None)

0.5692090395480226

In [44]:
# Recall
from sklearn.metrics import recall_score
recall_score(response_test, y_pred2, labels=None, pos_label=1, average= 'binary', sample_weight=None)

0.9950617283950617