# Introduction to Machine Learning, UZH 2018, Group Project
### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch
##       
# 3. Support Vector Machines (SVM)


In this section we use the feature matrices and response vectors with features selected in chapter 2.  

#### We use two different versions (created in chapter 1, features-selected in chapter 2):
Version 1: Feature Matrix consists only of the Ratios                                                                        
Version 2: Feature Matrix consists of Ratios + dummy variables for seasonality + other market data
####  We will do Classification and Prediction with SVM


In [1]:
# hide unnecessary warnings ("deprecation" of packages etc.)
import warnings
warnings.filterwarnings('ignore')

# Load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
plt.style.use('seaborn-whitegrid')
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## 3.0. SETTINGS

### (1) Choose the Dataset Version you want

##### Whole Feature Matrix (Features not pre-selected)
VERSION = 1; Feature Matrix with only ratios                                  
VERSION = 2;  Feature Matrix with ratios + seasonality + other market data

##### Reduced Feature Matrix (Features pre-selected)
VERSION = 1.1; Reduced Feature Matrix with only ratios                                  
VERSION = 2.1;  Reduced Feature Matrix with ratios + seasonality + other market data



In [2]:
### Choose which dataset version the feature selection and the prediction are based on.
### INSERT NUMBER 1, 2, 1.1 or 2.1
VERSION = 1.1

# Human-readable description of every supported VERSION, used later when printing results.
_version_descriptions = {
    1: 'Based on whole original Dataset with only the Ratios Dataset as predicive Features.',
    2: 'Based on whole original Dataset with Ratios + Seasonality + other Market Data as predictive Features.',
    1.1: 'Based on reduced Dataset with only the Ratios Dataset as predicive Features.',
    2.1: 'Based on reduced Dataset with Ratios + Seasonality + other Market Data as predictive Features.',
}

# Reject anything that is not one of the four supported versions.
if VERSION not in _version_descriptions:
    raise ValueError('VERSION must be either 1, 2, 1.1 or 2.1')

sel_version = _version_descriptions[VERSION]

### (2) If you chose VERSION 1.1 or VERSION 2.1 (Reduced Feature Matrix):
### => Choose the method with which the features should have been pre-selected / reduced

##### You have the choice between:
mySELECTION  = RF ; Features pre-selected with Random Forest Classifier                                                           
mySELECTION = PCA; Features pre-selected with Principal Component Analysis (PCA)                                         

##### By Default;
If VERSION 1 or VERSION 2 was chosen above: SELECTION = none by Default; no features pre-selected. You don't need to define variable mySELECTION.

In [15]:
### Choose whether you want the datasets with features selected with RF or PCA or the original file
### INSERT WISHED METHOD 'RF', 'PCA'
mySELECTION = 'RF'


# Control logic: derive SELECTION from VERSION and stop with an explanatory error on invalid settings.
if VERSION == 1 or VERSION == 2:
    # Whole feature matrix: no pre-selection method applies, mySELECTION is ignored.
    SELECTION = 'none'
elif VERSION == 1.1 or VERSION == 2.1:
    SELECTION = mySELECTION
    # Compare by value. The previous `is not 'RF'` tested object identity, which only works
    # by accident of string interning and fails for equal strings built at runtime.
    if mySELECTION not in ('RF', 'PCA'):
        raise ValueError('Because VERSION '+str(VERSION)+' is chosen, mySELECTION must be set as either RF or PCA.')
else:
    raise ValueError('VERSION must be either 1, 2, 1.1 or 2.1. mySELECTION must be chosen as either RF or PCA.')

# Define sel_feat (name of the chosen feature selection method) and the briefing text printed later.
if SELECTION == 'RF':
    sel_feat = 'Random Forest (RF)'
    briefing = ('Chosen VERSION '+str(VERSION)+', Feature Matrix reduced, Features pre-selected with '+str(sel_feat)+'.')
elif SELECTION == 'PCA':
    sel_feat = 'Principal Component Analysis (PCA)'
    briefing = ('Chosen VERSION '+str(VERSION)+', Feature Matrix reduced, Features pre-selected with '+str(sel_feat)+'.')
elif SELECTION == 'none':
    sel_feat = 'No Feature Selection Method available.'
    briefing = ('Chosen VERSION '+str(VERSION)+' has no Feature Selection Method. Feature Matrix is whole, not reduced.'+' SELECTION is therefore "none" by Default.')
else:
    # Defensive guard; the validation above should already have rejected any other value.
    raise ValueError('mySELECTION must be chosen as either RF or PCA')

#print(sel_feat)

### (3) SUMMARY OF SETTINGS

In [16]:
# Summarise the chosen settings; each of the first two messages is followed by an empty line.
print('{} \n'.format(briefing))
print('VERSION {} is {} \n'.format(VERSION, sel_version))
print('You are now done with the Settings. You can run the whole Code now by Default.')

Chosen VERSION 1.1, Feature Matrix reduced, Features pre-selected with Random Forest (RF). 

VERSION 1.1 is Based on reduced Dataset with only the Ratios Dataset as predicive Features. 

You are now done with the Settings. You can run the whole Code now by Default.


## 3.1. Preparation

### 3.1.1. Import the Response Vector and the Feature Matrix

In [17]:
####################### NEW COMMENT (open question from the authors, translated from German)
# In version 1 and 2: load the whole feature matrix and the whole response vector? OR already split?
# Then the split would have to be done in DataPrep for the dataset without feature pre-selection,
# saved in generated_splits without a subfolder,
# and read in here as X_train, y_train, X_test and y_test.
#######################


### Import the data.
### VERSION 1 / 2 load the whole feature matrix X and response y.
### VERSION 1.1 / 2.1 load the train/test files written by the feature selection (suffix _s).
if VERSION == 1 or VERSION == 2:
    # Features not pre-selected: VERSION 1 = only ratios, VERSION 2 = ratios + seasonality + market data
    _features_csv = 'features_ratios_1.csv' if VERSION == 1 else 'features_additional_1.csv'
    X = pd.read_csv('Data/generated_datasets/' + _features_csv, sep=',', header=0)
    y = pd.read_csv('Data/generated_datasets/response_1.csv', sep=',', header=0)
elif VERSION == 1.1 or VERSION == 2.1:
    # Features pre-selected: the folder and file suffix depend on the selection method,
    # the number in the file name (1 or 2) on the dataset version.
    _subfolders = {'RF': ('features_selected_randomforest', 'f'),
                   'PCA': ('features_selected_pca', 'p')}
    if SELECTION in _subfolders:
        _folder, _tag = _subfolders[SELECTION]
        _num = 1 if VERSION == 1.1 else 2
        _path = 'Data/generated_splits/{}/{}{}_{}_{}.csv'
        X_train_s = pd.read_csv(_path.format(_folder, 'X', _num, 'train', _tag), sep=',', header=0)
        X_test_s = pd.read_csv(_path.format(_folder, 'X', _num, 'test', _tag), sep=',', header=0)
        y_train_s = pd.read_csv(_path.format(_folder, 'y', _num, 'train', _tag), sep=',', header=0)
        y_test_s = pd.read_csv(_path.format(_folder, 'y', _num, 'test', _tag), sep=',', header=0)
else:
    raise ValueError('VERSION value must be either 1, 2, 1.1 or 2.1, mySELECTION must be chosen as either RF or PCA.')

#### Train-Test-split for original files

In [29]:
####################### NEW COMMENT
# The split could also be done in the data preparation file already;
# doing it here works just as well.
#######################


# For VERSION 1 or 2 the imported (whole) datasets still have to be split and imputed here.
if VERSION == 1 or VERSION == 2:
    # Train/test split into 20% test size and 80% train size because it is a relatively small dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # `sklearn.preprocessing.Imputer` was removed from newer scikit-learn releases;
    # prefer its replacement and fall back to the old class on old installations.
    try:
        from sklearn.impute import SimpleImputer
        imp = SimpleImputer(missing_values=np.nan, strategy='median')
    except ImportError:
        from sklearn.preprocessing import Imputer
        imp = Imputer(missing_values=np.nan, strategy='median', axis=0)

    # Learn the column medians on the training set only and fill its missing values.
    X_train = pd.DataFrame(imp.fit_transform(X_train), columns=X_train.columns, index=X_train.index)

    # Fill the test set with the medians learned on the training set. Refitting the imputer
    # on the test set (as before) would leak test-set statistics into the evaluation.
    X_test = pd.DataFrame(imp.transform(X_test), columns=X_test.columns, index=X_test.index)

    # Extract the feature labels (column names of X)
    feature_labels = list(X)
    print('Type of feature_labels = ' + str(type(feature_labels)), '\n')

else: print('No Train/Test split needed for chosen VERSION '+str(VERSION)+
            '. Loaded Datasets were already pre-splitted and imputed in'+'\n'+'Feature Selection (Chapter 2).')

No Train/Test split needed for chosen VERSION 1.1. Loaded Datasets were already pre-splitted and imputed in
Feature Selection (Chapter 2).


### 3.1.2. Print out Shape and Form of Feature Matrix and Response Vector

### Train Set

In [None]:
# Show shape and first rows of the training feature matrix and response vector.
print('Features Selected with ' + str(sel_feat))
print('Version ' + str(VERSION) + '; ' + str(sel_version))

print("")
# VERSION 1 and 2 use the split made in this notebook; the reduced versions use the
# pre-split files (suffix _s). The comparison must be `==`; `=` is a syntax error here.
if VERSION == 1 or VERSION == 2:
    _X_shown, _y_shown = X_train, y_train
else:
    _X_shown, _y_shown = X_train_s, y_train_s

print('Shape (rows, columns) of Feature Matrix X (Train) ' + '=' + str(_X_shown.shape))
print("")
print('Feature Matrix X (Train) with Selected Features')
display(_X_shown[0:3])
print("")
print('Response Vector y (Train) after Feature Selection')
display(_y_shown[0:3])
print("")

### Test Set

In [None]:
# Show shape and first rows of the test feature matrix and response vector.
print('Features Selected with ' + str(sel_feat))
print('Version ' + str(VERSION) + '; ' + str(sel_version))

print("")
# VERSION 1 and 2 use the split made in this notebook; the reduced versions use the
# pre-split files (suffix _s). The comparison must be `==`; `=` is a syntax error here.
if VERSION == 1 or VERSION == 2:
    _X_shown, _y_shown = X_test, y_test
else:
    _X_shown, _y_shown = X_test_s, y_test_s

print('Shape (rows, columns) of Feature Matrix X (Test) ' + '=' + str(_X_shown.shape))
print("")
print('Feature Matrix X (Test) with Selected Features')
display(_X_shown[0:3])
print("")
print('Response Vector y (Test) after Feature Selection')
display(_y_shown[0:3])
print("")

## 3.2. SVM

### Two different SVM tests are applied:
#### => SVM1 = SVM with random parameters
#### => SVM2 = SVM with other parameters

### 3.2.1. SVM1 : SVM with random parameters

### Kernel: rbf

In [None]:
# Two-step pipeline: the StandardScaler standardizes the input variables,
# then an SVC estimator classifies them.
_steps1 = [
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=0)),
]
pipe1 = Pipeline(steps=_steps1)

In [None]:
# Parameter grid for the search, made of two sub-grids.
# NOTE(review): the second sub-grid swaps the classifier for LogisticRegression, so the
# best model found is not necessarily an SVM - confirm this is intended.
_svc_rbf_grid = {
    'scaler': [StandardScaler()],
    'classifier': [SVC(kernel='rbf')],
    'classifier__gamma': [1, 10],
    'classifier__C': [10, 100],
}
_logreg_grid1 = {
    'scaler': [StandardScaler(), None],
    'classifier': [LogisticRegression()],
    'classifier__C': [10, 100],
}
param_grid1 = [_svc_rbf_grid, _logreg_grid1]

In [None]:
# Run the grid search over param_grid1 with cv=5.
grid1 = GridSearchCV(pipe1, param_grid1, cv=5, n_jobs=-1)

# VERSION 1 and 2 train on the split made in this notebook,
# every other version on the pre-split files (suffix _s).
if VERSION in (1, 2):
    grid1.fit(X_train, y_train)
else:
    grid1.fit(X_train_s, y_train_s)

In [None]:
# Report the best cross-validation score, the score on the test set and the winning parameters.
print("")
print('Best CV accuracy: {:.2f}'.format(grid1.best_score_))
# Score on the test data that belongs to the chosen VERSION.
_X_eval, _y_eval = (X_test, y_test) if VERSION in (1, 2) else (X_test_s, y_test_s)
print('Test score:       {:.2f}'.format(grid1.score(_X_eval, _y_eval)))
print("")
print('Best parameters: {}'.format(grid1.best_params_))

In [None]:
# Predict the classes of the test features that belong to the chosen VERSION
# and show the first 20 predictions.
_X_eval = X_test if VERSION in (1, 2) else X_test_s
y_pred1 = grid1.predict(_X_eval)

display(y_pred1[0:20])

In [None]:
# Classification report and confusion matrix of the y_pred1 predictions.
print("")
print('Metrics of Classification with SVM1 (random parameters), kernel rbf:')
print("")
# True labels that belong to the chosen VERSION.
_y_true = y_test if VERSION in (1, 2) else y_test_s
print(metrics.classification_report(_y_true, y_pred1))

print("")
print('Confusion Matrix with SVM1 (random parameters), kernel rbf:')
print("")
print(metrics.confusion_matrix(_y_true, y_pred1))

### 3.2.2. SVM2 : SVM with other parameters

### Kernel: linear

In [None]:
# Two-step pipeline: the StandardScaler standardizes the input variables,
# then an SVC estimator classifies them.
_steps2 = [
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=0)),
]
pipe2 = Pipeline(steps=_steps2)

In [None]:
# Parameter grid for the search, made of two sub-grids.
# The linear-kernel sub-grid only varies C: gamma is a kernel coefficient for the
# rbf/poly/sigmoid kernels and has no effect on a linear kernel, so searching over it
# only fitted every candidate twice with identical results.
# NOTE(review): the second sub-grid swaps the classifier for LogisticRegression, so the
# best model found is not necessarily an SVM - confirm this is intended.
param_grid2 = [{'scaler': [StandardScaler()],
               'classifier': [SVC(kernel='linear')],
               'classifier__C': [10, 100]},
              {'scaler': [StandardScaler(), None],
               'classifier': [LogisticRegression()],
               'classifier__C': [10, 100]}]

In [None]:
# Run the grid search over param_grid2 with cv=5.
grid2 = GridSearchCV(pipe2, param_grid2, cv=5, n_jobs=-1)

# VERSION 1 and 2 train on the split made in this notebook,
# every other version on the pre-split files (suffix _s).
if VERSION in (1, 2):
    grid2.fit(X_train, y_train)
else:
    grid2.fit(X_train_s, y_train_s)

In [None]:
# Report the best cross-validation score, the score on the test set and the winning parameters.
print("")
print('Best CV accuracy: {:.2f}'.format(grid2.best_score_))
# Score on the test data that belongs to the chosen VERSION.
_X_eval, _y_eval = (X_test, y_test) if VERSION in (1, 2) else (X_test_s, y_test_s)
print('Test score:       {:.2f}'.format(grid2.score(_X_eval, _y_eval)))
print("")
print('Best parameters: {}'.format(grid2.best_params_))

In [None]:
# Predict the classes of the test features that belong to the chosen VERSION
# and show the first 20 predictions.
_X_eval = X_test if VERSION in (1, 2) else X_test_s
y_pred2 = grid2.predict(_X_eval)

display(y_pred2[0:20])

In [None]:
# Classification report and confusion matrix of the y_pred2 predictions.
print("")
print('Metrics of Classification with SVM2 (random parameters), kernel linear:')
print("")
# True labels that belong to the chosen VERSION.
_y_true = y_test if VERSION in (1, 2) else y_test_s
print(metrics.classification_report(_y_true, y_pred2))

print("")
print('Confusion Matrix with SVM2 (random parameters), kernel linear:')
print("")
print(metrics.confusion_matrix(_y_true, y_pred2))

### Kernel: poly

In [None]:
# Two-step pipeline: a StandardScaler followed by an SVC estimator.
_steps3 = [
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=0)),
]
pipe3 = Pipeline(steps=_steps3)

In [None]:
# Parameter grid for the search, made of two sub-grids.
# NOTE(review): the second sub-grid swaps the classifier for LogisticRegression, so the
# best model found is not necessarily an SVM - confirm this is intended.
_svc_poly_grid = {
    'scaler': [StandardScaler()],
    'classifier': [SVC(kernel="poly")],
    'classifier__gamma': [1, 10],
    'classifier__C': [10, 100],
}
_logreg_grid3 = {
    'scaler': [StandardScaler(), None],
    'classifier': [LogisticRegression()],
    'classifier__C': [10, 100],
}
param_grid3 = [_svc_poly_grid, _logreg_grid3]

In [None]:
# Run the grid search over param_grid3 with cv=5.
grid3 = GridSearchCV(pipe3, param_grid3, cv=5, n_jobs=-1)

# VERSION 1 and 2 train on the split made in this notebook,
# every other version on the pre-split files (suffix _s).
if VERSION in (1, 2):
    grid3.fit(X_train, y_train)
else:
    grid3.fit(X_train_s, y_train_s)

In [None]:
# Report the best cross-validation score, the score on the test set and the winning parameters.
print("")
print('Best CV accuracy: {:.2f}'.format(grid3.best_score_))
# Score on the test data that belongs to the chosen VERSION.
_X_eval, _y_eval = (X_test, y_test) if VERSION in (1, 2) else (X_test_s, y_test_s)
print('Test score:       {:.2f}'.format(grid3.score(_X_eval, _y_eval)))
print("")
print('Best parameters: {}'.format(grid3.best_params_))

In [None]:
# Predict the classes of the test features that belong to the chosen VERSION
# and show the first 20 predictions.
_X_eval = X_test if VERSION in (1, 2) else X_test_s
y_pred3 = grid3.predict(_X_eval)

display(y_pred3[0:20])

In [None]:
# Classification report and confusion matrix of the y_pred3 predictions.
print("")
print('Metrics of Classification with SVM3 (random parameters), kernel poly:')
print("")
# True labels that belong to the chosen VERSION.
_y_true = y_test if VERSION in (1, 2) else y_test_s
print(metrics.classification_report(_y_true, y_pred3))

print("")
print('Confusion Matrix with SVM3 (random parameters), kernel poly:')
print("")
print(metrics.confusion_matrix(_y_true, y_pred3))