# SVM 

In [2]:
# Load packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
plt.style.use('seaborn-whitegrid')
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

  from pandas.core import datetools


## 1. Import Standardized Data

### 1.1 Version 1 with only ratios as predictive features

In [2]:
# import Data (already splitted to train/test-data and selected features-> bc_randomforest_feature_selection)
X1_train_s = pd.read_csv("Data/generated_splits/X1_train_s.csv")
y1_train = pd.read_csv("Data/generated_splits/y1_train")

X1_test_s = pd.read_csv("Data/generated_splits/X1_test_s.csv")
y1_test = pd.read_csv("Data/generated_splits/y1_test")

#### 1.1.1 Set index

In [3]:
# rename column 1 from unnamed to index_number
colNms_X1_train = X1_train_s.columns.values
colNms_X1_train[0] = "index_number"
colNms_y1_train = y1_train.columns.values
colNms_y1_train[0] = "index_number"
colNms_X1_test = X1_test_s.columns.values
colNms_X1_test[0] = "index_number"
colNms_y1_test = y1_test.columns.values
colNms_y1_test[0] = "index_number"

# set index
X1_train_s = X1_train_s.set_index(["index_number"])
y1_train = y1_train.set_index(["index_number"])
X1_test_s = X1_test_s.set_index(["index_number"])
y1_test = y1_test.set_index(["index_number"])

### Version 2 with ratios + seasonality and other market data

In [3]:
# import Data (already splitted to train/test-data and selected features-> bc_randomforest_feature_selection)
X2_train_s = pd.read_csv("Data/generated_splits/X2_train_s.csv")
y2_train = pd.read_csv("Data/generated_splits/y2_train")

X2_test_s = pd.read_csv("Data/generated_splits/X2_test_s.csv")
y2_test = pd.read_csv("Data/generated_splits/y2_test")

#### 1.2.1 Set index

In [4]:
# rename column 1 from unnamed to index_number
colNms_X2_train = X2_train_s.columns.values
colNms_X2_train[0] = "index_number"
colNms_y2_train = y2_train.columns.values
colNms_y2_train[0] = "index_number"
colNms_X2_test = X2_test_s.columns.values
colNms_X2_test[0] = "index_number"
colNms_y2_test = y2_test.columns.values
colNms_y2_test[0] = "index_number"

# set index
X2_train_s = X2_train_s.set_index(["index_number"])
y2_train = y2_train.set_index(["index_number"])
X2_test_s = X2_test_s.set_index(["index_number"])
y2_test = y2_test.set_index(["index_number"])

## 2. SVM

### 2.1 Version 1

#### 2.1.1 Version 1 with random parameters (1)

In [18]:
# Create pipeline object with standard scaler and SVC estimator
pipe1 = Pipeline([('scaler', StandardScaler()), 
                 ('classifier', SVC(random_state=0))])

In [19]:
# Define parameter grid
param_grid1 = [{'scaler': [StandardScaler()],
               'classifier': [SVC(kernel='rbf')],
               'classifier__gamma': [1, 10],
               'classifier__C': [10, 100]},
              {'scaler': [StandardScaler(), None],
               'classifier': [LogisticRegression()],
               'classifier__C': [10, 100]}]

In [20]:
# Run grid search
grid1 = GridSearchCV(pipe1, param_grid1, cv=5, n_jobs=-1)
grid1.fit(X1_train_s, y1_train)

  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'classifier': [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol...olver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)], 'classifier__C': [10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [21]:
# Print results
print('Best CV accuracy: {:.2f}'.format(grid1.best_score_))
print('Test score:       {:.2f}'.format(grid1.score(X1_test_s, y1_test)))
print('Best parameters: {}'.format(grid1.best_params_))

Best CV accuracy: 0.59
Test score:       0.62
Best parameters: {'classifier': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'classifier__C': 100, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [22]:
# Predict   classes
y_pred1 = grid1.predict(X1_test_s)

In [23]:
print(metrics.classification_report(y1_test, y_pred1))
print(metrics.confusion_matrix(y1_test, y_pred1))

             precision    recall  f1-score   support

          0       0.62      0.29      0.39       307
          1       0.61      0.86      0.72       403

avg / total       0.62      0.62      0.58       710

[[ 89 218]
 [ 55 348]]


#### 2.1.2 Version 1 with other parameters (2)

In [24]:
# Create pipeline object with standard scaler and SVC estimator
pipe2 = Pipeline([('scaler', StandardScaler()), 
                 ('classifier', SVC(random_state=0))])

In [25]:
# Define parameter grid
param_grid2 = [{'scaler': [StandardScaler()],
               'classifier': [SVC(kernel='linear')],
               'classifier__gamma': [1, 10],
               'classifier__C': [10, 100]},
              {'scaler': [StandardScaler(), None],
               'classifier': [LogisticRegression()],
               'classifier__C': [10, 100]}]

In [26]:
# Run grid search
grid2 = GridSearchCV(pipe2, param_grid2, cv=5, n_jobs=-1)
grid2.fit(X1_train_s, y1_train)

  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'classifier': [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  ...olver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)], 'classifier__C': [10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [27]:
# Print results
print('Best CV accuracy: {:.2f}'.format(grid2.best_score_))
print('Test score:       {:.2f}'.format(grid2.score(X1_test_s, y1_test)))
print('Best parameters: {}'.format(grid2.best_params_))

Best CV accuracy: 0.59
Test score:       0.62
Best parameters: {'classifier': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'classifier__C': 100, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [28]:
# Predict   classes
y_pred2 = grid2.predict(X1_test_s)

In [29]:
print(metrics.classification_report(y1_test, y_pred2))
print(metrics.confusion_matrix(y1_test, y_pred2))

             precision    recall  f1-score   support

          0       0.62      0.29      0.39       307
          1       0.61      0.86      0.72       403

avg / total       0.62      0.62      0.58       710

[[ 89 218]
 [ 55 348]]


#### 2.1.3 Version 1 with other parameters (3)

In [6]:
# Create pipeline object with standard scaler and SVC estimator
#pipe3 = Pipeline([('scaler', StandardScaler()), 
                 #('classifier', SVC(random_state=0))])

In [7]:
# Define parameter grid
#param_grid3 = [{'scaler': [StandardScaler()],
               #'classifier': [SVC(kernel= "poly")],
               #'classifier__gamma': [1, 10],
               #'classifier__C': [10, 100]},
              #{'scaler': [StandardScaler(), None],
               #'classifier': [LogisticRegression()],
               #'classifier__C': [10, 100]}]

In [None]:
# Run grid search
#grid3 = GridSearchCV(pipe3, param_grid3, cv=5, n_jobs=-1)
#grid3.fit(X1_train_s, y1_train)

In [None]:
# Print results
#print('Best CV accuracy: {:.2f}'.format(grid3.best_score_))
#print('Test score:       {:.2f}'.format(grid3.score(X1_test_s, y1_test)))
#print('Best parameters: {}'.format(grid3.best_params_))

In [None]:
# Predict   classes
#y_pred3 = grid3.predict(X1_test_s)

In [None]:
#print(metrics.classification_report(y1_test, y_pred3))
#print(metrics.confusion_matrix(y1_test, y_pred3))

In [None]:
# compare different svm versions with different parameters and try the best one with balanced data

### 2.2 Version 2

#### 2.2.1 Version 2 with random parameters (1)

In [5]:
# Create pipeline object with standard scaler and SVC estimator
pipe5 = Pipeline([('scaler', StandardScaler()), 
                 ('classifier', SVC(random_state=0))])

In [6]:
# Define parameter grid
param_grid5 = [{'scaler': [StandardScaler()],
               'classifier': [SVC(kernel='rbf')],
               'classifier__gamma': [1, 10],
               'classifier__C': [10, 100]},
              {'scaler': [StandardScaler(), None],
               'classifier': [LogisticRegression()],
               'classifier__C': [10, 100]}]

In [8]:
# Run grid search
grid5 = GridSearchCV(pipe5, param_grid5, cv=5, n_jobs=-1)
grid5.fit(X2_train_s, y2_train)

  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'classifier': [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol...olver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)], 'classifier__C': [10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [9]:
# Print results
print('Best CV accuracy: {:.2f}'.format(grid5.best_score_))
print('Test score:       {:.2f}'.format(grid5.score(X2_test_s, y2_test)))
print('Best parameters: {}'.format(grid5.best_params_))

Best CV accuracy: 0.99
Test score:       1.00
Best parameters: {'classifier': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'classifier__C': 100, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [10]:
# Predict   classes
y_pred5 = grid5.predict(X2_test_s)

In [11]:
print(metrics.classification_report(y2_test, y_pred5))
print(metrics.confusion_matrix(y2_test, y_pred5))

             precision    recall  f1-score   support

          0       0.99      1.00      1.00       307
          1       1.00      0.99      1.00       403

avg / total       1.00      1.00      1.00       710

[[307   0]
 [  3 400]]


#### 2.2.2 Version 2 with other parameters (2)

In [12]:
# Create pipeline object with standard scaler and SVC estimator
pipe6 = Pipeline([('scaler', StandardScaler()), 
                 ('classifier', SVC(random_state=0))])

In [15]:
# Define parameter grid
param_grid6 = [{'scaler': [StandardScaler()],
               'classifier': [SVC(kernel='linear')],
               'classifier__gamma': [1, 10],
               'classifier__C': [10, 100]},
              {'scaler': [StandardScaler(), None],
               'classifier': [LogisticRegression()],
               'classifier__C': [10, 100]}]

In [16]:
# Run grid search
grid6 = GridSearchCV(pipe6, param_grid6, cv=5, n_jobs=-1)
grid6.fit(X2_train_s, y2_train)

  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'classifier': [SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0...olver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)], 'classifier__C': [10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [17]:
# Print results
print('Best CV accuracy: {:.2f}'.format(grid6.best_score_))
print('Test score:       {:.2f}'.format(grid6.score(X2_test_s, y2_test)))
print('Best parameters: {}'.format(grid6.best_params_))

Best CV accuracy: 0.99
Test score:       1.00
Best parameters: {'classifier': SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), 'classifier__C': 100, 'classifier__gamma': 1, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [18]:
# Predict   classes
y_pred6 = grid6.predict(X2_test_s)

In [19]:
print(metrics.classification_report(y2_test, y_pred6))
print(metrics.confusion_matrix(y2_test, y_pred6))

             precision    recall  f1-score   support

          0       0.99      1.00      1.00       307
          1       1.00      1.00      1.00       403

avg / total       1.00      1.00      1.00       710

[[307   0]
 [  2 401]]


#### 2.2.3 Version 2 with other parameters (3)

In [None]:
# Create pipeline object with standard scaler and SVC estimator
#pipe7 = Pipeline([('scaler', StandardScaler()), 
                 #('classifier', SVC(random_state=0))])

In [None]:
# Define parameter grid
#param_grid7 = [{'scaler': [StandardScaler()],
               #'classifier': [SVC(kernel= "poly")],
               #'classifier__gamma': [1, 10],
               #'classifier__C': [10, 100]},
              #{'scaler': [StandardScaler(), None],
               #'classifier': [LogisticRegression()],
               #s'classifier__C': [10, 100]}]

In [None]:
# Run grid search
#grid7 = GridSearchCV(pipe7, param_grid7, cv=5, n_jobs=-1)
#grid7.fit(X2_train_s, y2_train)

In [None]:
# Print results
#print('Best CV accuracy: {:.2f}'.format(grid7.best_score_))
#print('Test score:       {:.2f}'.format(grid7.score(X2_test_s, y2_test)))
#print('Best parameters: {}'.format(grid7.best_params_))

In [None]:
# Predict   classes
#y_pred7 = grid7.predict(X2_test_s)

In [None]:
#print(metrics.classification_report(y2_test, y_pred7))
#print(metrics.confusion_matrix(y2_test, y_pred7))