# ML in Finance Group Project
### Group 2: Barbara Capl, Mathias Lüthi, Pamela Matias, Stefanie Rentsch
## 3. SVM

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
plt.style.use('seaborn-whitegrid')
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer

  from pandas.core import datetools


### Import Feature Matrix and Response Vector

In [2]:
# Load in the response vector 
y = pd.read_csv('Data/generated_datasets/response_1.csv', sep=',', header=0)

# Choose which attribute matrice you want to load in by uncommenting it.
X = pd.read_csv('Data/generated_datasets/attributes_ratios_1.csv', sep=',', header=0)
# X = pd.read_csv('Data/generated_datasets/attributes_additional_1.csv', sep=',', header=0)

### Test/ Train Split and fill missing values with mean

In [3]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Use a mean fill for train
imp = Imputer(missing_values=np.nan, strategy = 'mean' , axis=0)
imputed_dataset = pd.DataFrame(imp.fit_transform(X_train))
imputed_dataset.columns = X_train.columns
imputed_dataset.index = X_train.index
X_train = imputed_dataset

# Use a mean fill for the test set
imputed_dataset = pd.DataFrame(imp.fit_transform(X_test))
imputed_dataset.columns = X_test.columns
imputed_dataset.index = X_test.index
X_test = imputed_dataset

# Extract the feature labels
feature_labels = list(X_train)

### SVM

In [4]:
# Create pipeline object with standard scaler and SVC estimator
pipe = Pipeline([('scaler', StandardScaler()), 
                 ('classifier', SVC(random_state=0))])

In [5]:
# Define parameter grid
param_grid = [{'scaler': [StandardScaler()],
               'classifier': [SVC(kernel='rbf')],
               'classifier__gamma': [1, 10],
               'classifier__C': [10, 100]},
              {'scaler': [StandardScaler(), None],
               'classifier': [LogisticRegression()],
               'classifier__C': [10, 100]}]

In [6]:
# Run grid search
grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'scaler': [StandardScaler(copy=True, with_mean=True, with_std=True)], 'classifier': [SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol...olver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)], 'classifier__C': [10, 100]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [7]:
# Print results
print("Observed probability of up/down?: {: .2f}"
       .format(np.count_nonzero(y==0)/len(y)))
# Print results
print('Best CV accuracy: {:.2f}'.format(grid.best_score_))
print('Test score:       {:.2f}'.format(grid.score(X_test, y_test)))
print('Best parameters: {}'.format(grid.best_params_))

Observed probability of up/down?:  0.44
Best CV accuracy: 0.61
Test score:       0.64
Best parameters: {'classifier': LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False), 'classifier__C': 100, 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True)}


In [8]:
# Predict   classes
y_pred = grid.predict(X_test)

In [9]:
print(metrics.classification_report(y_test, y_pred))
print(metrics.confusion_matrix(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.61      0.47      0.53       307
          1       0.66      0.77      0.71       403

avg / total       0.64      0.64      0.63       710

[[145 162]
 [ 91 312]]
