# SVM 

In [None]:
# Load packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer accept the old names, so use whichever spelling
# this installation provides.
plt.style.use('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')

## 1. Import Standardized Data

### 1.1 Version 1 with only ratios as predictive features

In [None]:
# Import data (already split into train/test sets with selected features -> bc_randomforest_feature_selection)
# Each frame is read from its own CSV and indexed by (PERMNO, DATE).
X1_train_s = pd.read_csv("Data/generated_splits/X1_train_s.csv").set_index(["PERMNO", "DATE"])
y1_train_s = pd.read_csv("Data/generated_splits/y1_train_s.csv").set_index(["PERMNO", "DATE"])

# The test frames must come from the test files; they are never derived from
# the training frames.
X1_test_s = pd.read_csv("Data/generated_splits/X1_test_s.csv").set_index(["PERMNO", "DATE"])
y1_test_s = pd.read_csv("Data/generated_splits/y1_test_s.csv").set_index(["PERMNO", "DATE"])

# Show both previews; a bare expression is only rendered when it is the last
# line of the cell.
display(X1_train_s.head(3), y1_train_s.head(3))

### 1.2 Version 2 with ratios + seasonality and other market data

In [None]:
# Import data (already split into train/test sets with selected features -> bc_randomforest_feature_selection)
# NOTE(review): the train files are read from the "_s" (standardized) splits so
# they match the test files and the Version 1 cell — confirm that
# X2_train_s.csv / y2_train_s.csv exist in Data/generated_splits.
X2_train_s = pd.read_csv("Data/generated_splits/X2_train_s.csv").set_index(["PERMNO", "DATE"])
y2_train_s = pd.read_csv("Data/generated_splits/y2_train_s.csv").set_index(["PERMNO", "DATE"])

# The test frames must come from the test files; they are never derived from
# the training frames.
X2_test_s = pd.read_csv("Data/generated_splits/X2_test_s.csv").set_index(["PERMNO", "DATE"])
y2_test_s = pd.read_csv("Data/generated_splits/y2_test_s.csv").set_index(["PERMNO", "DATE"])

# Show both previews; a bare expression is only rendered when it is the last
# line of the cell.
display(X2_train_s.head(3), y2_train_s.head(3))

## 2. SVM

### 2.1 Version 1

#### 2.1.1 Version 1 with random parameters (1)

In [None]:
# Two-step pipeline: a feature scaler feeding a support vector classifier.
# The step names "scaler" and "classifier" are the keys the parameter grid uses.
pipe1 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", SVC(random_state=0)),
    ]
)

In [None]:
# Search space, expressed as two independent sub-grids
param_grid1 = [
    # RBF-kernel SVC on scaled features, varying gamma and C
    {
        "scaler": [StandardScaler()],
        "classifier": [SVC(kernel="rbf")],
        "classifier__gamma": [1, 10],
        "classifier__C": [10, 100],
    },
    # Logistic regression with and without scaling, varying C
    {
        "scaler": [StandardScaler(), None],
        "classifier": [LogisticRegression()],
        "classifier__C": [10, 100],
    },
]

In [None]:
# Run the 5-fold cross-validated grid search on all available cores.
# The target is flattened to 1-D because scikit-learn expects y of shape
# (n_samples,) and warns when given a column vector
# (assumes y1_train_s holds exactly one target column — TODO confirm).
grid1 = GridSearchCV(pipe1, param_grid1, cv=5, n_jobs=-1)
grid1.fit(X1_train_s, np.ravel(y1_train_s))

In [None]:
# Baseline: share of class-0 labels in the Version 1 test set, to put the
# accuracies below into perspective
print("Observed probability of up/down?: {: .2f}"
      .format(np.count_nonzero(y1_test_s == 0) / len(y1_test_s)))
# Best cross-validated score, score on the held-out test set, and the winning settings
print('Best CV accuracy: {:.2f}'.format(grid1.best_score_))
print('Test score:       {:.2f}'.format(grid1.score(X1_test_s, y1_test_s)))
print('Best parameters: {}'.format(grid1.best_params_))

In [None]:
# Class predictions for the Version 1 test set from the fitted grid search
y_pred1 = grid1.predict(X=X1_test_s)

In [None]:
# Classification report followed by the confusion matrix for the test predictions
print(
    metrics.classification_report(y1_test_s, y_pred1),
    metrics.confusion_matrix(y1_test_s, y_pred1),
    sep="\n",
)

#### 2.1.2 Version 1 with other parameters (2)

In [None]:
# Two-step pipeline: a feature scaler feeding a support vector classifier.
# The step names "scaler" and "classifier" are the keys the parameter grid uses.
pipe2 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", SVC(random_state=0)),
    ]
)

In [None]:
# Search space, expressed as two independent sub-grids
param_grid2 = [
    # Linear-kernel SVC on scaled features. Only C is searched: gamma is a
    # coefficient of the rbf/poly/sigmoid kernels and has no effect on a linear
    # kernel, so varying it would only refit identical models.
    {
        "scaler": [StandardScaler()],
        "classifier": [SVC(kernel="linear")],
        "classifier__C": [10, 100],
    },
    # Logistic regression with and without scaling, varying C
    {
        "scaler": [StandardScaler(), None],
        "classifier": [LogisticRegression()],
        "classifier__C": [10, 100],
    },
]

In [None]:
# Run the 5-fold cross-validated grid search on all available cores.
# The target is flattened to 1-D because scikit-learn expects y of shape
# (n_samples,) and warns when given a column vector
# (assumes y1_train_s holds exactly one target column — TODO confirm).
grid2 = GridSearchCV(pipe2, param_grid2, cv=5, n_jobs=-1)
grid2.fit(X1_train_s, np.ravel(y1_train_s))

In [None]:
# Baseline: share of class-0 labels in the Version 1 test set, to put the
# accuracies below into perspective
print("Observed probability of up/down?: {: .2f}"
      .format(np.count_nonzero(y1_test_s == 0) / len(y1_test_s)))
# Best cross-validated score, score on the held-out test set, and the winning settings
print('Best CV accuracy: {:.2f}'.format(grid2.best_score_))
print('Test score:       {:.2f}'.format(grid2.score(X1_test_s, y1_test_s)))
print('Best parameters: {}'.format(grid2.best_params_))

In [None]:
# Class predictions for the Version 1 test set from the fitted grid search
y_pred2 = grid2.predict(X=X1_test_s)

In [None]:
# Classification report followed by the confusion matrix for the test predictions
print(
    metrics.classification_report(y1_test_s, y_pred2),
    metrics.confusion_matrix(y1_test_s, y_pred2),
    sep="\n",
)

#### 2.1.3 Version 1 with other parameters (3)

In [None]:
# Two-step pipeline: a feature scaler feeding a support vector classifier.
# The step names "scaler" and "classifier" are the keys the parameter grid uses.
pipe3 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", SVC(random_state=0)),
    ]
)

In [None]:
# Search space, expressed as two independent sub-grids
param_grid3 = [
    # Polynomial-kernel SVC on scaled features, varying gamma and C
    {
        "scaler": [StandardScaler()],
        "classifier": [SVC(kernel="poly")],
        "classifier__gamma": [1, 10],
        "classifier__C": [10, 100],
    },
    # Logistic regression with and without scaling, varying C
    {
        "scaler": [StandardScaler(), None],
        "classifier": [LogisticRegression()],
        "classifier__C": [10, 100],
    },
]

In [None]:
# Run the 5-fold cross-validated grid search on all available cores.
# The target is flattened to 1-D because scikit-learn expects y of shape
# (n_samples,) and warns when given a column vector
# (assumes y1_train_s holds exactly one target column — TODO confirm).
grid3 = GridSearchCV(pipe3, param_grid3, cv=5, n_jobs=-1)
grid3.fit(X1_train_s, np.ravel(y1_train_s))

In [None]:
# Baseline: share of class-0 labels in the Version 1 test set, to put the
# accuracies below into perspective
print("Observed probability of up/down?: {: .2f}"
      .format(np.count_nonzero(y1_test_s == 0) / len(y1_test_s)))
# Best cross-validated score, score on the held-out test set, and the winning settings
print('Best CV accuracy: {:.2f}'.format(grid3.best_score_))
print('Test score:       {:.2f}'.format(grid3.score(X1_test_s, y1_test_s)))
print('Best parameters: {}'.format(grid3.best_params_))

In [None]:
# Class predictions for the Version 1 test set from the fitted grid search
y_pred3 = grid3.predict(X=X1_test_s)

In [None]:
# Classification report followed by the confusion matrix for the test predictions
print(
    metrics.classification_report(y1_test_s, y_pred3),
    metrics.confusion_matrix(y1_test_s, y_pred3),
    sep="\n",
)

In [None]:
# TODO: compare the SVM variants above (different kernels and parameters) and rerun the best one with balanced class weights

#### Best SVM with balanced data

In [None]:
# TODO: find a way to select the best SVM automatically (e.g. by comparing best_score_ across grid1, grid2 and grid3) and reuse it for further "research"

In [None]:
# Create pipeline object with standard scaler and SVC estimator
# pipe4 = Pipeline([('scaler', StandardScaler()), 
                 #('classifier', SVC(random_state=0))])

In [None]:
# Define parameter grid
# param_grid4 = [{'scaler': [StandardScaler()],
              # 'classifier': [SVC(kernel= "poly", class_weight = "balanced")],
             #  'classifier__gamma': [1, 10],
              # 'classifier__C': [10, 100]},
              #{'scaler': [StandardScaler(), None],
               #'classifier': [LogisticRegression()],
               #'classifier__C': [10, 100]}]

In [None]:
# Run grid search
#grid4 = GridSearchCV(pipe4, param_grid4, cv=5, n_jobs=-1)
#grid4.fit(X1_train_s, y1_train_s)

In [None]:
# Print results
#print("Observed probability of up/down?: {: .2f}"
       #.format(np.count_nonzero(y==0)/len(y)))
# Print results
#print('Best CV accuracy: {:.2f}'.format(grid4.best_score_))
#print('Test score:       {:.2f}'.format(grid4.score(X1_test_s, y1_test_s)))
#print('Best parameters: {}'.format(grid4.best_params_))

In [None]:
# Predict   classes
#y_pred4 = grid4.predict(X1_test_s)

In [None]:
#print(metrics.classification_report(y1_test_s, y_pred4))
#print(metrics.confusion_matrix(y1_test_s, y_pred4))

### 2.2 Version 2

#### 2.2.1 Version 2 with random parameters (1)

In [None]:
# Two-step pipeline: a feature scaler feeding a support vector classifier.
# The step names "scaler" and "classifier" are the keys the parameter grid uses.
pipe5 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", SVC(random_state=0)),
    ]
)

In [None]:
# Search space, expressed as two independent sub-grids
param_grid5 = [
    # RBF-kernel SVC on scaled features, varying gamma and C
    {
        "scaler": [StandardScaler()],
        "classifier": [SVC(kernel="rbf")],
        "classifier__gamma": [1, 10],
        "classifier__C": [10, 100],
    },
    # Logistic regression with and without scaling, varying C
    {
        "scaler": [StandardScaler(), None],
        "classifier": [LogisticRegression()],
        "classifier__C": [10, 100],
    },
]

In [None]:
# Run the 5-fold cross-validated grid search on all available cores.
# The target is flattened to 1-D because scikit-learn expects y of shape
# (n_samples,) and warns when given a column vector
# (assumes y2_train_s holds exactly one target column — TODO confirm).
grid5 = GridSearchCV(pipe5, param_grid5, cv=5, n_jobs=-1)
grid5.fit(X2_train_s, np.ravel(y2_train_s))

In [None]:
# Baseline: share of class-0 labels in the Version 2 test set, to put the
# accuracies below into perspective
print("Observed probability of up/down?: {: .2f}"
      .format(np.count_nonzero(y2_test_s == 0) / len(y2_test_s)))
# Best cross-validated score, score on the held-out test set, and the winning settings
print('Best CV accuracy: {:.2f}'.format(grid5.best_score_))
print('Test score:       {:.2f}'.format(grid5.score(X2_test_s, y2_test_s)))
print('Best parameters: {}'.format(grid5.best_params_))

In [None]:
# Class predictions for the Version 2 test set from the fitted grid search
y_pred5 = grid5.predict(X=X2_test_s)

In [None]:
# Classification report followed by the confusion matrix for the test predictions
print(
    metrics.classification_report(y2_test_s, y_pred5),
    metrics.confusion_matrix(y2_test_s, y_pred5),
    sep="\n",
)

#### 2.2.2 Version 2 with other parameters (2)

In [None]:
# Two-step pipeline: a feature scaler feeding a support vector classifier.
# The step names "scaler" and "classifier" are the keys the parameter grid uses.
pipe6 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", SVC(random_state=0)),
    ]
)

In [None]:
# Search space for the Version 2 linear-kernel run, as two independent sub-grids.
# Bound to param_grid6 because that is the name the grid6 search reads.
param_grid6 = [
    # Linear-kernel SVC on scaled features. Only C is searched: gamma is a
    # coefficient of the rbf/poly/sigmoid kernels and has no effect on a linear
    # kernel, so varying it would only refit identical models.
    {
        "scaler": [StandardScaler()],
        "classifier": [SVC(kernel="linear")],
        "classifier__C": [10, 100],
    },
    # Logistic regression with and without scaling, varying C
    {
        "scaler": [StandardScaler(), None],
        "classifier": [LogisticRegression()],
        "classifier__C": [10, 100],
    },
]

In [None]:
# Run the 5-fold cross-validated grid search on all available cores.
# The target is flattened to 1-D because scikit-learn expects y of shape
# (n_samples,) and warns when given a column vector
# (assumes y2_train_s holds exactly one target column — TODO confirm).
grid6 = GridSearchCV(pipe6, param_grid6, cv=5, n_jobs=-1)
grid6.fit(X2_train_s, np.ravel(y2_train_s))

In [None]:
# Baseline: share of class-0 labels in the Version 2 test set, to put the
# accuracies below into perspective
print("Observed probability of up/down?: {: .2f}"
      .format(np.count_nonzero(y2_test_s == 0) / len(y2_test_s)))
# Best cross-validated score, score on the held-out test set, and the winning settings
print('Best CV accuracy: {:.2f}'.format(grid6.best_score_))
print('Test score:       {:.2f}'.format(grid6.score(X2_test_s, y2_test_s)))
print('Best parameters: {}'.format(grid6.best_params_))

In [None]:
# Class predictions for the Version 2 test set from the fitted grid search
y_pred6 = grid6.predict(X=X2_test_s)

In [None]:
# Classification report followed by the confusion matrix for the test predictions
print(
    metrics.classification_report(y2_test_s, y_pred6),
    metrics.confusion_matrix(y2_test_s, y_pred6),
    sep="\n",
)

#### 2.2.3 Version 2 with other parameters (3)

In [None]:
# Two-step pipeline: a feature scaler feeding a support vector classifier.
# The step names "scaler" and "classifier" are the keys the parameter grid uses.
pipe7 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", SVC(random_state=0)),
    ]
)

In [None]:
# Search space, expressed as two independent sub-grids
param_grid7 = [
    # Polynomial-kernel SVC on scaled features, varying gamma and C
    {
        "scaler": [StandardScaler()],
        "classifier": [SVC(kernel="poly")],
        "classifier__gamma": [1, 10],
        "classifier__C": [10, 100],
    },
    # Logistic regression with and without scaling, varying C
    {
        "scaler": [StandardScaler(), None],
        "classifier": [LogisticRegression()],
        "classifier__C": [10, 100],
    },
]

In [None]:
# Run the 5-fold cross-validated grid search on all available cores.
# The target is flattened to 1-D because scikit-learn expects y of shape
# (n_samples,) and warns when given a column vector
# (assumes y2_train_s holds exactly one target column — TODO confirm).
grid7 = GridSearchCV(pipe7, param_grid7, cv=5, n_jobs=-1)
grid7.fit(X2_train_s, np.ravel(y2_train_s))

In [None]:
# Baseline: share of class-0 labels in the Version 2 test set, to put the
# accuracies below into perspective
print("Observed probability of up/down?: {: .2f}"
      .format(np.count_nonzero(y2_test_s == 0) / len(y2_test_s)))
# Best cross-validated score, score on the held-out test set, and the winning settings
print('Best CV accuracy: {:.2f}'.format(grid7.best_score_))
print('Test score:       {:.2f}'.format(grid7.score(X2_test_s, y2_test_s)))
print('Best parameters: {}'.format(grid7.best_params_))

In [None]:
# Class predictions for the Version 2 test set from the fitted grid search
y_pred7 = grid7.predict(X=X2_test_s)

In [None]:
# Classification report followed by the confusion matrix for the test predictions
print(
    metrics.classification_report(y2_test_s, y_pred7),
    metrics.confusion_matrix(y2_test_s, y_pred7),
    sep="\n",
)

#### Best SVM with balanced data

In [None]:
# TODO: find a way to select the best SVM automatically (e.g. by comparing best_score_ across grid5, grid6 and grid7) and reuse it for further "research"

## 3. Evaluation

In [None]:
# TODO: choose the best parameters; investigate whether there is a more efficient (faster) way to find them than the exhaustive grid search