In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
plt.style.use('seaborn-whitegrid')
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

  from pandas.core import datetools


In [2]:
# Load the cleaned data set and index every row by (PERMNO, DATE)
data_bereinigt = (
    pd.read_csv("../Data/data_bereinigt.csv")
      .set_index(["PERMNO", "DATE"])
)
data_bereinigt.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ewretx,vwretx,RETX,ewretd,sprtrn,VOL,vwretd,RET,SHRENDDT,ALTPRC,ALTPRCDT,PEG_trailing,SHROUT,staff_sale,ASKHI,PREDICTION
PERMNO,DATE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10107,2006-02-01,0.003476,-0.003552,-0.045471,0.004836,0.000453,11088149.0,-0.001637,-0.042273,20060330.0,26.87,20060228.0,10.28,10333369.0,0.0,28.04,1.0
10107,2006-03-01,0.035232,0.017585,0.012653,0.036978,0.011065,14514337.0,0.019053,0.012653,20060423.0,27.21,20060331.0,10.41,10225000.0,0.0,27.89,0.0
10107,2006-04-01,0.008497,0.011494,-0.112459,0.009791,0.012187,14689919.0,0.012965,-0.112459,20060629.0,24.15,20060428.0,9.239,10201203.0,0.0,27.74,0.0
10107,2006-05-01,-0.045982,-0.033025,-0.062112,-0.044331,-0.030917,23651189.0,-0.031045,-0.058385,20060629.0,22.65,20060531.0,0.709,10201203.0,0.0,24.29,1.0
10107,2006-06-01,-0.010474,-0.001881,0.028698,-0.008479,8.7e-05,19980809.0,-0.000386,0.028698,20060817.0,23.3,20060630.0,0.73,10062000.0,0.0,23.4702,1.0


In [3]:
# Two-step pipeline: a standard scaler followed by an SVC estimator.
# The step names 'scaler' and 'classifier' are the handles the parameter
# grid uses to address (and replace) each step.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', SVC(random_state=0)),
])

In [4]:
# Parameter grid: one dictionary per candidate model family.
#   1) RBF-kernel SVC, always scaled, searched over gamma and C
#   2) Logistic regression, with and without scaling, searched over C
param_grid = [
    dict(scaler=[StandardScaler()],
         classifier=[SVC(kernel='rbf')],
         classifier__gamma=[1, 10],
         classifier__C=[10, 100]),
    dict(scaler=[StandardScaler(), None],
         classifier=[LogisticRegression()],
         classifier__C=[10, 100]),
]

In [5]:
# Split the frame into response y (the PREDICTION column) and
# feature matrix X (every remaining column) (p.117)
y = data_bereinigt['PREDICTION']
X = data_bereinigt.drop("PREDICTION", axis=1)

In [6]:
# Train/Test split (p. 104ff): hold out 30% of the observations as test set.
# The split is stratified on y and seeded (random_state=0) so it is
# reproducible. train_test_split is already imported in the first cell, so
# it is not imported again here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)

In [None]:
# Standardize the variables: learn the scaling parameters from the training
# set only, then apply that same transformation to both sets
sc = StandardScaler().fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
# Run grid search
grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)
grid.fit(X_train_std, y_train)

In [None]:
# Print results
print('Best CV accuracy: {:.2f}'.format(grid.best_score_))
print('Test score:       {:.2f}'.format(grid.score(X_test_std, y_test)))
print('Best parameters: {}'.format(grid.best_params_))

In [None]:
# SVM with a linear kernel (operates on the scaled data)
#   Instantiate the estimator
svm_linear = SVC(C=1.0, kernel="linear")

# Train the linear SVM on the standardized training set
svm_linear.fit(X_train_std, y_train)

In [None]:
# Report the share of observations in y that equal 0, followed by the
# linear SVM's score on the training set and on the test set
print("Observed probability of up/down?: {: .2f}".format(np.mean(y == 0)))
print("Train score: {: .2f}".format(svm_linear.score(X_train_std, y_train)))
print("Test score: {: .2f}".format(svm_linear.score(X_test_std, y_test)))

In [None]:
# Predict class labels for the standardized test set with the fitted linear SVM
y_pred = svm_linear.predict(X_test_std)

In [None]:
# Confusion matrix of the true test labels against the SVM predictions
print(
    "Confusion matrix: \n",
    metrics.confusion_matrix(y_true=y_test, y_pred=y_pred),
)