In [7]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

%matplotlib inline

In [8]:
# DATA PREPROCESSING

train_data_file_name = 'train_data.csv'
test_data_file_name = 'test_data.csv'

train_data = pd.read_csv(train_data_file_name)
test_data = pd.read_csv(test_data_file_name)

train_data = train_data[0:150]
test_data = test_data[0:150]

train_data.drop(['Date', 'Volume', 'High Price', 'Low Price'], axis=1, inplace=True)
train_data['CPI'] = train_data['Close Price'] - train_data['Close Price'].shift(1)
train_data['CPI'].iloc[0] = 0
train_data['CPI'] = pd.Series(map(lambda x: 1 if x > 0 else 0, train_data['CPI']))



test_data.drop(['Date', 'Volume', 'High Price', 'Low Price'], axis=1, inplace=True)
test_data['CPI'] = test_data['Close Price'] - test_data['Close Price'].shift(1)
test_data['CPI'].iloc[0] = 0
test_data['CPI'] = pd.Series(map(lambda x: 1 if x > 0 else 0, test_data['CPI']))

In [9]:
def split(train_d, test_d):
    x_train = train_d.loc[:, train_d.columns != 'CPI']
    y_train = train_d['CPI']
    x_test = test_d.loc[:, test_d.columns != 'CPI']
    y_test = test_d['CPI']
    return x_train, y_train, x_test, y_test

In [10]:
x_train, y_train, x_test, y_test = split(train_data, test_data)

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train)

x_train_std = scaler.transform(x_train)
x_test_std = scaler.transform(x_test)

In [12]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# penalty of 100 seemed to result in the most accurate model
penalty = 100

svm = SVC(C=penalty, kernel="linear", probability=True)
svm.fit(x_train, y_train.values)

print("The labels by model prediction : %s" % svm.predict(x_test))
print("Real labels in the testing set : %s" % y_test.values)
acc_train = accuracy_score(y_train.values, svm.predict(x_train)) * 100
acc_test = accuracy_score(y_test.values, svm.predict(x_test)) * 100

print("Penalty = %.2f, Train accuracy = %.2f %%, Test accuracy = %.2f %%" % (penalty, acc_train, acc_test))
    


The labels by model prediction : [1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 0 0 0 1 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1
 1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1
 0 0 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1
 1 0 1 1 1 1 1 0 1 0 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1
 1 1]
Real labels in the testing set : [0 1 1 1 1 1 0 1 1 0 1 0 1 1 1 0 1 1 0 0 1 0 0 0 1 0 0 1 1 1 1 1 1 0 0 1 1
 1 0 0 0 1 1 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1 1 1 0 1 1 0 1 0 1 1 1
 0 0 1 0 1 1 1 0 1 0 0 1 1 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 1 0 1 1 1 1 0 1 1
 1 0 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 0 0 1 0 1
 1 1]
Penalty = 100.00, Train accuracy = 96.67 %, Test accuracy = 81.33 %
