In [239]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

%matplotlib inline

In [240]:
# DATA PREPROCESSING

train_data_file_name = 'train_data.csv'
test_data_file_name = 'test_data.csv'

# Only the first N_ROWS rows of each file are used.
N_ROWS = 150
# Columns discarded before modelling.
UNUSED_COLUMNS = ['Date', 'Volume', 'High Price', 'Low Price']


def prepare_price_data(raw_data, n_rows=N_ROWS):
    """Return a trimmed copy of ``raw_data`` with a binary ``CPI`` label column.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame as read from the CSV; must contain 'Close Price' and every
        column listed in UNUSED_COLUMNS (a missing column raises KeyError).
    n_rows : int
        Number of leading rows to keep.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) without UNUSED_COLUMNS and with
        an int64 'CPI' column: 1 where 'Close Price' is strictly greater than
        the previous row's 'Close Price', else 0.
    """
    prices = raw_data.iloc[:n_rows].drop(columns=UNUSED_COLUMNS)
    # diff() is NaN on the first row; NaN > 0 is False, so the first label is
    # 0 without needing a chained `.iloc[0] = 0` assignment.
    price_went_up = prices['Close Price'].diff().gt(0)
    return prices.assign(CPI=price_went_up.astype('int64'))


train_data = prepare_price_data(pd.read_csv(train_data_file_name))
test_data = prepare_price_data(pd.read_csv(test_data_file_name))

In [241]:
def split(train_d, test_d):
    """Separate features from the 'CPI' label for the train and test frames.

    Returns the tuple ``(x_train, y_train, x_test, y_test)`` where each ``x``
    holds every column except 'CPI' and each ``y`` is the 'CPI' column.
    """
    def _features_and_label(frame):
        # Label first so a missing 'CPI' column fails on the lookup itself.
        label = frame['CPI']
        features = frame.drop(columns='CPI')
        return features, label

    train_features, train_label = _features_and_label(train_d)
    test_features, test_label = _features_and_label(test_d)
    return train_features, train_label, test_features, test_label

In [242]:
# Feature frames and 'CPI' label series for the training and test sets.
x_train, y_train, x_test, y_test = split(train_d=train_data, test_d=test_data)

In [243]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to the test features so both splits share one scaling.
scaler = StandardScaler()
x_train_std = scaler.fit_transform(x_train)
x_test_std = scaler.transform(x_test)

In [265]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Regularisation strength passed to SVC as C.
penalty = 100

# The model is trained exactly once. The previous `for i in range(0, 100, 1)`
# wrapper never used `i` and never changed `penalty`, so it refitted the same
# model 100 times (note: the loop variable `i` is therefore no longer defined).
#
# random_state fixes the shuffling used for the probability=True calibration
# so predict_proba output is reproducible between runs.
#
# NOTE(review): the model is fitted on the raw features although standardised
# copies (x_train_std / x_test_std) are computed in an earlier cell and never
# used. Switching to them would also require the later predict / predict_proba
# cells to use x_test_std - confirm which was intended.
svm = SVC(C=penalty, kernel="linear", probability=True, random_state=0)
svm.fit(x_train, y_train.values)

# Accuracies are reported as percentages.
acc_train = accuracy_score(y_train.values, svm.predict(x_train)) * 100
acc_test = accuracy_score(y_test.values, svm.predict(x_test)) * 100
# Was `%print(...)`, which IPython treats as an unknown line magic and fails on.
print("Penalty = %.2f, Train accuracy = %.2f %%, Test accuracy = %.2f %%" % (penalty, acc_train, acc_test))

Penalty = 100.00, Train accuracy = 96.67 %, Test accuracy = 81.33 %


In [248]:
# Show the model's predicted test labels next to the true test labels.
predicted_labels = svm.predict(x_test)
actual_labels = y_test.values
print("The labels by model prediction : %s" % predicted_labels)
print("Real labels in the testing set : %s" % actual_labels)

The labels by model prediction : [0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 1 1 1 0 0 0 0 1
 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1
 0 0]
Real labels in the testing set : [0 1 1 1 1 1 0 1 1 0 1 0 1 1 1 0 1 1 0 0 1 0 0 0 1 0 0 1 1 1 1 1 1 0 0 1 1
 1 0 0 0 1 1 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1 1 1 0 1 1 0 1 0 1 1 1
 0 0 1 0 1 1 1 0 1 0 0 1 1 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 1 0 1 1 1 1 0 1 1
 1 0 1 0 0 0 1 0 1 0 1 0 1 1 1 0 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 0 0 1 0 1
 1 1]


In [249]:
# Print the per-class probabilities of every test row, rounded to two decimals.
test_probabilities = svm.predict_proba(x_test)
for prob in test_probabilities:
    formatted = ["%.2f" % value for value in prob]
    print(formatted)

['0.75', '0.25']
['0.54', '0.46']
['0.97', '0.03']
['0.78', '0.22']
['0.97', '0.03']
['0.99', '0.01']
['0.98', '0.02']
['0.62', '0.38']
['0.52', '0.48']
['1.00', '0.00']
['0.42', '0.58']
['1.00', '0.00']
['0.94', '0.06']
['0.10', '0.90']
['0.98', '0.02']
['1.00', '0.00']
['1.00', '0.00']
['0.07', '0.93']
['1.00', '0.00']
['1.00', '0.00']
['1.00', '0.00']
['0.97', '0.03']
['1.00', '0.00']
['1.00', '0.00']
['0.00', '1.00']
['1.00', '0.00']
['1.00', '0.00']
['0.30', '0.70']
['0.23', '0.77']
['0.40', '0.60']
['0.00', '1.00']
['0.36', '0.64']
['0.96', '0.04']
['1.00', '0.00']
['1.00', '0.00']
['1.00', '0.00']
['0.01', '0.99']
['0.14', '0.86']
['1.00', '0.00']
['1.00', '0.00']
['1.00', '0.00']
['0.01', '0.99']
['0.00', '1.00']
['1.00', '0.00']
['0.44', '0.56']
['0.95', '0.05']
['0.00', '1.00']
['1.00', '0.00']
['1.00', '0.00']
['1.00', '0.00']
['1.00', '0.00']
['0.99', '0.01']
['1.00', '0.00']
['0.99', '0.01']
['1.00', '0.00']
['1.00', '0.00']
['1.00', '0.00']
['0.00', '1.00']
['1.00', '0.00

In [None]:
# My computer cannot plot more than 10 points, runs out of memory (MacBook Air :( )