In [46]:
import numpy as np
import pandas as pd

In [47]:
sentiment = pd.read_csv("data/tweets_sentiment.csv")
sentiment.drop(sentiment.columns[[0]], axis=1, inplace=True)
sentiment.rename(columns = {'created_at':'Date'}, inplace = True)
spy = pd.read_csv("data/SPY.csv")

In [48]:
df = pd.merge(sentiment, spy, on ='Date', how ="inner")
df.head()

Unnamed: 0,Date,pos_score,neu_score,neg_score,compound,Open,High,Low,Close,Adj Close,Volume
0,2020-04-09,0.073174,0.887866,0.03896,0.114438,277.579987,281.200012,275.470001,278.200012,271.092224,189999200
1,2020-04-13,0.07601,0.887225,0.036771,0.122273,277.140015,277.51001,271.410004,275.660004,268.617126,114839100
2,2020-04-14,0.083978,0.87145,0.044567,0.136125,280.980011,284.899994,275.51001,283.790009,276.539368,134143400
3,2020-04-15,0.083737,0.875221,0.041041,0.152802,277.570007,283.940002,275.459991,277.76001,270.663422,121775000
4,2020-04-16,0.081858,0.878151,0.039988,0.127939,279.149994,280.029999,275.76001,279.100006,271.969208,131798300


In [49]:
finance = {'Date': [df['Date'][i] for i in range(4,len(df))], \
       'Movement': [1 if df['Open'][i] - df['Open'][i-1] > 0 else 0 for i in range(4,len(df))], \
       'Open1': [df['Open'][i-1] for i in range(4,len(df))], \
       'Open2': [df['Open'][i-2] for i in range(4,len(df))], \
       'Open3': [df['Open'][i-3] for i in range(4,len(df))], \
       'Vol1': [(df['Open'][i-1] - df['Open'][i-2]) / (df['Open'][i-2])*100 for i in range(4,len(df))], \
       'Vol2': [(df['Open'][i-2] - df['Open'][i-3]) / (df['Open'][i-3])*100 for i in range(4,len(df))], \
       'Vol3': [(df['Open'][i-3] - df['Open'][i-4]) / (df['Open'][i-4])*100 for i in range(4,len(df))]}

sen = {'Date': [df['Date'][i] for i in range(4,len(df))], \
       'Sentiment1': [df['compound'][i-1]*100 for i in range(4,len(df))], \
       'Sentiment2': [df['compound'][i-2]*100 for i in range(4,len(df))], \
       'Sentiment3': [df['compound'][i-3]*100 for i in range(4,len(df))]}

In [50]:
f = pd.DataFrame(finance)
s = pd.DataFrame(sen)
data = pd.merge(f, s, on='Date', how='inner')

In [51]:
data.head()

Unnamed: 0,Date,Movement,Open1,Open2,Open3,Vol1,Vol2,Vol3,Sentiment1,Sentiment2,Sentiment3
0,2020-04-16,1,277.570007,280.980011,277.140015,-1.213611,1.38558,-0.158503,15.280185,13.612527,12.227275
1,2020-04-17,1,279.149994,277.570007,280.980011,0.569221,-1.213611,1.38558,12.793906,15.280185,13.612527
2,2020-04-20,0,285.380005,279.149994,277.570007,2.231779,0.569221,-1.213611,13.009616,12.793906,15.280185
3,2020-04-21,0,282.609985,285.380005,279.149994,-0.970643,2.231779,0.569221,10.960596,13.009616,12.793906
4,2020-04-22,1,276.730011,282.609985,285.380005,-2.080597,-0.970643,2.231779,9.882616,10.960596,13.009616


## Training Model

In [52]:
from sklearn.model_selection import train_test_split
X = data[data.columns[2:]]
y = data[data.columns[1]]
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.3)

In [53]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() 
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

## Testing different kernels

In [54]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
kernels = ['linear', 'rbf', 'poly', 'sigmoid']
acc = {}
pred = {}

In [55]:
for k in kernels:
    classifier = SVC(kernel=k)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    acc[k] = accuracy_score(y_test, y_pred)
    pred[k] = y_pred

In [56]:
from sklearn.metrics import classification_report
best_kernels = [k for k,v in acc.items() if v == max(acc.values())]
print(best_kernels)
print("Classification report of", best_kernels[0], "kernel")
print(classification_report(y_test, pred[best_kernels[0]]));

['sigmoid']
Classification report of sigmoid kernel
              precision    recall  f1-score   support

           0       0.75      0.50      0.60         6
           1       0.73      0.89      0.80         9

    accuracy                           0.73        15
   macro avg       0.74      0.69      0.70        15
weighted avg       0.74      0.73      0.72        15



## Comparing Best Model with SVM Trained Only on Price

In [57]:
X_train = [p[:len(finance)-2] for p in X_train]
X_test = [p[:len(finance)-2] for p in X_test]

In [58]:
classifier = SVC(kernel=best_kernels[0])
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [59]:
from sklearn.metrics import classification_report
print("Classification report of", best_kernels[0], "kernel")
print(classification_report(y_test,y_pred));

Classification report of sigmoid kernel
              precision    recall  f1-score   support

           0       0.50      0.50      0.50         6
           1       0.67      0.67      0.67         9

    accuracy                           0.60        15
   macro avg       0.58      0.58      0.58        15
weighted avg       0.60      0.60      0.60        15

