In [1]:
import numpy as np
import pandas as pd

In [2]:
sentiment = pd.read_csv("data/tweets_sentiment.csv")
sentiment.drop(sentiment.columns[[0]], axis=1, inplace=True)
sentiment.rename(columns = {'created_at':'Date'}, inplace = True)
spy = pd.read_csv("data/SPY.csv")

In [3]:
df = pd.merge(sentiment, spy, on ='Date', how ="inner")
df.head()

Unnamed: 0,Date,pos_score,neu_score,neg_score,compound,Open,High,Low,Close,Adj Close,Volume
0,2020-04-09,0.073174,0.887866,0.03896,0.114438,277.579987,281.200012,275.470001,278.200012,271.092224,189999200
1,2020-04-13,0.07601,0.887225,0.036771,0.122273,277.140015,277.51001,271.410004,275.660004,268.617126,114839100
2,2020-04-14,0.083978,0.87145,0.044567,0.136125,280.980011,284.899994,275.51001,283.790009,276.539368,134143400
3,2020-04-15,0.083737,0.875221,0.041041,0.152802,277.570007,283.940002,275.459991,277.76001,270.663422,121775000
4,2020-04-16,0.081858,0.878151,0.039988,0.127939,279.149994,280.029999,275.76001,279.100006,271.969208,131798300


In [4]:
opn = {'Date': [df['Date'][i] for i in range(3,len(df))], \
       'Movement': [1 if (df['Volume'][i] - df['Volume'][i-1]) / (df['Volume'][i-1]) > 0.01 else 0 for i in range(3,len(df))], \
       'Vol1': [df['Volume'][i-1] for i in range(3,len(df))], \
       'Vol2': [df['Volume'][i-2] for i in range(3,len(df))], \
       'Vol3': [df['Volume'][i-3] for i in range(3,len(df))]}

sen = {'Date': [df['Date'][i] for i in range(3,len(df))], \
       'Sentiment1': [df['compound'][i-1] for i in range(3,len(df))], \
       'Sentiment2': [df['compound'][i-2] for i in range(3,len(df))], \
       'Sentiment3': [df['compound'][i-3] for i in range(3,len(df))]}

In [5]:
o = pd.DataFrame(opn)
s = pd.DataFrame(sen)
data = pd.merge(o, s, on='Date', how='inner')

In [6]:
data.head()

Unnamed: 0,Date,Movement,Vol1,Vol2,Vol3,Sentiment1,Sentiment2,Sentiment3
0,2020-04-15,0,134143400,114839100,189999200,0.136125,0.122273,0.114438
1,2020-04-16,1,121775000,134143400,114839100,0.152802,0.136125,0.122273
2,2020-04-17,1,131798300,121775000,134143400,0.127939,0.152802,0.136125
3,2020-04-20,0,146684800,131798300,121775000,0.130096,0.127939,0.152802
4,2020-04-21,1,100109300,146684800,131798300,0.109606,0.130096,0.127939


## Training Model

In [7]:
from sklearn.model_selection import train_test_split
X = data[data.columns[2:]]
y = data[data.columns[1]]
X_train, X_test, y_train, y_test =train_test_split(X, y, test_size=0.3)

In [8]:
positives = sum([1 if y[i] == 1 else 0 for i in range(len(y))])
negatives = len(y) - positives
print("Positive: ", positives)
print("Negative: ", negatives)

Positive:  24
Negative:  26


In [9]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() 
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

### Linear Kernel

In [10]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear',verbose=True)
classifier.fit(X_train, y_train)

[LibSVM]..*
optimization finished, #iter = 97
obj = -16.860521, rho = -0.056381
nSV = 22, nBSV = 14
Total nSV = 22


SVC(kernel='linear', verbose=True)

In [11]:
y_pred = classifier.predict(X_test)

In [12]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred));

              precision    recall  f1-score   support

           0       0.80      0.40      0.53        10
           1       0.40      0.80      0.53         5

    accuracy                           0.53        15
   macro avg       0.60      0.60      0.53        15
weighted avg       0.67      0.53      0.53        15



### RBF Kernel

In [13]:
classifier = SVC(kernel='rbf',verbose=True)
classifier.fit(X_train, y_train)

[LibSVM]*.*
optimization finished, #iter = 40


SVC(verbose=True)

obj = -21.393209, rho = 0.118034
nSV = 34, nBSV = 20
Total nSV = 34


In [14]:
y_pred = classifier.predict(X_test)

In [15]:
print(classification_report(y_test, y_pred));

              precision    recall  f1-score   support

           0       0.83      0.50      0.62        10
           1       0.44      0.80      0.57         5

    accuracy                           0.60        15
   macro avg       0.64      0.65      0.60        15
weighted avg       0.70      0.60      0.61        15



### Polynomial Kernel

In [16]:
classifier = SVC(kernel='poly',verbose=True)
classifier.fit(X_train, y_train)

[LibSVM].*
optimization finished, #iter = 46


SVC(kernel='poly', verbose=True)

obj = -19.510724, rho = 0.343013
nSV = 30, nBSV = 21
Total nSV = 30


In [17]:
y_pred = classifier.predict(X_test)

In [18]:
print(classification_report(y_test, y_pred));

              precision    recall  f1-score   support

           0       1.00      0.20      0.33        10
           1       0.38      1.00      0.56         5

    accuracy                           0.47        15
   macro avg       0.69      0.60      0.44        15
weighted avg       0.79      0.47      0.41        15



### Sigmoid Kernel

In [19]:
classifier = SVC(kernel='sigmoid',verbose=True)
classifier.fit(X_train, y_train)

[LibSVM]

SVC(kernel='sigmoid', verbose=True)

*
optimization finished, #iter = 25
obj = -26.263777, rho = 0.088477
nSV = 30, nBSV = 26
Total nSV = 30


In [20]:
y_pred = classifier.predict(X_test)

In [21]:
print(classification_report(y_test, y_pred));

              precision    recall  f1-score   support

           0       1.00      0.40      0.57        10
           1       0.45      1.00      0.62         5

    accuracy                           0.60        15
   macro avg       0.73      0.70      0.60        15
weighted avg       0.82      0.60      0.59        15

