In [16]:
import tensorflow as tf
import pandas as pd
from sklearn import model_selection

In [127]:
cols = ["nlikes", "nreplies", "nretweets", "close", "date"]
data = pd.read_csv('data/needed/cleaned/Tweets_and_Stock.csv', usecols=cols)
data = data.dropna(subset=["nlikes", "close"])
print(data.shape)

(10622, 5)


## Model rounded to next 100

In [84]:
#target use close price rounded to next 100
data["close"] = data["close"].apply(lambda x: int(round(x/100)*100))
data["close"] = data["close"].factorize()[0]
data.describe()

Unnamed: 0,nlikes,nreplies,nretweets,close
count,10622.0,10622.0,10622.0,10622.0
mean,24492.01,1093.61881,2467.577198,8.955846
std,71014.87,4041.571601,10172.791564,3.588659
min,2.0,0.0,0.0,0.0
25%,1129.25,57.0,59.0,6.0
50%,3747.5,171.0,219.0,11.0
75%,14036.75,599.0,989.75,11.0
max,1596525.0,120523.0,362906.0,12.0


In [85]:
train, test = model_selection.train_test_split(data, test_size= 0.2)
print(train.shape)
print(test.shape)

train_in = train[["nlikes", "nreplies", "nretweets"]]
train_target = train["close"]

test_in = test[["nlikes", "nreplies", "nretweets"]]
test_target = test["close"]

(8497, 4)
(2125, 4)


In [98]:
normalizer = tf.keras.layers.Normalization(axis=-1) #to make values between 0 and 1
normalizer.adapt(train_in)

model = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(max(test_target)+1)
  ])

model.compile(optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])

In [99]:
model.fit(train_in, train_target, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a34126b490>

In [100]:
model.evaluate(test_in, test_target)



[1.7145901918411255, 0.39482352137565613]

## Model rounded to next 1000 --> Binary

In [74]:
data["close"] = data["close"].apply(lambda x: int(round(x/1000)*1000))
data["close"] = data["close"].factorize()[0]

train, test = model_selection.train_test_split(data, test_size= 0.2)

train_in = train[["nlikes", "nreplies", "nretweets"]]
train_target = train["close"]

test_in = test[["nlikes", "nreplies", "nretweets"]]
test_target = test["close"]

In [80]:
normalizer = tf.keras.layers.Normalization(axis=-1) #to make values between 0 and 1
normalizer.adapt(train_in)

model_bin = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(1)
  ])

model_bin.compile(optimizer='adam',
  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
  metrics=['accuracy'])

In [81]:
model_bin.fit(train_in, train_target, epochs=10, batch_size=50)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a3407a4ca0>

In [82]:
model_bin.evaluate(test_in, test_target)



[0.3828790485858917, 0.827294111251831]

## Binary up or down per day

In [128]:
data = data.groupby(["date"])

data1 = data.sum() # for tweet statistics
data2 = data.mean() # for stock price
data1["close"] = data2["close"]
data = data1

# function to assign 1 if price went up, 0 otherwise
def direction (x):
    if x > 0:
        return 1
    else:
        return 0

# assign 1 or 0 to every date
data["close_diff"] = data["close"].diff()
data["direction"] = data["close_diff"].apply(direction)
print(data)

train, test = model_selection.train_test_split(data, test_size= 0.2)

train_in = train[["nlikes", "nreplies", "nretweets"]]
train_target = train["direction"]

test_in = test[["nlikes", "nreplies", "nretweets"]]
test_target = test["direction"]

              nlikes  nreplies  nretweets    close  close_diff  direction
date                                                                     
2011-12-01     267.0      63.0       24.0    6.520         NaN          0
2011-12-21    1330.0      87.0      597.0    5.514      -1.006          0
2011-12-22    1349.0     132.0      206.0    5.554       0.040          1
2011-12-27      39.0      13.0       34.0    5.714       0.160          1
2011-12-28      12.0       7.0        1.0    5.702      -0.012          0
...              ...       ...        ...      ...         ...        ...
2022-02-25  118902.0    9995.0     6482.0  809.870      45.830          1
2022-03-01  259382.0    7631.0    14625.0  864.370      54.500          1
2022-03-02  496142.0   19800.0    51087.0  879.890      15.520          1
2022-03-03  688520.0   21765.0    73639.0  839.290     -40.600          0
2022-03-04   95202.0    4688.0     4779.0  838.290      -1.000          0

[1636 rows x 6 columns]


In [129]:
normalizer = tf.keras.layers.Normalization(axis=-1) #to make values between 0 and 1
normalizer.adapt(train_in)

model_bin = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Dense(1)
  ])

model_bin.compile(optimizer='adam',
  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
  metrics=['accuracy'])

In [130]:
model_bin.fit(train_in, train_target, epochs=10, batch_size=5)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a348d7ba00>

In [131]:
model_bin.evaluate(test_in, test_target)



[0.694447934627533, 0.4878048896789551]