In [None]:
# import
!pip install torch
!pip install transformers
!pip install tensorflow
from datetime import datetime
import requests
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import csv
import tensorflow as tf
import random
import matplotlib.pyplot as plt

In [None]:
# Download news from the CNBC web site.

# Keyword that every returned article has to match.
querry_word = "apple"

# Query-string parameters sent to the search endpoint. The "endindex" value
# below is only a starting placeholder; main() sets it for every page it requests.
params = dict(
    queryly_key="31a35d40a9a64ab3",
    query=querry_word,
    endindex="0",
    batchsize="100",
    callback="",
    showfaceted="true",
    timezoneoffset="-120",
    facetedfields="formats",
    facetedkey="formats|",
    facetedvalue="!Press Release|",
    needtoptickers="1",
    additionalindexes="4cd6f71fbf22424d,937d600b0d0d4e23,3bfbe40caee7443e,626fdfcd96444f28",
)

# Fields copied from every search result. The project itself only uses the
# title and the publication date; the description is kept as well so that a
# model could also be trained on the article descriptions.
goal = [
    "cn:title",
    "_pubDate",
    "description",
]


def main(url, start=5000, stop=20000, step=100, out_path="datasets/data.csv", timeout=30):
    """Download search results page by page and store them as a CSV file.

    For every offset in ``range(start, stop, step)`` one request is sent to
    ``url`` with the module-level ``params`` plus that offset as "endindex".
    From each entry of the response's "results" list the fields named in the
    module-level ``goal`` are collected; all rows are written to ``out_path``
    with the columns "Title", "date" and "Description".

    Args:
        url: Search endpoint to query.
        start, stop, step: Offsets to request. Change the range to get more or
            fewer articles; per the original note, newer articles sit at the
            beginning of the index and older ones at the end.
        out_path: Destination CSV file; its directory is created if missing.
        timeout: Seconds to wait for each HTTP response before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a response has no "results" entry or a result lacks one
            of the fields listed in ``goal``.
    """
    import os  # local import: only needed to create the output directory

    allin = []
    with requests.Session() as req:
        for page, item in enumerate(range(start, stop, step)):
            print(f"Extracting Page# {page +1}")
            # Build the per-page parameters without touching the shared
            # module-level dict, so re-running the cell starts from a clean state.
            page_params = {**params, "endindex": item}
            response = req.get(url, params=page_params, timeout=timeout)
            # Fail loudly on HTTP errors instead of trying to parse an error page.
            response.raise_for_status()
            for loop in response.json()['results']:
                allin.append([loop[x] for x in goal])

    new = pd.DataFrame(
        allin, columns=["Title", "date", "Description"])

    # to_csv does not create missing directories on its own.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    new.to_csv(out_path, index=False)


# Run the download against the CNBC search endpoint served from api.queryly.com.
cnbc_search_url = "https://api.queryly.com/cnbc/json.aspx"
main(cnbc_search_url)

In [None]:
# Combine the news article data with the AAPL stock data.

def _to_iso_day(column):
    """Parse a date column (day first), drop any timezone and format it as YYYY-MM-DD strings."""
    parsed = pd.to_datetime(column, dayfirst=True, utc = False)
    return parsed.dt.tz_localize(None).dt.strftime("%Y-%m-%d")


# AAPL.csv comes from a Kaggle dataset with Apple's stock prices,
# data.csv is the file written by the CNBC download step.
a = pd.read_csv("datasets/AAPL.csv")
b = pd.read_csv("datasets/data.csv")

# The two sources use different "date" formats (day-month-year:00:00 and
# day/month/year), so both columns are converted to one standard format.
a['date'] = _to_iso_day(a['date'])
b['date'] = _to_iso_day(b['date'])

# Save the normalised tables as CSV files.
a.to_csv("datasets/a.csv")
b.to_csv("datasets/b.csv")

In [None]:
# Merge the two tables on "date". pd.merge keeps only the entries whose "date"
# value appears in both, so every remaining row carries the stock price of a
# day together with a news article about the company published that day.
merged_df = pd.merge(a, b, on='date')

# DataFrame.info() prints its summary itself and returns None; wrapping the
# calls in print() only appended a meaningless "None None" line.
a.info()
b.info()

# Save the merged dataset as "output.csv".
merged_df.to_csv("datasets/output.csv")

In [None]:
# Download the pretrained FinBERT checkpoint: its tokenizer and its
# sequence classification model.
finbert_checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)

In [None]:
def sentiment_score(inp):
  """Return a sentiment value in [-1, 1] for the text ``inp``.

  The text is tokenized with the module-level ``tokenizer`` and classified by
  the module-level ``model``; the logits are turned into probabilities with a
  softmax. The result is P(class 0) - P(class 1), i.e. the probabilities
  weighted with 1, -1 and 0: values near 1 mean most likely positive, near -1
  most likely negative, near 0 neutral.

  NOTE(review): this assumes the checkpoint orders its classes as
  0 = positive, 1 = negative, 2 = neutral -- confirm against
  ``model.config.id2label``.
  NOTE: the name ``model`` is rebound to the Keras predictor further down in
  this notebook, so this function has to run before that cell.
  """
  # Tokenize the article title; cap the length so an over-long text cannot
  # exceed the 512-token input limit of BERT-style models.
  tokens = tokenizer.encode(inp, return_tensors='pt', truncation=True, max_length=512)

  # Pure inference: skip autograd bookkeeping for the forward pass.
  with torch.no_grad():
    output = model(tokens)

  # Pass the model output logits through a softmax layer and convert once.
  sentim_scores = torch.nn.functional.softmax(output.logits, dim=-1)
  probs = sentim_scores.detach().numpy()[0]

  # 1 * positive + (-1) * negative + 0 * neutral
  return probs[0] - probs[1]

# Add a 'sentiment' column to the merged table: each value is the output of
# sentiment_score for the first 150 characters of that row's 'Title'.
def _headline_sentiment(title):
  """Return sentiment_score of the first 150 characters of ``title``."""
  return sentiment_score(title[:150])

merged_df['sentiment'] = merged_df['Title'].apply(_headline_sentiment)
merged_df.to_csv("datasets/output_sentiment.csv")

In [None]:
# Build and compile the price prediction network.
def create_model(in_shape,output_shape):
    """Create, compile and summarise the recurrent Keras model.

    The network is a Sequential stack of: an LSTM (50 units, returns the full
    sequence, L1 kernel regularisation 0.01), a Dropout of 0.3, a GRU
    (50 units, returns only its last output, L2 kernel regularisation 0.01)
    and a Dense layer with ``output_shape`` units. It is compiled with the
    Adam optimizer and mean squared error loss, and its summary is printed.

    Args:
        in_shape: Two-element sequence used as the LSTM ``input_shape``.
        output_shape: Number of units of the final Dense layer.

    Returns:
        The compiled model.
    """
    keras_layers = tf.keras.layers
    keras_regularizers = tf.keras.regularizers

    stack = [
        keras_layers.LSTM(
            units=50,
            return_sequences=True,
            input_shape=(in_shape[0], in_shape[1]),
            kernel_regularizer=keras_regularizers.l1(l1=0.01),
        ),
        keras_layers.Dropout(0.3),
        keras_layers.GRU(
            units=50,
            return_sequences=False,
            kernel_regularizer=keras_regularizers.l2(l2=0.01),
        ),
        keras_layers.Dense(output_shape),
    ]
    network = tf.keras.models.Sequential(stack)

    network.compile(optimizer='adam', loss='mean_squared_error')
    network.summary()
    return network

# Alternative input shape for a pure LSTM variant: the close price of the last
# 60 days is used to predict the next day's close.
#inp_sp=[60,1]

# Shape used here: the closing price, the volume and the news sentiment of one
# day are used to predict the next day's closing price.
inp_sp, out_sp = [3, 1], 1

# NOTE: this rebinds `model`, which until this cell referred to the FinBERT
# classifier used by sentiment_score.
model = create_model(inp_sp, out_sp)

# Save the freshly created model.
model.save("predictor5.keras")

In [None]:
# Build the training, validation and test samples for the neural network.

# One feature row per day: that day's close, mean headline sentiment and volume
# (averaged over all rows of output_sentiment.csv sharing the date).
df = pd.read_csv('datasets/output_sentiment.csv')
feature_columns = ["close", "sentiment", "volume"]
average_sentiment_per_day = df.groupby('date')[feature_columns].mean().reset_index()
feature_rows = average_sentiment_per_day[feature_columns].values.tolist()

# List of dates, in the same order as feature_rows.
date_list = average_sentiment_per_day["date"].tolist()

# The labels come from the price table: for each feature row the label is the
# close of the row that FOLLOWS the row with the same date in a.csv.
df = pd.read_csv('datasets/a.csv')
df['date'] = pd.to_datetime(df['date'])

# Features and labels are collected in the same pass so they always stay
# aligned. A day is dropped -- at its own position, not from the end of the
# list -- when its date is absent from a.csv or when it is the last row of
# a.csv (there is no following row to take the label from).
dataset = []
next_positions = []
last_position = len(df) - 1
for day, features in zip(date_list, feature_rows):
    matching_indices = df.index[df['date'] == day].tolist()
    if not matching_indices or matching_indices[0] >= last_position:
        continue
    dataset.append(features)
    next_positions.append(matching_indices[0] + 1)

# Select all "next" rows in one go instead of growing a DataFrame inside the loop.
next_rows_df = df.iloc[next_positions]
correct_tmp = next_rows_df["close"].tolist()
assert len(dataset) == len(correct_tmp), "features and labels must stay aligned"

# Partition: rows 500-599 are held out as the test sample, the rest is
# shuffled and split 70 % / 30 % into training and validation data.
test_start, test_stop = 500, 600
test_sample = dataset[test_start:test_stop]
test_correct = correct_tmp[test_start:test_stop]
training_tmp = dataset[:test_start] + dataset[test_stop:]
correct_tmp1 = correct_tmp[:test_start] + correct_tmp[test_stop:]

# A dedicated, seeded generator makes the split reproducible across runs
# without touching the global random state.
split_seed = 42
indices = list(range(len(correct_tmp1)))
random.Random(split_seed).shuffle(indices)
split = int(len(indices) * 0.7)
train_indices, val_indices = indices[:split], indices[split:]

training = tf.constant([training_tmp[i] for i in train_indices])
correct = tf.constant([correct_tmp1[i] for i in train_indices])
val = tf.constant([training_tmp[i] for i in val_indices])
val_cor = tf.constant([correct_tmp1[i] for i in val_indices])

In [None]:

# Fit the model on the training data, validating on the held-out validation
# split, then save it (batch_size and epochs can be tuned to find better values).
history = model.fit(
    x=training,
    y=correct,
    batch_size=32,
    epochs=900,
    validation_data=(val, val_cor),
)
model.save("predictor5.keras")

In [None]:
#my kernel kept crashing because of matplotlib, so if you experience a similar issue uncomment the next 2 lines, it might help
#it is a specific problem I encountered, it might not be the same for everyone
#import os
#os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

#load the trained model from disk
model=tf.keras.models.load_model("predictor5.keras")

#held-out test rows and their labels (next close price)
test_sample=dataset[500:600]
test_correct=correct_tmp[500:600]

#x axis: one position per test row; derived from the data so the plot still
#works when fewer than 100 test rows are available
dataset_len=len(test_sample)
date=list(range(dataset_len))

#predict on a tensor, the same input type fit() was given for training and
#validation; a plain Python list is documented by Keras as "a list of arrays"
#for multi-input models, so a nested list is not a safe way to pass one batch
prediction=model.predict(tf.constant(test_sample))

#plot the predicted prices compared to the actual prices
plt.figure()
plt.plot(date, prediction, label='predictions')
plt.plot(date, test_correct, label='actual')
plt.title('prediction vs actual')
plt.xlabel('date')
plt.ylabel('prices')
plt.legend()
plt.show()

In [None]:
#calculate how much money the model makes us on our test sample
def _run_strategy(expected, actual, steps, cash):
  """Replay a simple buy rule and return the final amount of money.

  For every step i in 1..steps-1: if expected[i] is above actual[i-1] ("the
  price will go up"), all money is converted to shares at actual[i-1] and
  valued again at actual[i]; otherwise the money is left untouched.
  """
  for step in range(1, steps):
    previous_price = actual[step - 1]
    if expected[step] - previous_price > 0:
      shares = cash / previous_price
      cash = shares * actual[step]
  return cash

#what we make when we "buy" whenever the model predicts a higher price
predicted_close = [row[0] for row in prediction]
money = _run_strategy(predicted_close, test_correct, len(prediction), 10000)

#what we would have made if the model predicted perfectly, i.e. buying
#exactly when the stock really goes up
money_ideal = _run_strategy(test_correct, test_correct, len(prediction), 10000)

print("how much we made: ", money)
print("max amount we could have made if the model predicted perfectly: ",money_ideal)