In [1]:
#Initial Imports
from pathlib import Path
import pandas as pd
import numpy as np

import nlp

In [2]:
# Lower-case ticker symbol; the cells below interpolate it into the data file names.
stock = "tsla"

In [9]:
#Reading in StockTwits data
# infer_datetime_format is no longer passed: it is deprecated since pandas 2.0 and has no effect.
# NOTE(review): parse_dates=True only asks pandas to parse the index, and no index_col is given,
# so date-like columns are presumably still loaded as plain strings — confirm that is intended.
tweets = pd.read_csv(f'Data/{stock}_tweets.csv', parse_dates=True)

In [10]:
#Reading in Reddit data
# infer_datetime_format is no longer passed: it is deprecated since pandas 2.0 and has no effect.
# NOTE(review): parse_dates=True only asks pandas to parse the index, and no index_col is given,
# so date-like columns are presumably still loaded as plain strings — confirm that is intended.
reddit = pd.read_csv(f'Data/reddit_{stock}.csv', parse_dates=True)

In [11]:
#Calling function to make NLTK Sentiment score from StockTwits
# `n` stays a notebook-level name because the Reddit sentiment cell further down calls it again.
n = nlp.NLT()
# "twits" is the source label handed to make_sentiment_df — presumably it selects
# StockTwits-specific handling; verify in the nlp module.
nltk_df = n.make_sentiment_df(tweets,"twits")
# Show the first two scored rows as the cell output.
nltk_df.head(2)

Unnamed: 0,Created,Likes,NLTK_Compound,NLTK_Neg,NLTK_Neu,NLTK_Pos,Sentiment,Text
0,2021-01-16,150,0.6486,0.0,0.868,0.132,Bullish,SmartOptions® Unusual Activity Alert\n(Delayed...
1,2021-01-16,805,0.0,0.0,1.0,0.0,Bullish,$CLF $X $TSLA $NIO $VALE


In [12]:
#Adding TextBlob score to that dataframe
b = nlp.Blobby()

# add_blob receives the scored frame and the name of its text column; the displayed result
# carries the extra "Blob Class", "Blob Pos" and "Blob Neg" columns.
# NOTE(review): not visible here whether add_blob also modifies nltk_df in place — confirm.
nltk_blob_df = b.add_blob(nltk_df, "Text")
nltk_blob_df.head(2)

Unnamed: 0,Created,Likes,NLTK_Compound,NLTK_Neg,NLTK_Neu,NLTK_Pos,Sentiment,Text,Blob Class,Blob Pos,Blob Neg
0,2021-01-16,150,0.6486,0.0,0.868,0.132,Bullish,SmartOptions® Unusual Activity Alert\n(Delayed...,pos,0.894859,0.105141
1,2021-01-16,805,0.0,0.0,1.0,0.0,Bullish,$CLF $X $TSLA $NIO $VALE,pos,0.5,0.5


In [None]:
#Making some additional modifications to the dataframe
# Take an explicit copy so that adding "Blob Score" does not write into a slice of
# nltk_blob_df (this is what triggers pandas' SettingWithCopyWarning).
df_final = nltk_blob_df[["Created", "NLTK_Compound", "Blob Pos", "Blob Neg"]].copy()
# Blob Score is "Blob Pos" when it exceeds 0.5, otherwise "Blob Neg".
# NOTE(review): the score therefore does not record which of the two classes it came from —
# confirm this is the intended feature for the downstream model.
df_final["Blob Score"] = np.where(df_final["Blob Pos"] > 0.5, df_final["Blob Pos"], df_final["Blob Neg"])
df_final = df_final[["Created", "NLTK_Compound", "Blob Score"]]

df_final = df_final.sort_index()
# One row per "Created" value holding the mean of each score column.
df_final = df_final.groupby(["Created"]).mean()
# DataFrame.round returns a new frame; the result must be assigned or the rounding is lost.
df_final = df_final.round({"NLTK_Compound": 4,
                           "Blob Score": 4
                          })
df_final.head(2)

In [33]:
# Write the aggregated StockTwits sentiment frame to disk, keeping both the
# header row and the index column in the file.
df_final.to_csv(
    f'Data/tweets_{stock}_sentiment.csv',
    index=True,
    header=True,
)

In [16]:
#Calling function to make reddit sentiment
# Reuses the `n` analyzer instance created in the StockTwits sentiment cell; "Reddit" is the
# source label handed to make_sentiment_df (its exact effect lives in the nlp module).
sent_df = n.make_sentiment_df(reddit, "Reddit")

In [17]:
#Adding TextBlob score to that dataframe
b = nlp.Blobby()

# Note: this rebinds nltk_blob_df, replacing the StockTwits frame of the same name with the
# Reddit one. The displayed result carries "Blob Class", "Blob Pos" and "Blob Neg" columns.
nltk_blob_df = b.add_blob(sent_df, "Text")
nltk_blob_df.head(2)

Unnamed: 0,Created,NLTK_Compound,NLTK_Negative,NLTK_Neutral,NLTK_Positive,Text,Upvote_Ratio,Blob Class,Blob Pos,Blob Neg
0,2021-01-23,-0.4767,0.383,0.617,0.0,$BB: Why the shills are wrong,0.56,neg,0.356476,0.643524
1,2021-01-23,0.0772,0.099,0.785,0.115,Ultimate DD on $BB🚀🚀🪐 Join now so you can wipe...,0.9,pos,0.86021,0.13979


In [None]:
#Making some additional modifications to the dataframe
# Take an explicit copy so that adding "Blob Score" does not write into a slice of
# nltk_blob_df (this is what triggers pandas' SettingWithCopyWarning).
df_final = nltk_blob_df[["Created", "NLTK_Compound", "Blob Pos", "Blob Neg"]].copy()
# Blob Score is "Blob Pos" when it exceeds 0.5, otherwise "Blob Neg".
# NOTE(review): the score therefore does not record which of the two classes it came from —
# confirm this is the intended feature for the downstream model.
df_final["Blob Score"] = np.where(df_final["Blob Pos"] > 0.5, df_final["Blob Pos"], df_final["Blob Neg"])
df_final = df_final[["Created", "NLTK_Compound", "Blob Score"]]

df_final = df_final.sort_index()
# One row per "Created" value holding the mean of each score column.
df_final = df_final.groupby(["Created"]).mean()
# DataFrame.round returns a new frame; the result must be assigned or the rounding is lost.
df_final = df_final.round({"NLTK_Compound": 4,
                           "Blob Score": 4
                          })
df_final.head(2)

In [33]:
# Write the aggregated Reddit sentiment frame to disk, keeping both the
# header row and the index column in the file.
df_final.to_csv(
    f'Data/reddit_{stock}_sentiment.csv',
    index=True,
    header=True,
)

In [20]:
#Reading in stock data
f = Path("Data/alpaca_data.csv")
# infer_datetime_format is no longer passed: it is deprecated since pandas 2.0 and has no effect.
df = pd.read_csv(f, parse_dates=True)
#Running function to fix date
df = nlp.fix_date(df)
# Index the prices by "Created" and convert that index to datetimes; the sentiment frame's
# index is converted the same way before the two are merged on their indexes.
# Reassignment is used instead of inplace=True so the step reads as a plain transformation.
df = df.set_index("Created")
df.index = pd.to_datetime(df.index)
df.head(2)

Unnamed: 0_level_0,GME,NIO,PLTR,PLUG,TSLA
Created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02,6.31,3.72,,3.25,430.24
2020-01-03,5.89,3.83,,3.225,442.75


In [21]:
#Running function that combines sentiment from StockTwits and Reddit
# The paths are built from `stock` so they always match the sentiment CSVs written by the
# save cells above; the previous hard-coded "tsla" paths would silently combine the wrong
# files as soon as `stock` is changed.
dfs = nlp.combine(f"Data/reddit_{stock}_sentiment.csv", f"Data/tweets_{stock}_sentiment.csv")
# Convert the index to datetimes so it lines up with the price frame's datetime index.
dfs.index = pd.to_datetime(dfs.index)
dfs.head()

Unnamed: 0_level_0,NLTK_Compound,Blob Score
Created,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-05-07,0.397,0.629887
2018-05-08,0.158075,0.669236
2018-05-09,0.4278,0.503559
2018-05-10,0.0956,0.637904
2018-05-11,0.071833,0.641239


In [22]:
# Join the selected ticker's prices with the combined sentiment scores.
# The price columns are named with upper-case tickers, hence the upper-cased symbol.
upper_stock = stock.upper()
# Narrow the price frame to the single column of the chosen ticker.
df = df.loc[:, [upper_stock]]
# Index-on-index merge (pandas' default inner join): only dates present in both frames remain.
df_combined = pd.merge(df, dfs, left_index=True, right_index=True)

In [25]:
#Saving file (This is the file fed into the LSTM model)
# The f-string's opening and closing quotes now match; the previous mix of ' and " left the
# string unterminated, so the cell raised a SyntaxError instead of writing the file.
df_combined.to_csv(f'Data/Master/{upper_stock}.csv', header = True, index = True)