In [1]:
import tweepy
from dotenv import load_dotenv
import os
import pandas as pd
import re
load_dotenv()
from pathlib import Path
from nltk.corpus import reuters, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
from wordcloud import WordCloud
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt

In [2]:
# read tweets
file_path = Path("Resources/BTC_Large.csv")
BTCTweets = pd.read_csv(file_path)
BTCTweets.head()

Unnamed: 0,time,tweet
0,2022-05-31 00:03:52+00:00,"RT @rovercrc: #Bitcoin has not reached $50,000..."
1,2022-05-31 00:03:53+00:00,RT @Next100XGEMS: I’m so #Bullish on #BTC 📈👀
2,2022-05-31 00:03:53+00:00,@cz_binance @BTC_Archive @wolfofcrypto89 you a...
3,2022-05-31 00:03:53+00:00,"RT @rovercrc: #Bitcoin has not reached $50,000..."
4,2022-05-31 00:03:54+00:00,RT @RoyalBlackCard: THE LEADING UTILITY NFT 🏆\...


In [3]:
# define functions to process tweets
#cleaning
def cleaning(df):
    cleaned_tweets=[]
    for word in df['tweet']:
        #convert to string
        tw=str(word)
        #remove web links
        tw=re.sub(r"http\S+", "",tw)
        tw=re.sub(r"www\S+","",tw)
        #Remove mentions
        tw=re.sub(r"@\S+", "", tw)
        #remove non alphabetic characters
        tw=re.sub("[^A-Za-z0-9]"," ",tw)
        #remove rt
        tw=re.sub(r"RT","",tw)
        cleaned_tweets.append(tw.lower())
    df['cleaned_tweets']=cleaned_tweets
    return

#tokenizer 
def tokenizer(df):
    sw = set(stopwords.words('english'))
    token_tweets=[]
    for tweet in df['cleaned_tweets']:
        words = word_tokenize(tweet)
        lem = [lemmatizer.lemmatize(word) for word in words]
        output=[word for word in lem if word not in sw]
        token_tweets.append(output)
    df['token_tweets']=token_tweets
    return

analyzer = SentimentIntensityAnalyzer()
def sentiment(df):
    compound=[]
    pos=[]
    neu=[]
    neg=[]
    for tweet in df['cleaned_tweets']:
        sentiment=analyzer.polarity_scores(tweet)
        compound.append(sentiment['compound'])
        pos.append(sentiment['pos'])
        neu.append(sentiment['neu'])
        neg.append(sentiment['neg'])
    df['compound']=compound
    df['pos']=pos
    df['neu']=neu
    df['neg']=neg
    return

def wc(df):
    biglist=[]
    for tweets in df['token_tweets']:
        biglist=biglist+tweets
    bigstring=' '.join(biglist)
    wc=WordCloud().generate(bigstring)
    return plt.imshow(wc)


In [4]:
# process tweets
cleaning(BTCTweets)
#  TODO include word embedding instead of tokenizer
tokenizer(BTCTweets)
sentiment(BTCTweets)

In [5]:
# prepare df to count total postive negative tweets per minute
BTCTweets['positive'] = 0
BTCTweets.loc[BTCTweets['pos'] > 0, 'positive'] = 1
BTCTweets['negative'] = 0
BTCTweets.loc[BTCTweets['neg'] > 0, 'negative'] = 1
BTCTweets.head(5)

Unnamed: 0,time,tweet,cleaned_tweets,token_tweets,compound,pos,neu,neg,positive,negative
0,2022-05-31 00:03:52+00:00,"RT @rovercrc: #Bitcoin has not reached $50,000...",bitcoin has not reached 50 000 so i will ...,"[bitcoin, ha, reached, 50, 000, 2, 500, giveaw...",0.8128,0.373,0.577,0.05,1,1
1,2022-05-31 00:03:53+00:00,RT @Next100XGEMS: I’m so #Bullish on #BTC 📈👀,i m so bullish on btc,"[bullish, btc]",0.0,0.0,1.0,0.0,0,0
2,2022-05-31 00:03:53+00:00,@cz_binance @BTC_Archive @wolfofcrypto89 you a...,you are a kahbesinnnn,[kahbesinnnn],0.0,0.0,1.0,0.0,0,0
3,2022-05-31 00:03:53+00:00,"RT @rovercrc: #Bitcoin has not reached $50,000...",bitcoin has not reached 50 000 so i will ...,"[bitcoin, ha, reached, 50, 000, 2, 500, giveaw...",0.8128,0.373,0.577,0.05,1,1
4,2022-05-31 00:03:54+00:00,RT @RoyalBlackCard: THE LEADING UTILITY NFT 🏆\...,the leading utility nft kyc audited t...,"[leading, utility, nft, kyc, audited, early, b...",0.0,0.0,1.0,0.0,0,0


In [6]:
# get only compound score
BTCTweets['time'] = pd.DatetimeIndex(BTCTweets['time'])
BTCTweetsCompoundScore = BTCTweets[BTCTweets['compound'] != 0]
BTCTweetsCompoundScore = BTCTweetsCompoundScore[['time','compound']]
BTCTweetsCompoundScore.head()

Unnamed: 0,time,compound
0,2022-05-31 00:03:52+00:00,0.8128
3,2022-05-31 00:03:53+00:00,0.8128
7,2022-05-31 00:03:54+00:00,0.5719
8,2022-05-31 00:03:54+00:00,-0.3818
12,2022-05-31 00:03:55+00:00,0.0772


In [7]:
# get mean compound score by minute basis
BTCTweetsCompoundScoreMean = BTCTweetsCompoundScore\
     .groupby(pd.Grouper(key="time", freq='T'),as_index=True)\
     .mean()
BTCTweetsCompoundScoreMean.head()

Unnamed: 0_level_0,compound
time,Unnamed: 1_level_1
2022-05-31 00:03:00+00:00,0.370121
2022-05-31 00:04:00+00:00,0.263006
2022-05-31 00:05:00+00:00,
2022-05-31 00:06:00+00:00,0.281656
2022-05-31 00:07:00+00:00,


In [8]:
BTCTweetsPositiveNegativeCount = BTCTweets[['time', 'positive', 'negative']]\
     .groupby(pd.Grouper(key="time", freq='T'),as_index=True)\
     .sum()
BTCTweetsPositiveNegativeCount.head()

Unnamed: 0_level_0,positive,negative
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-05-31 00:03:00+00:00,11,6
2022-05-31 00:04:00+00:00,27,18
2022-05-31 00:05:00+00:00,0,0
2022-05-31 00:06:00+00:00,41,18
2022-05-31 00:07:00+00:00,0,0


In [9]:
from pandas_datareader import data as pdr

import yfinance as yf
yf.pdr_override()

# download dataframe
tradesData = pdr.get_data_yahoo("BTC-USD", start='2022-05-31', end='2022-06-06', interval="1m")
tradesData.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-05-30 14:00:00+00:00,30469.806641,30469.806641,30469.806641,30469.806641,30469.806641,0
2022-05-30 14:01:00+00:00,30486.800781,30486.800781,30486.800781,30486.800781,30486.800781,0
2022-05-30 14:02:00+00:00,30484.949219,30484.949219,30484.949219,30484.949219,30484.949219,0
2022-05-30 14:03:00+00:00,30483.732422,30483.732422,30483.732422,30483.732422,30483.732422,1748992
2022-05-30 14:04:00+00:00,30480.753906,30480.753906,30480.753906,30480.753906,30480.753906,0


In [19]:
# concat all the data
merged_df = pd.concat([BTCTweetsPositiveNegativeCount, BTCTweetsCompoundScoreMean], join="inner", axis=1)
merged_df = pd.concat([merged_df, tradesData], join="inner", axis=1)

merged_df['actual_returns'] = merged_df['Close'].pct_change()
merged_df = merged_df.dropna()
merged_df['target_return'] = merged_df['actual_returns'].shift(-1)
merged_df = merged_df[['positive', 'negative', 'compound', 'actual_returns', 'target_return', 'Close', 'Volume']]
final_df = merged_df.dropna()

# output processed data to file
file_path = Path("Resources/final_df_without_y.csv")
final_df.to_csv(file_path)
final_df.head()

Unnamed: 0,positive,negative,compound,actual_returns,target_return,Close,Volume
2022-05-31 00:04:00+00:00,27,18,0.263006,0.000528,0.000418,31734.792969,33091584
2022-05-31 00:06:00+00:00,41,18,0.281656,0.000418,0.000285,31751.789062,8663040
2022-05-31 00:08:00+00:00,45,36,0.149611,0.000285,0.000119,31783.402344,10358784
2022-05-31 00:10:00+00:00,53,26,0.314945,0.000119,0.000544,31775.460938,69644288
2022-05-31 00:12:00+00:00,65,30,0.418451,0.000544,-0.00048,31796.03125,9519104
