In [1]:
#import libraries
import pandas as pd
import numpy as np
import re # for regular expressions
pd.set_option('display.max_columns', None)  # display every column of wide frames

# Load the scraped tweets and keep only the columns used further down.
# NOTE(review): the rendered text shows mojibake (e.g. "Letâs"), which suggests
# the file may really be UTF-8 rather than latin1 - confirm before changing it.
tweets = pd.read_csv("data/Kaggle/Tweets/elonmusk.csv", encoding='latin1')[
    ['Timestamp', 'text', 'Emojis', 'Comments', 'Likes', 'Retweets']
]
print(tweets.shape)
tweets.head()

(12206, 6)


Unnamed: 0,Timestamp,text,Emojis,Comments,Likes,Retweets
0,2010-06-04T18:31:57.000Z,"Please ignore prior tweets, as that was someon...",,873,648,5453
1,2011-12-01T09:55:11.000Z,Went to Iceland on Sat to ride bumper cars on ...,,30,24,188
2,2011-12-01T10:29:04.000Z,I made the volume on the Model S http://ow.ly/...,,29,17,78
3,2011-12-03T08:20:28.000Z,"Great Voltaire quote, arguably better than Twa...",,23,30,44
4,2011-12-03T08:22:07.000Z,That was a total non sequitur btw\n26\n14\n50,,26,14,50


In [2]:
# Keep only the tweets inside the test window [2022-01-01, 2022-02-09).
# Timestamp has not been parsed yet at this point, so both bounds are compared
# against the raw ISO-formatted values.
tweets = tweets.loc[
    (tweets['Timestamp'] >= '2022-01-01') & (tweets['Timestamp'] < '2022-02-09')
]
print("Tweets from: ",tweets.Timestamp.min()," to",tweets.Timestamp.max())

Tweets from:  2022-01-01T00:53:13.000Z  to 2022-02-08T20:59:40.000Z


In [3]:
def value_to_float(x):
    """Convert an abbreviated engagement count such as '22.4K' to a number.

    Args:
        x: Either a number (Python or numpy scalar), which is returned
            unchanged, or a string such as '5632', '1,234', '22.4K' or
            '1.5M'. Surrounding whitespace, thousands separators and the
            case of the K/M suffix are ignored.

    Returns:
        The numeric value: 'K' multiplies by 1,000 and 'M' by 1,000,000.
        A bare 'K' or 'M' yields 1000.0 or 1000000.0 respectively.

    Raises:
        ValueError: if the string is not a parsable number.
    """
    # Numbers pass through untouched. numpy scalars are included because
    # pandas hands those to the function for numeric columns, and NaN
    # (a float) therefore stays NaN.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return x

    # Strip separators first so that '1,200K' is parsed as 1200 * 1000.
    text = x.strip().replace(',', '').upper()
    for suffix, factor in (('K', 1000), ('M', 1000000)):
        if suffix in text:
            if len(text) > 1:
                return float(text.replace(suffix, '')) * factor
            return float(factor)
    return float(text)

# Derive numeric engagement columns from the abbreviated text counts.
for target, source in (('nlikes', 'Likes'), ('nretweets', 'Retweets'), ('nreplies', 'Comments')):
    tweets[target] = tweets[source].apply(value_to_float)

In [4]:
# Expose the raw text as 'tweet' and a parsed timestamp as 'date'.
tweets['tweet'] = tweets['text']
tweets['date'] = pd.to_datetime(tweets['Timestamp'])

In [5]:
#importing libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import TweetTokenizer

def new_feature_with_pattern(text, pattern):
    """Collect the matches of ``pattern`` found in ``text``.

    Every match returned by ``re.findall`` is tokenized again with the same
    pattern, so the result is a list with one token list per match (an
    empty list when nothing matches).
    """
    return [regexp_tokenize(match, pattern) for match in re.findall(pattern, text)]

def remove_pattern(input_txt, pattern):
    """Replace every match of ``pattern`` in ``input_txt`` with one space.

    Args:
        input_txt (str): text to process.
        pattern (str): regular expression describing what to remove.

    Returns:
        str: the text with each match replaced by a single space.
    """
    # re.sub substitutes the matches themselves, in place. Replacing the
    # matched substrings with str.replace instead would mangle the text when
    # one match is a prefix of a later one ("@a @ab" would leave a stray
    # "b"), and would replace only the captured group when the pattern
    # contains groups.
    return re.sub(pattern, " ", input_txt)


def clean(text):
    """Strip Twitter noise, short words and stop words from a tweet.

    URLs, "and N others" fragments, @handles, #hashtags, every character
    that is not an ASCII letter, and the "Replying to" prefix are removed.
    Of the remaining words, only those longer than three characters that
    are not English stop words are kept. The original casing of the kept
    words is preserved; no lemmatization happens here.

    Args:
        text (str): raw tweet text. Anything that is not a string (for
            example a missing value) is treated as an empty tweet.

    Returns:
        str: the surviving words joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    # Patterns are raw strings so that \w, \d and \( reach the regex engine
    # without relying on Python passing unknown string escapes through.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    handler_regex = r"@[\w]*"       # Twitter handles (@user)
    hashtag_regex = r"#[\w]*"
    punct_regex = r"[^a-zA-Z#]"     # punctuation, digits and special characters

    # Order matters: handles and hashtags must be removed before the
    # punctuation pass strips the '@' that identifies a handle.
    for pattern in (url_regex, r'and [\d] others', handler_regex,
                    hashtag_regex, punct_regex, 'Replying to'):
        text = remove_pattern(text, pattern)

    # Fetch the stop-word list once per call (not once per word) and use a
    # set for constant-time membership tests.
    stop_words = set(stopwords.words("english"))
    return ' '.join(
        w for w in text.split() if len(w) > 3 and w.lower() not in stop_words
    )


def tokenize(text):
    """Turn a raw tweet into a list of cleaned, lemmatized, lower-case tokens.

    The text is first passed through ``clean``, then split with nltk's
    ``word_tokenize``; each token is lemmatized, lower-cased and stripped.

    Args:
        text (str): raw tweet text.

    Returns:
        list: the processed tokens, in order of appearance.
    """
    words = word_tokenize(clean(text))
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(word).lower().strip() for word in words]


In [6]:
# Cleaned version of every tweet (see clean()).
tweets['clean_tweet'] = tweets['tweet'].map(clean)

In [7]:
# Bag-of-words vocabulary (taken from another notebook): 50 keywords, one
# count column is created per word below.
bow = (
    "actually best beta better cars coming earth "
    "energy engine engines exactly good great haha "
    "hard high launch like long make mars maybe "
    "model need needed people point probably production "
    "right rocket software solar soon spacex starship "
    "super sure team tesla test thanks time true "
    "week work world yeah year years"
).split()

In [8]:
# One column per vocabulary word holding its number of occurrences in the
# raw tweet text.
# NOTE(review): the count runs on the raw 'tweet' column, is case-sensitive and
# matches substrings ('world' is counted inside 'worldwide') - confirm this
# matches how the features were built in the training notebook.
for keyword in bow:
    tweets[keyword] = tweets['tweet'].str.count(keyword)

In [9]:
# Preview the frame with the engagement, clean-text and keyword-count columns.
tweets.head()

Unnamed: 0,Timestamp,text,Emojis,Comments,Likes,Retweets,nlikes,nretweets,nreplies,tweet,date,clean_tweet,actually,best,beta,better,cars,coming,earth,energy,engine,engines,exactly,good,great,haha,hard,high,launch,like,long,make,mars,maybe,model,need,needed,people,point,probably,production,right,rocket,software,solar,soon,spacex,starship,super,sure,team,tesla,test,thanks,time,true,week,work,world,yeah,year,years
11799,2022-01-02T18:19:33.000Z,Letâs make the roaring 20âs happen!\n22.4K...,,22.4K,30.9K,320.1K,30900.0,320100.0,22400.0,Letâs make the roaring 20âs happen!\n22.4K...,2022-01-02 18:19:33+00:00,make roaring happen,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11800,2022-01-02T17:56:50.000Z,Great work by Tesla team worldwide!\nTesla\n@T...,,5632,4768,66.3K,4768.0,66300.0,5632.0,Great work by Tesla team worldwide!\nTesla\n@T...,2022-01-02 17:56:50+00:00,Great work Tesla team worldwide Tesla Producti...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,1,1,0,0,0
11802,2022-01-03T22:04:59.000Z,Raptor 2 now operates routinely at 300 bar mai...,,2870,2602,57.6K,2602.0,57600.0,2870.0,Raptor 2 now operates routinely at 300 bar mai...,2022-01-03 22:04:59+00:00,Raptor operates routinely main chamber pressure,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
11803,2022-01-03T18:44:10.000Z,Replying to \n@jack\nReminds me of when I hex ...,,1434,754,14K,754.0,14000.0,1434.0,Replying to \n@jack\nReminds me of when I hex ...,2022-01-03 18:44:10+00:00,Reminds edited Ultima final maze,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
11804,2022-01-03T02:58:55.000Z,Replying to \n@auren\nThere is no way to be in...,,771,848,12K,848.0,12000.0,771.0,Replying to \n@auren\nThere is no way to be in...,2022-01-03 02:58:55+00:00,touch voters three generations away voting,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Character lengths of the raw and the cleaned tweet.
tweets['tweet'] = tweets['tweet'].astype(str)
tweets['tweet_len'] = tweets['tweet'].str.len()
tweets['clean_len'] = tweets['clean_tweet'].str.len()

In [11]:
# Flag tweets whose cleaned text contains any of the crypto / Tesla / SpaceX terms.
# A comma was missing between 'cryptocurrency' and 'doge'; Python silently
# concatenated them into 'cryptocurrencydoge', so 'doge' was never searched.
# NOTE(review): if the training notebook shares the old list, update it as well
# so that train and test features stay consistent.
searchforcrypto = ['crypto', 'cryptocurrency', 'doge', 'bitcoin', 'coin', 'hodl', 'eth', 'sol']
searchfortesla = ['tesla', '@Tesla', 'car', 'cars']
searchforspacex = ['spacex', 'rocket', 'starship', 'space', 'mars']

# NOTE(review): matching is case-sensitive and substring-based, while clean()
# keeps the original casing and strips @handles - so 'Tesla' does not match
# 'tesla' and '@Tesla' can never match. Confirm whether case=False is intended.
# na=False turns missing text into "no mention" instead of NaN.
tweets['mention_crypto'] = tweets['clean_tweet'].str.contains('|'.join(searchforcrypto), na=False).astype(int)
tweets['mention_tesla'] = tweets['clean_tweet'].str.contains('|'.join(searchfortesla), na=False).astype(int)
tweets['mention_spacex'] = tweets['clean_tweet'].str.contains('|'.join(searchforspacex), na=False).astype(int)
tweets[['mention_crypto','mention_tesla','mention_spacex']].describe()

Unnamed: 0,mention_crypto,mention_tesla,mention_spacex
count,238.0,238.0,238.0
mean,0.029412,0.037815,0.02521
std,0.169314,0.191151,0.157093
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,1.0,1.0


In [12]:
# Aggregate the per-tweet features to one row per calendar day.
tweets['date'] = pd.to_datetime(tweets.Timestamp).dt.date

# Engagement counts, mention flags and keyword counts are summed per day.
# The keyword columns come from `bow` rather than a second hand-written copy
# of the list, so the two cannot drift apart.
tweets_sum = tweets.groupby('date')[
    ['nlikes', 'nretweets', 'nreplies',
     'mention_crypto', 'mention_tesla', 'mention_spacex'] + bow
].sum().reset_index()

# The length features are averaged per day.
tweets_avg = tweets.groupby('date')[['tweet_len', 'clean_len']].mean().reset_index()
tweets_daily = pd.merge(tweets_sum, tweets_avg, on='date', how='left')

In [52]:
# Convert 'date' to datetime64 and split it into year, month and day columns.
tweets_daily['date'] = pd.to_datetime(tweets_daily['date'])
tweets_daily['year_t'] = tweets_daily['date'].dt.year
tweets_daily['month_t'] = tweets_daily['date'].dt.month
tweets_daily['day_t'] = tweets_daily['date'].dt.day



In [53]:
# Report the size of the daily feature table and persist it as the test features.
print(tweets_daily.shape)
tweets_daily.to_csv('Daily_Tweets_test.csv')  

(36, 62)


### Target (y_test): Bitcoin daily closing price

In [54]:
# Load the daily BTC/USD price history, using the 'Date' column as the index.
bitcoin=pd.read_csv('data/BTCUSD_dailydata.csv', index_col='Date')

In [55]:
# Move 'Date' back from the index into a regular column.
bitcoin = bitcoin.reset_index()

In [56]:
# Restrict the price history to the same window as the tweets:
# [2022-01-01, 2022-02-09).
bitcoin = bitcoin.loc[
    (bitcoin['Date'] >= '2022-01-01') & (bitcoin['Date'] < '2022-02-09')
]
print("Historical Price from: ",bitcoin.Date.min()," to",bitcoin.Date.max())

Historical Price from:  2022-01-01  to 2022-02-08


In [57]:
# Align the join key with tweets_daily: a column named 'date' holding datetimes.
bitcoin = (
    bitcoin
    .rename(columns={'Date':'date'})
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

In [58]:
# Attach the bitcoin prices to every day that has tweets (left join on 'date').
test_data = tweets_daily.merge(bitcoin, how='left', on='date')

In [59]:
# Sanity check: the left join should keep one row per day in tweets_daily.
test_data.shape

(36, 69)

In [60]:
# Persist the 'Close' price of the merged days as the test target.
test_data[['Close']].to_csv('Bitcoin_test.csv')

In [61]:
# Preview the merged test set instead of dumping the whole frame into the
# notebook output.
test_data.head(10)

Unnamed: 0,date,nlikes,nretweets,nreplies,mention_crypto,mention_tesla,mention_spacex,actually,best,beta,better,cars,coming,earth,energy,engine,engines,exactly,good,great,haha,hard,high,launch,like,long,make,mars,maybe,model,need,needed,people,point,probably,production,right,rocket,software,solar,soon,spacex,starship,super,sure,team,tesla,test,thanks,time,true,week,work,world,yeah,year,years,tweet_len,clean_len,year_t,month_t,day_t,High,Low,Open,Close,Volume,Adj Close,Vol_Fiat
0,2022-01-01,682.0,49726.0,1400.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,57.0,0.0,2022,1,1,47827.3125,46288.484375,46311.746094,47686.8125,24582670000.0,47686.8125,1172269000000000.0
1,2022-01-02,92368.0,859900.0,53632.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,1,1,0,0,0,95.666667,32.0,2022,1,2,47881.40625,46856.9375,47680.925781,47345.21875,27951570000.0,47345.21875,1323373000000000.0
2,2022-01-03,4992.0,98981.0,6439.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,89.0,24.5,2022,1,3,47510.726562,45835.964844,47343.542969,46458.117188,33071630000.0,46458.117188,1536446000000000.0
3,2022-01-04,46019.0,515572.0,32957.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,56.25,12.5,2022,1,4,47406.546875,45752.464844,46458.851562,45897.574219,42494680000.0,45897.574219,1950403000000000.0
4,2022-01-05,959.0,19149.0,1538.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,51.5,6.0,2022,1,5,46929.046875,42798.222656,45899.359375,43569.003906,36851080000.0,43569.003906,1605565000000000.0
5,2022-01-06,37465.0,443915.0,53045.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,49.75,14.5,2022,1,6,43748.71875,42645.539062,43565.511719,43160.929688,30208050000.0,43160.929688,1303807000000000.0
6,2022-01-07,10389.0,123013.0,16730.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,123.8,56.0,2022,1,7,43153.570312,41077.445312,43153.570312,41557.902344,84196610000.0,41557.902344,3499034000000000.0
7,2022-01-08,791.0,23900.0,1868.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,99.0,21.0,2022,1,8,42228.941406,40672.277344,41561.464844,41733.941406,28066360000.0,41733.941406,1171320000000000.0
8,2022-01-09,48100.0,419900.0,33900.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,41.0,16.5,2022,1,9,42663.949219,41338.160156,41734.726562,41911.601562,21294380000.0,41911.601562,892481800000000.0
9,2022-01-10,3527.0,50600.0,4781.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,125.5,55.5,2022,1,10,42199.484375,39796.570312,41910.230469,41821.261719,32104230000.0,41821.261719,1342640000000000.0
