In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
def get_data(path, start='2020-09-26', end='2022-09-25'):
    """Load a tweet CSV and concatenate the tweets of each UTC hour.

    Parameters
    ----------
    path : str or path-like
        CSV file containing at least a 'Tweet' and a 'Date' column.
    start, end : str or datetime-like, optional
        Inclusive bounds of the time window, interpreted as UTC. A bare date
        means midnight, so the default ``end`` keeps nothing later than
        2022-09-25 00:00 UTC.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date' (hour bucket formatted as 'YYYY-MM-DD HH:00') and
        'Tweet' (all tweets of that hour joined by single spaces); one row
        per hour that has at least one tweet, sorted by 'Date'.
    """
    # Copy the column subset so the assignments below write to an independent
    # frame instead of a slice of the raw one (avoids SettingWithCopyWarning).
    tweets = pd.read_csv(path)[['Tweet', 'Date']].copy()
    tweets['Date'] = pd.to_datetime(tweets['Date'], utc=True)

    lower = pd.to_datetime(start, utc=True)
    upper = pd.to_datetime(end, utc=True)
    in_window = (tweets['Date'] >= lower) & (tweets['Date'] <= upper)
    tweets = tweets.loc[in_window].copy()

    # Truncate every timestamp to its hour; the formatted string is the key
    # the per-hour aggregation groups on.
    tweets['Date'] = tweets['Date'].dt.strftime('%Y-%m-%d %H:00')
    # Missing tweet bodies become empty strings so the join never sees NaN.
    tweets['Tweet'] = tweets['Tweet'].fillna('').astype(str)

    # Group on the column label itself, not a one-element list: with a list,
    # pandas >= 2.0 hands back 1-tuples as group keys.
    hourly = (
        tweets.groupby('Date', sort=True)['Tweet']
        .agg(' '.join)
        .reset_index()
    )
    return hourly

In [3]:
# Load and hour-aggregate the three tweet exports, in this order:
# bitcoin, generic crypto, ethereum.
btc_tweets, crypto_tweets, eth_tweets = map(
    get_data,
    ('./bitcoin_tweets.csv', './crypto_tweets.csv', './ethereum_tweets.csv'),
)

In [4]:
# Combine the bitcoin-specific and the generic crypto tweets hour by hour.
# The outer join keeps hours that appear in only one of the two sources; the
# missing side is filled with an empty string.
btc_all = pd.merge(btc_tweets, crypto_tweets, how='outer', on='Date').fillna('')
# Put a space between the two sources so the last word of one is not glued to
# the first word of the other; strip() drops the dangling space for hours
# where one side is empty.
btc_all['Tweet'] = (btc_all['Tweet_x'] + ' ' + btc_all['Tweet_y']).str.strip()
btc_all = btc_all.drop(columns=['Tweet_x', 'Tweet_y'])
btc_all.to_csv('./btc_all.csv', index=False)
btc_all.head()

Unnamed: 0,Date,Tweet
0,2020-09-26 00:00,I dunno. 505 sat at this Bitcoin price doesn't...
1,2020-09-26 02:00,Markaccy reached its highest price on Septembe...
2,2020-09-26 04:00,bitcoin will finish the monthly candle above 1...
3,2020-09-26 06:00,An asset that is human-made and starts at a pr...
4,2020-09-26 07:00,The strength of $BAND that can resist the pric...


In [5]:
# Combine the ethereum-specific and the generic crypto tweets hour by hour.
# The outer join keeps hours that appear in only one of the two sources; the
# missing side is filled with an empty string.
eth_all = pd.merge(eth_tweets, crypto_tweets, how='outer', on='Date').fillna('')
# Put a space between the two sources so the last word of one is not glued to
# the first word of the other; strip() drops the dangling space for hours
# where one side is empty.
eth_all['Tweet'] = (eth_all['Tweet_x'] + ' ' + eth_all['Tweet_y']).str.strip()
eth_all = eth_all.drop(columns=['Tweet_x', 'Tweet_y'])
eth_all.to_csv('./eth_all.csv', index=False)
eth_all.head()

Unnamed: 0,Date,Tweet
0,2020-09-26 19:00,Let me give you the price action for today TIM...
1,2020-09-28 07:00,Why traders are not worried that the KuCoin ha...
2,2020-09-30 15:00,I hope to markacy to soon listed big exchange ...
3,2020-10-01 16:00,"Without mania, without the dream of selling to..."
4,2020-10-06 13:00,crypto mining best ethereum pools pull ethereu...


In [6]:
def basic_cleaning(text):
    """Reduce ``text`` to ASCII letters and whitespace.

    Steps, in order:

    1. delete URLs: anything that starts with ``http://``, ``https://`` or
       ``www.`` and runs up to the next whitespace;
    2. replace every run of asterisks with the word ``swear``;
    3. delete every character that is neither an ASCII letter nor whitespace.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    # Remove the whole link, not only 'www.<name>.com' hosts; otherwise
    # shortened links survive as letter soup once punctuation is stripped.
    text = re.sub(r'(?:https?://|www\.)\S+', '', text)
    # Must run before the letters-only filter below, which would delete the
    # asterisks and leave nothing for this substitution to match.
    text = re.sub(r'\*+', 'swear', text)
    # No '|' inside the class: within [...] it is a literal pipe, not "or".
    text = re.sub(r'[^A-Za-z\s]', '', text)
    return text

def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    A span runs from a '<' to the nearest following '>'. It never crosses a
    line break, because '.' does not match a newline in this pattern.
    """
    return re.sub(r'<.*?>', '', text)

# Compiled once at import time. Each range is a specific emoji-related block;
# a single catch-all range from U+24C2 to U+1F251 would also cover CJK,
# Hangul and most other non-Latin scripts.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "\U0000200D"             # zero-width joiner used in emoji sequences
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji characters deleted.

    Only code points in the emoji-related blocks listed in ``_EMOJI_PATTERN``
    are removed; letters of any script, digits and punctuation are kept.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
    """
    return _EMOJI_PATTERN.sub('', text)

def remove_multiplechars(text):
    """Collapse every run of four or more identical characters to one.

    Runs of up to three characters are returned unchanged. Newlines are never
    collapsed, because '.' does not match a newline in this pattern.
    """
    repeated_run = re.compile(r'(.)\1{3,}')
    return repeated_run.sub(lambda run: run.group(1), text)

def cleaning_all(df, cols=('Tweet',)):
    """Clean the text columns of ``df`` in place.

    Each column is passed through ``remove_html``, ``remove_emoji``,
    ``basic_cleaning`` and ``remove_multiplechars``, in that order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns are overwritten.
    cols : iterable of str, optional
        Names of the columns to clean. Defaults to the single column 'Tweet'.

    Returns
    -------
    None
    """
    # Markup and emoji are stripped first: basic_cleaning deletes every
    # character that is not a letter or whitespace, including the '<', '>'
    # and emoji code points the other two helpers look for. Run after it,
    # they would never match and tag names would stay behind as plain words.
    # NOTE(review): remove_html deletes everything between a '<' and the next
    # '>' on the same line, so stray angle brackets in plain text take the
    # text between them along - confirm this is acceptable for the data.
    steps = (remove_html, remove_emoji, basic_cleaning, remove_multiplechars)
    for col in cols:
        # Missing values become '' rather than the literal string 'nan'.
        cleaned = df[col].fillna('').astype(str)
        for step in steps:
            cleaned = cleaned.apply(step)
        df[col] = cleaned

In [7]:
# Clean both merged frames in place, then overwrite the CSV exports with the
# cleaned text.
frames_and_paths = [(btc_all, './btc_all.csv'), (eth_all, './eth_all.csv')]

for frame, _ in frames_and_paths:
    cleaning_all(frame)

for frame, csv_path in frames_and_paths:
    frame.to_csv(csv_path, index=False)