In [164]:
import glob
import os
import datetime as dt
import nltk
import numpy as np
import pandas as pd

from pathlib import Path
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from pyspark.sql.functions import col, udf, unix_timestamp
from textblob import TextBlob

In [165]:
currency = [
    "DOGE"
    # "BTC",
    #"ETH",
    #"USDT",
    # "XRP",
    #"BCH",
    #"ADA",
    #"BSV",
    #"LTC",
    #"LINK",
    #"BNB",
    #"EOS",
    #"TRON",
]

In [166]:
class process_tweets():

    def __init__(self, tokenizer=None, stop_words=None, stemmer=None, lemmatizer=None):
        """ 
        Initialize the class.
        """
        self.path = Path(f'{os.getcwd()}')
        self.tokenizer = tokenizer
        self.stop_words = stop_words
        self.stemmer = stemmer
        self.lemmatizer = lemmatizer

        self.df = []
        self.vocabulary = []
        self.final = []

    # Read tweets from CSV for every currency
    def read_tweets(self, curr):
        """
        Read the tweets from the CSV file.
        """
        #initialize the dataframe
        aux = []
        ret = pd.DataFrame()
        for file in glob.glob(f"{self.path}/twitter_data/*/*-{curr}*.csv"):
            # print(f"Reading {file}")
            ret = pd.concat([pd.read_csv(file)], ignore_index=True)
            ret['coin_type'] = curr
            aux.append(ret)

        self.df = pd.concat(aux, ignore_index=True)
        # Print the number of rows
        print(f"Number of rows: {len(self.df)}")


    def clean_df(self):
        """
        Since I repeted the data mining multiple times, we expect duplicate of tweets.
        Keep the latess mined as the number of followers and retweets can chage.
        """
        self.df.sort_values(by=['created_at'], ascending=True)
        self.df.drop_duplicates(subset=['tweet_id'], keep='last', ignore_index=True)

        self.df.drop(columns=['tweet_id', 'name', 'screen_name', 'mined_at', 
                              'retweet_count', 'favourite_count', 'hashtags', 
                              'status_count', 'followers_count', 'location', 
                              'source_device', 'retweet_text'], inplace=True)


        self.df['created_at'] = pd.to_datetime(self.df['created_at'], format='%Y-%m-%d %H:%M')
        self.df['round_time'] = self.df['created_at'].dt.round('30min')
        self.df['round_time'] = self.df['round_time'].dt.strftime('%Y-%m-%d %H:%M')


    def clean_text(self, text):
        """
        Clean the text.
        """
        count = 0
        # Tokenize the text
        tokens = self.tokenizer.tokenize(text)
        count += len(tokens)
        # Remove stop words
        tokens = [token for token in tokens if token not in self.stop_words]
        # Case Folding
        tokens = [token.lower() for token in tokens]
        # Lemmatize the text
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens]
        # Add word to vocabulary
        self.vocabulary.extend(tokens)

        # Return the lemma text
        return " ".join(tokens), count

    def process_ccy(self):
        """
        Processes the currency data.
        """
        # Create a new column for text_clean and assign ""
        self.df["text_clean"] = ""

        count = 0
        # Create column for text_clean and process the text
        for index, row in self.df.iterrows():

            # Clean the text
            text_clean, words = self.clean_text(row['text'])
            count += words
            # Add the cleaned text to the dataframe
            self.df['text_clean'][index] = text_clean
        
        print(f"There are {count} words in the text")
        print(f"Number of unique words: {len(self.vocabulary)}")
    
    def drop_last(self):
        """
        Drop text and created_at columns.
        """
        self.df.drop(columns=['text', 'created_at'], inplace=True)


    def getSentiment(self, tweet) -> list:
        """
        Get the sentiment of the tweet.
        """
        analysis = TextBlob(tweet)

        return analysis.sentiment.polarity

In [167]:
processed = process_tweets(RegexpTokenizer(r'\w+'), 
                           stop_words=stopwords.words('english'), 
                           stemmer=SnowballStemmer("english"), 
                           lemmatizer=WordNetLemmatizer()
                           )
# Read data and concatenate to dataframe
for curr in currency:
    processed.read_tweets(curr)

processed.clean_df()
processed.process_ccy()
processed.drop_last()

# Create a dataframe with the sentiment
for curr in currency:
    processed.df['sentiment'] = processed.df['text_clean'].apply(processed.getSentiment)

# Rearange the columns 'coin_type', 'round_time', 'text_clean', 'sentiment'
processed.df = processed.df[['coin_type', 'round_time', 'text_clean', 'sentiment']]


# Print head of the dataframe
# print(f"Head of the dataframe: {processed.df.head()}")

Number of rows: 66251


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['text_clean'][index] = text_clean


There are 1398224 words in the text
Number of unique words: 1134537


### Truncating dataframe by hour and then grouping them by hour

In [168]:
hourly_sentiment_df = pd.DataFrame()

# for loop to get the mean of sentiment for each hour
for hour in processed.df['round_time'].unique():
    # Get the sentiment for each hour
    sentiment = processed.df[processed.df['round_time'] == hour]['sentiment']
    mean_sentiment = sentiment.mean()

    # Create a dataframe with the mean sentiment for each hour
    hourly_sentiment_df = hourly_sentiment_df.append({'hour': hour, 'sentiment': mean_sentiment}, ignore_index=True)

print(f"Head of the hourly sentiment dataframe: {hourly_sentiment_df.head()}")
print(f"Number of rows: {len(hourly_sentiment_df)}")

Head of the hourly sentiment dataframe:                hour  sentiment
0  2022-01-14 08:00   0.171108
1  2022-01-14 07:30   0.165510
2  2022-01-14 07:00   0.124129
3  2022-01-14 06:30   0.109723
4  2022-01-14 06:00   0.135061
Number of rows: 59


### Process Market Price for #COINS to the same format

In [81]:
from datetime import datetime

def convert_date(x):
    try:
        return datetime.strptime(x, '%Y-%m-%d').date()
    except:
        return None

ret = []
for curr in currency:
    for file in glob.glob(f"{processed.path}/market-price_data/{curr}*.csv"):
        ret = pd.concat([pd.read_csv(file)], ignore_index=True)
        ret['coin_type'] = curr

# Process TIME column to create 2 columns in dataframe with date and time
ret['date'] = ret['TIME'].apply(lambda x: x[:8])
# Change the format of date in ret from dd-mm-yy to yyyy-mm-dd
ret['date'] = ret['date'].apply(lambda x: datetime.strptime(x, '%d-%m-%y').date())

ret['time'] = ret['TIME'].apply(lambda x: x[9:14])
ret['time'] = ret['time'].apply(lambda x: datetime.strptime(x, '%H:%M').time())

# Drop columns that are not needed PAIR, TIME
ret.drop(columns=['PAIR', 'TIME'], inplace=True)
# Rearange columns to match the dataframe
ret = ret[['coin_type', 'date', 'time', 'BID', 'ASK']]

In [82]:
print(ret.head())

  coin_type        date      time      BID      ASK
0      DOGE  2022-01-15  23:30:00  0.18476  0.18488
1      DOGE  2022-01-15  23:00:00  0.18473  0.18483
2      DOGE  2022-01-15  22:30:00  0.18569  0.18570
3      DOGE  2022-01-15  22:00:00  0.18635  0.18641
4      DOGE  2022-01-15  21:30:00  0.18673  0.18685


In [83]:
# Change the order of columns 
# market_coin = ret[['coin_type', 'date', 'time', 'close']]
# Change name of column 'close' to 'price'
# market_coin.rename(columns={'close': 'price'}, inplace=True)

for index in range(len(processed.df)):
    processed.df[index] = processed.df[index][['coin_type', 'date', 'time', 'text_clean', 'sentiment']]

In [84]:
print(processed.df[0].head())

  coin_type        date      time  \
0      DOGE  2022-01-14  15:01:50   
1      DOGE  2022-01-14  15:01:50   
2      DOGE  2022-01-14  15:01:47   
3      DOGE  2022-01-14  15:01:45   
4      DOGE  2022-01-14  15:01:43   

                                          text_clean  sentiment  
0  rt bleufiofficial just 2 day launching happy a...   0.350000  
1  excellent trx mining signup bonus 3000 tron bt...   1.000000  
2  rt dogesecurity1 this real magic doge rt u dog...   0.750000  
3  rt kikuinubsc kikuarmy hitting that 1 98m mark...   0.068182  
4  rt taylormusk_ the next gem x100 bnb usdt doge...   0.000000  


In [162]:
# Create one big dataframe with all the data concatenated together
# df_final = []
# for index in range(len(processed.df)):
#     df_final = pd.concat([processed.df[index]])

# # print(df_final.head())

# # Group the data by date_time
# df_final = df_final.groupby(['date_time', 'sentiment', 'coin_type']).mean()
