In [1]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from dateutil.relativedelta import relativedelta
import re
from nltk.stem import *
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import gensim as gs
import pyLDAvis.gensim

In [2]:
directory_path = 'Tweets/'
directory = os.fsencode(directory_path)

In [3]:
data = []
for file in os.listdir(directory):
    fname = os.fsdecode(file)
    if fname.endswith(".json"): 
        with open(f'{directory_path}{fname}', "r") as file:
            temp = json.loads(file.read())
            temp = [json.loads(tweet) for tweet in temp]
        data.extend(temp)

In [4]:
# # set period under observation
# start_date = datetime(2023, 1, 1)
# end_date = datetime.today() - relativedelta(months=2)

# # Iterate through 1st of each month
# data = []
# current_date = start_date
# while current_date < end_date:
#     f_name = f'Tweets/tweets_{current_date.strftime('%b%y').lower()}.json'
#     with open(f_name, "r") as file:
#         temp = json.loads(file.read())
#         temp = [json.loads(tweet) for tweet in temp]
#     data.extend(temp)
#     # Increment to the 1st of the next month
#     current_date += relativedelta(months=+1)

In [5]:
# Convert list of tweets into a dataframe
tweets_df = pd.DataFrame(data)

# Keep only relevant columns
cols_to_keep = ['id', 'date', 'user', 'lang', 'rawContent']
tweets_df = tweets_df.loc[:, cols_to_keep]

tweets_user_df = pd.DataFrame(tweets_df['user'].tolist())
tweets_user_df.rename(columns={'id': 'user_id'}, inplace=True)

user_cols_keep = ['user_id', 'username', 'displayname', 'rawDescription']
tweets_user_df = tweets_user_df.loc[:, user_cols_keep]

# Concatenating the original DataFrame with the passages dataframe
tweets_df = pd.concat([tweets_df, tweets_user_df], axis=1)
tweets_df.drop(columns='user', inplace=True)

# Convert date to yyyy-mm-dd format
tweets_df['date'] = pd.to_datetime(tweets_df['date']).dt.strftime('%Y-%m-%d')

#Keep only tweets in english
tweets_df = tweets_df[tweets_df['lang'] == 'en']

tweets_df.drop_duplicates('rawContent', inplace=True)
tweets_df.reset_index(inplace=True, drop=True)
tweets_df.head()

Unnamed: 0,id,date,lang,rawContent,user_id,username,displayname,rawDescription
0,1785820420652085579,2024-05-01,en,"TD on Cenovus Energy's 1Q24 Conference Call\n""...",1213673551305199620,emmpeethree,The Napster,"Focusing on energy investing, #FIRE, churning ..."
1,1785676648953831445,2024-05-01,en,"DJ and TD on Cenovus Energy's 1Q24 \n""1Q24 fir...",1213673551305199620,emmpeethree,The Napster,"Focusing on energy investing, #FIRE, churning ..."
2,1785816061457358941,2024-05-01,en,"🚩Crude Oil -2.97%, with interest rates remaini...",1754316103852204032,ForeverRebate,Forever Rebate,"I have traded FX, Futures, and Stocks for over..."
3,1785814624455332253,2024-05-01,en,DJ on Imperial Oil's 1Q24 (Full report)\n“‘I’m...,1213673551305199620,emmpeethree,The Napster,"Focusing on energy investing, #FIRE, churning ..."
4,1785391289539973283,2024-04-30,en,"TD on Imperial Oil's 1Q24 Conference Call\n""Q1...",1213673551305199620,emmpeethree,The Napster,"Focusing on energy investing, #FIRE, churning ..."


In [6]:
remove_line = lambda x: re.sub('\n', '', x)
remove_handles = lambda x: re.sub(r'@[^\s]+','',x)
remove_urls = lambda x: re.sub(r'https?://\S+|www\.\S+','', x)
remove_hashtags = lambda x: re.sub(r'#\S+', '', x)
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001F4A0-\U0001F4AF"  # additional emoticons
                           u"\U0001F525"             # flame
                           u"\U0001F929"             # star-struck
                           "]+", flags=re.UNICODE)
remove_emojis = lambda x: emoji_pattern.sub(r'', x) # no emoji

stop_words_nltk = set(stopwords.words('english'))
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()

def preprocess_tweet(tweet):
    """
    Clean tweet content (removal of hanles, urls, hashtags and new line indicators)
    Tokenize and Lemmatize tweet contents
        Args:
        - tweet: tweet of type String
        Returns:
        - processed_tweets: A list of preprocessed tokens of type String.
    """
    tweeet_clean = tweet
    for func in (remove_line, remove_handles, remove_urls, remove_hashtags, remove_emojis):
        tweeet_clean = func(tweeet_clean)
    processed_tweets = [w for w in tokenizer.tokenize(tweeet_clean) if (w not in stop_words_nltk)]
    processed_tweets = [lemmatizer.lemmatize(w) for w in processed_tweets if len(w) > 2]    
    return processed_tweets

tweets_df['rawContent'] = tweets_df['rawContent'].apply(preprocess_tweet)
# tweets_df['rawDescription'] = tweets_df['rawDescription'].apply(preprocess_tweet)
tweets_df.drop_duplicates('rawContent', inplace=True)
tweets_df.reset_index(inplace=True, drop=True)
tweets_df.head()

Unnamed: 0,id,date,lang,rawContent,user_id,username,displayname,rawDescription
0,1785820420652085579,2024-05-01,en,"[Cenovus, Energy's, 1Q24, Conference, Call, CO...",1213673551305199620,emmpeethree,The Napster,"Focusing on energy investing, #FIRE, churning ..."
1,1785676648953831445,2024-05-01,en,"[Cenovus, Energy's, 1Q24, 1Q24, first, take, d...",1213673551305199620,emmpeethree,The Napster,"Focusing on energy investing, #FIRE, churning ..."
2,1785816061457358941,2024-05-01,en,"[Crude, Oil, -2.97, interest, rate, remaining,...",1754316103852204032,ForeverRebate,Forever Rebate,"I have traded FX, Futures, and Stocks for over..."
3,1785814624455332253,2024-05-01,en,"[Imperial, Oil's, 1Q24, Full, report, talking,...",1213673551305199620,emmpeethree,The Napster,"Focusing on energy investing, #FIRE, churning ..."
4,1785391289539973283,2024-04-30,en,"[Imperial, Oil's, 1Q24, Conference, Call, CONF...",1213673551305199620,emmpeethree,The Napster,"Focusing on energy investing, #FIRE, churning ..."


In [7]:
tweets_dict = gs.corpora.Dictionary(tweets_df['rawContent'].to_list())
count_bow_tweets = [tweets_dict.doc2bow(doc) for doc in tweets_df['rawContent'].to_list()]

# Instantiate model 
model = gs.models.LdaMulticore(
  count_bow_tweets,
  num_topics = 4,
  id2word = tweets_dict,
  passes = 10,
  workers = 2)

# save model to disk
model.save('lda.model')

In [8]:
pyLDAvis.enable_notebook()
topic_vis = pyLDAvis.gensim.prepare(model, count_bow_tweets, tweets_dict)
topic_vis

  pid = os.fork()
  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  return node.n
  if isinstance(node, ast.Num):  # <number>
  return node.n
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)
  EPOCH = datetime.datetime.utcfromtimestamp(0)


In [9]:
# Compute Coherence Score
coherence_model = gs.models.CoherenceModel(
    model=model,
    texts=tweets_df['rawContent'].to_list(),
    dictionary=tweets_dict,
    coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f'Coherence Score: {coherence_score}')

Coherence Score: 0.348107991414505


In [10]:
topics_prob = [model.get_document_topics(tweet, minimum_probability=0.0) for tweet in count_bow_tweets]
topic = [max(prob, key=lambda x: x[1])[0] for prob in topics_prob]

tweets_df['topic'] = topic
tweets_df.head()

Unnamed: 0,id,date,lang,rawContent,user_id,username,displayname,rawDescription,topic
0,1785820420652085579,2024-05-01,en,"[Cenovus, Energy's, 1Q24, Conference, Call, CO...",1213673551305199620,emmpeethree,The Napster,"Focusing on energy investing, #FIRE, churning ...",1
1,1785676648953831445,2024-05-01,en,"[Cenovus, Energy's, 1Q24, 1Q24, first, take, d...",1213673551305199620,emmpeethree,The Napster,"Focusing on energy investing, #FIRE, churning ...",1
2,1785816061457358941,2024-05-01,en,"[Crude, Oil, -2.97, interest, rate, remaining,...",1754316103852204032,ForeverRebate,Forever Rebate,"I have traded FX, Futures, and Stocks for over...",2
3,1785814624455332253,2024-05-01,en,"[Imperial, Oil's, 1Q24, Full, report, talking,...",1213673551305199620,emmpeethree,The Napster,"Focusing on energy investing, #FIRE, churning ...",1
4,1785391289539973283,2024-04-30,en,"[Imperial, Oil's, 1Q24, Conference, Call, CONF...",1213673551305199620,emmpeethree,The Napster,"Focusing on energy investing, #FIRE, churning ...",1


In [11]:
tweets_df.to_csv('tweets_topic.csv')