In [3]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from dateutil.relativedelta import relativedelta
import re
from nltk.stem import *
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import gensim as gs
import pyLDAvis.gensim

In [4]:
directory_path = 'Tweets/'
directory = os.fsencode(directory_path)

In [5]:
data = []
for file in os.listdir(directory):
    fname = os.fsdecode(file)
    if fname.endswith(".json"): 
        with open(f'{directory_path}{fname}', "r") as file:
            temp = json.loads(file.read())
            temp = [json.loads(tweet) for tweet in temp]
        data.extend(temp)

In [6]:
# # set period under observation
# start_date = datetime(2023, 1, 1)
# end_date = datetime.today() - relativedelta(months=2)

# # Iterate through 1st of each month
# data = []
# current_date = start_date
# while current_date < end_date:
#     f_name = f'Tweets/tweets_{current_date.strftime('%b%y').lower()}.json'
#     with open(f_name, "r") as file:
#         temp = json.loads(file.read())
#         temp = [json.loads(tweet) for tweet in temp]
#     data.extend(temp)
#     # Increment to the 1st of the next month
#     current_date += relativedelta(months=+1)

In [7]:
# Convert list of tweets into a dataframe
tweets_df = pd.DataFrame(data)

# Keep only relevant columns
cols_to_keep = ['id', 'date', 'user', 'lang', 'rawContent']
tweets_df = tweets_df.loc[:, cols_to_keep]

tweets_user_df = pd.DataFrame(tweets_df['user'].tolist())
tweets_user_df.rename(columns={'id': 'user_id'}, inplace=True)

user_cols_keep = ['user_id', 'username', 'displayname', 'rawDescription']
tweets_user_df = tweets_user_df.loc[:, user_cols_keep]

# Concatenating the original DataFrame with the passages dataframe
tweets_df = pd.concat([tweets_df, tweets_user_df], axis=1)
tweets_df.drop(columns='user', inplace=True)

# Convert date to yyyy-mm-dd format
tweets_df['date'] = pd.to_datetime(tweets_df['date']).dt.strftime('%Y-%m-%d')

#Keep only tweets in english
tweets_df = tweets_df[tweets_df['lang'] == 'en']

tweets_df.drop_duplicates('rawContent', inplace=True)
tweets_df.reset_index(inplace=True, drop=True)
tweets_df.head()

Unnamed: 0,id,date,lang,rawContent,user_id,username,displayname,rawDescription
0,1576358723295281152,2022-10-01,en,Goals? \n\n1) Energy security\n2) Energy affor...,1622604644,Patricia_Energy,Patricia Schouker,💡Energy & Security • Non-Resident Fellow @cosc...
1,1576354710415720448,2022-10-01,en,"💰 #SP500 3,585.62 📉 0%\n🚜 #DJIA 28,725.5...",1558597273386291200,EconSnapped,🤖 EconSnapped,Your economic update. Hourly.\n\n💰 Asset - Pr...
2,1576353909349941248,2022-10-01,en,@rickygervais Having this made for our new hom...,275455521,MarkSJordan,Mark,Married to the blonde.\nComplacency is the enemy
3,1576352995675021312,2022-10-01,en,SCOTT RITTER: The Onus Is on Biden https://t....,29962180,CarlosFGarciaG,Carlos Felipe García,Economía UCV. Magister CGU. Estudios de filoso...
4,1576352573623263233,2022-10-01,en,Local Victorian park sometimes looks pretty \n...,9921452,gn0me,Andrei Ianovskii,Views expressed here are my own and do not nec...


In [8]:
remove_line = lambda x: re.sub('\n', '', x)
remove_handles = lambda x: re.sub(r'@[^\s]+','',x)
remove_urls = lambda x: re.sub(r'https?://\S+|www\.\S+','', x)
remove_hashtags = lambda x: re.sub(r'#\S+', '', x)
emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001F4A0-\U0001F4AF"  # additional emoticons
                           u"\U0001F525"             # flame
                           u"\U0001F929"             # star-struck
                           "]+", flags=re.UNICODE)
remove_emojis = lambda x: emoji_pattern.sub(r'', x) # no emoji
remove_numerics = lambda x: re.sub(r'\w*\d\w*', '', x).strip()

stop_words_nltk = set(stopwords.words('english'))
tokenizer = TweetTokenizer()
lemmatizer = WordNetLemmatizer()

def preprocess_tweet(tweet):
    """
    Clean tweet content (removal of hanles, urls, hashtags and new line indicators)
    Tokenize and Lemmatize tweet contents
        Args:
        - tweet: tweet of type String
        Returns:
        - processed_tweets: A list of preprocessed tokens of type String.
    """
    tweeet_clean = tweet
    for func in (remove_line, remove_handles, remove_urls, remove_hashtags, remove_emojis, remove_numerics):
        tweeet_clean = func(tweeet_clean)
    processed_tweets = [w.lower() for w in tokenizer.tokenize(tweeet_clean) if (w.lower() not in stop_words_nltk)]
    processed_tweets = [lemmatizer.lemmatize(w) for w in processed_tweets if len(w) > 2]    
    return processed_tweets

tweets_df['rawContent'] = tweets_df['rawContent'].apply(preprocess_tweet)
# tweets_df['rawDescription'] = tweets_df['rawDescription'].apply(preprocess_tweet)
tweets_df.drop_duplicates('rawContent', inplace=True)
tweets_df.reset_index(inplace=True, drop=True)
tweets_df.head()

Unnamed: 0,id,date,lang,rawContent,user_id,username,displayname,rawDescription
0,1576358723295281152,2022-10-01,en,"[goal, energy, energy, affordability, energy, ...",1622604644,Patricia_Energy,Patricia Schouker,💡Energy & Security • Non-Resident Fellow @cosc...
1,1576354710415720448,2022-10-01,en,"[usd, eur]",1558597273386291200,EconSnapped,🤖 EconSnapped,Your economic update. Hourly.\n\n💰 Asset - Pr...
2,1576353909349941248,2022-10-01,en,"[made, new, home]",275455521,MarkSJordan,Mark,Married to the blonde.\nComplacency is the enemy
3,1576352995675021312,2022-10-01,en,"[scott, ritter, onus, biden]",29962180,CarlosFGarciaG,Carlos Felipe García,Economía UCV. Magister CGU. Estudios de filoso...
4,1576352573623263233,2022-10-01,en,"[local, victorian, park, sometimes, look, pret...",9921452,gn0me,Andrei Ianovskii,Views expressed here are my own and do not nec...


In [14]:
tweets_dict = gs.corpora.Dictionary(tweets_df['rawContent'].to_list())
count_bow_tweets = [tweets_dict.doc2bow(doc) for doc in tweets_df['rawContent'].to_list()]

In [27]:
# Instantiate model 
model = gs.models.LdaMulticore(
  count_bow_tweets,
  num_topics = 4,
  id2word = tweets_dict,
  passes = 10,
  workers = 2)

# Compute Coherence Score
coherence_model = gs.models.CoherenceModel(
    model=model,
    texts=tweets_df['rawContent'].to_list(),
    dictionary=tweets_dict,
    coherence='c_v')
coherence_score = coherence_model.get_coherence()
print(f'Coherence Score: {coherence_score}')

Coherence Score: 0.5336253514019408


In [16]:
# Visualise topics
pyLDAvis.enable_notebook()
topic_vis = pyLDAvis.gensim.prepare(model, count_bow_tweets, tweets_dict)
topic_vis

  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  return node.n
  if isinstance(node, ast.Num):  # <number>
  return node.n


In [17]:
topics_prob = [model.get_document_topics(tweet, minimum_probability=0.0) for tweet in count_bow_tweets]
topic = [max(prob, key=lambda x: x[1])[0] for prob in topics_prob]

tweets_df['topic'] = topic
tweets_df.head()

Unnamed: 0,id,date,lang,rawContent,user_id,username,displayname,rawDescription,topic
0,1576358723295281152,2022-10-01,en,"[goal, energy, energy, affordability, energy, ...",1622604644,Patricia_Energy,Patricia Schouker,💡Energy & Security • Non-Resident Fellow @cosc...,2
1,1576354710415720448,2022-10-01,en,"[usd, eur]",1558597273386291200,EconSnapped,🤖 EconSnapped,Your economic update. Hourly.\n\n💰 Asset - Pr...,2
2,1576353909349941248,2022-10-01,en,"[made, new, home]",275455521,MarkSJordan,Mark,Married to the blonde.\nComplacency is the enemy,2
3,1576352995675021312,2022-10-01,en,"[scott, ritter, onus, biden]",29962180,CarlosFGarciaG,Carlos Felipe García,Economía UCV. Magister CGU. Estudios de filoso...,1
4,1576352573623263233,2022-10-01,en,"[local, victorian, park, sometimes, look, pret...",9921452,gn0me,Andrei Ianovskii,Views expressed here are my own and do not nec...,0


In [18]:
# save topic assigned
tweets_df.to_csv('Data/tweets_topic.csv')

# save model to disk
model.save('Models/LDA/lda.model')