# Topic Modelling
- Post-processing tweets
- Applying LDA

## Load packages and tweets

In [1]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def csv_compiler(folder='./data/tweets_*.csv'):
    '''Import files as specified in input and collate into one DataFrame
    Args: folder, str. filepath relative to current folder, with optional pattern recognition
    Returns: DataFrame of collated data
    '''
    df = pd.concat([pd.read_csv('{}'.format(filepath), parse_dates=['date']) for filepath in glob.iglob(folder)],
                   ignore_index=True)
    df.sort_values("date", inplace=True)
    df.drop_duplicates(subset="tweet_id", ignore_index=True, inplace=True)
    return df

In [3]:
tweets = csv_compiler()

cases = pd.read_csv('./data/UKDailyConfirmedCases.csv')
cases['DateVal'] = pd.to_datetime(cases['DateVal'], format='%d/%m/%Y')
cases.rename(columns={'CMODateCount':'DailyCases'}, inplace=True)

## Preparing the Corpus

In [4]:
compileddoc = [tweet for tweet in tweets.text] # compile tweets into one list / corpus

In [6]:
import string

import nltk
from nltk.tokenize import word_tokenize

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
nltk.download('stopwords')

stopwords = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(document):
    '''Preprocessing each tweet'''
    stopwordremoval = " ".join([i for i in document.lower().split() if i not in stopwords])
    punctuationremoval = ''.join(ch for ch in stopwordremoval if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punctuationremoval.split())
    return normalized

final_doc = [clean(document).split() for document in compileddoc]

[nltk_data] Downloading package stopwords to /Users/Noah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Preparing the LDA Model

In [7]:
import gensim
from gensim import corpora
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize

dictionary = corpora.Dictionary(final_doc) # creating a term dictionary, where each term is assigned an index

DT_matrix = [dictionary.doc2bow(doc) for doc in final_doc] # converting the corpus into a document-term matrix

Lda_object = gensim.models.ldamodel.LdaModel # object for the LDA Model

## Training the LDA Model

In [10]:
lda_model_1 = Lda_object(DT_matrix, num_topics=10, id2word=dictionary) # training LDA on the DT Matrix
print(lda_model_1.print_topics(num_topics=10, num_words=1)) # printing topics found, and the words associated

[(0, '0.022*"covid19"'), (1, '0.038*"coronavirus"'), (2, '0.032*"covid19"'), (3, '0.041*"covid19"'), (4, '0.218*"covid"'), (5, '0.051*"cummings"'), (6, '0.070*"covid19"'), (7, '0.037*"death"'), (8, '0.067*"lockdown"'), (9, '0.031*"covid19"')]
