# Topic Modelling

The aim of this effort is to analyse topics highlighted by different countries in news related to the coronavirus pandemic. We are limited this analysis to a period from Dec 2019 to Aug 2020.

In [19]:
# Import Libraries

# We need NLTK and Gensim for LDA Topic Modelling
from nltk import word_tokenize, pos_tag
from gensim import matutils, models
import gensim
import scipy.sparse
import os # to access files for cleaning
from collections import Counter # to count word occurance
import re # Regix to remove punctuation from strings I split
from shutil import copyfile # For copying clean files
from sklearn.feature_extraction.text import CountVectorizer # For creating document-term matrix & excluding stop words
from sklearn.feature_extraction import text # For getting stop words
from wordcloud import WordCloud # For creating word clouds
from textblob import TextBlob # For sentiment analysis
import numpy as np # For dataframe analysis
import pandas as pd # For dataframe analysis
import matplotlib.pyplot as plt # For graphs
import seaborn as sns # For graphs
%matplotlib inline

In [2]:
yearMonths = ['201912', '202001', '202002', '202003', '202004', '202005', '202006', '202007', '202008']

## Using clean dataframe

In [3]:
df = pd.read_csv('covid_cleandf.csv')

In [4]:
df

Unnamed: 0,name,path,country,network,date,token_freq,text
0,20191204_AE_KhaleejTimes_GDELT75493.txt,Raw text/AEClean/20191204_AE_KhaleejTimes_GDEL...,AE,KhaleejTimes,20191204,16,WKND Inspired Living KT Home Videos Interactiv...
1,20191214_TR_AnadoluAgency_NEXIS720304.txt,Raw text/TRClean/20191214_TR_AnadoluAgency_NEX...,TR,AnadoluAgency,20191214,4,Virus transmitted to people from wild animals ...
2,20191218_US_TheNewHumanitarian_GNAPI57383.txt,Raw text/USClean/20191218_US_TheNewHumanitaria...,US,TheNewHumanitarian,20191218,4,The humanitarian sector has a trust problem Th...
3,20191218_US_USAToday_GNAPI57390.txt,Raw text/USClean/20191218_US_USAToday_GNAPI573...,US,USAToday,20191218,7,Alabama Mobile Researchers from the University...
4,20191222_AE_KhaleejTimes_GDELT119076.txt,Raw text/AEClean/20191222_AE_KhaleejTimes_GDEL...,AE,KhaleejTimes,20191222,12,WKND Inspired Living KT Home Videos Interactiv...
...,...,...,...,...,...,...,...
12825,20200819_KW_KUNA_GDELT162457.txt,Raw text/KWClean/20200819_KW_KUNA_GDELT162457.txt,KW,KUNA,20200819,6,LOC23272027 GMT KUWAIT April 23 KUNA UN Secret...
12826,20200819_KW_KUNA_GDELT169766.txt,Raw text/KWClean/20200819_KW_KUNA_GDELT169766.txt,KW,KUNA,20200819,3,LOC15121212 GMT ROME April 26 KUNA The Kuwaiti...
12827,20200819_KW_KUNA_GDELT174322.txt,Raw text/KWClean/20200819_KW_KUNA_GDELT174322.txt,KW,KUNA,20200819,5,LOC02532353 GMT KUWAIT March 6 KUNA The Kuwait...
12828,20200819_KW_KUNA_GDELT176072.txt,Raw text/KWClean/20200819_KW_KUNA_GDELT176072.txt,KW,KUNA,20200819,6,LOC00002100 GMT NEW YORK March 23 KUNA In an a...


## Functions

Defining required funtion at the top so they can be called multiple times later, whenever needed.

In [7]:
def cleanTextInDf(mystring):
    mystring = mystring.lower() # Text normalization: make string lowercase
    mystring = re.sub(r'[^\w\s]','', mystring) # Text normalization: remove punctuation
    return mystring

In [8]:
def checkYearMonth(row):
    value = row['date']
    return str(value)[0:6]

In [9]:
def combinedTextForCountryDf(country):
    index = 0
    dfCountryYrList = []
    for ym in yearMonths:
        combinedText = ' '.join(df[(df['yearmonth'] == ym) & (df['country'] == country)].text)
        dictCountryYr = {'country': country, 'yearmonth': ym, 'text': combinedText}
        dfCountryYrList.append(dictCountryYr)
    return dfCountryYrList

In [10]:
# Filter nouns and adjectives
def partsOfSpeechFilter(text):
    isNounAdj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nounsAdj = [word for (word, pos) in pos_tag(tokenized) if isNounAdj(pos)] 
    return ' '.join(nounsAdj)

In [17]:
def countryTopicsOverTime(country):
    countryYrList = combinedTextForCountryDf(country) # This function combines text and returns a list
    countryYrDf = pd.DataFrame(countryYrList) # convert list to dataframe
    dfna = pd.DataFrame(countryYrDf.text.apply(partsOfSpeechFilter))
    dfna['ym'] = yearMonths
    dfna.set_index('ym', inplace=True, drop=True)
    vectorizerNA = CountVectorizer(stop_words='english')
    dataVectorizerNA = vectorizerNA.fit_transform(dfna.text)
    dataDtmNA = pd.DataFrame(dataVectorizerNA.toarray(), columns = vectorizerNA.get_feature_names())
    dataDtmNA.index = dfna.index
    corpusNA = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(dataDtmNA.transpose()))
    id2wordNA = dict((v, k) for k, v in vectorizerNA.vocabulary_.items())
    ldaNA = models.LdaModel(corpus=corpusNA, num_topics=5, id2word=id2wordNA, passes=80)
    return [ldaNA.print_topics(), ldaNA, corpusNA]

In [12]:
# Cleaning the dataframe
cleanText = lambda text: cleanTextInDf(text) # Lambda function applies to all cells in a column
cleandf = pd.DataFrame(df.text.apply(cleanText)) # .apply() the function to all cells
df['text'] = cleandf['text']
df['yearmonth'] = df.apply(checkYearMonth, axis=1)
df

Unnamed: 0,name,path,country,network,date,token_freq,text,yearmonth
0,20191204_AE_KhaleejTimes_GDELT75493.txt,Raw text/AEClean/20191204_AE_KhaleejTimes_GDEL...,AE,KhaleejTimes,20191204,16,wknd inspired living kt home videos interactiv...,201912
1,20191214_TR_AnadoluAgency_NEXIS720304.txt,Raw text/TRClean/20191214_TR_AnadoluAgency_NEX...,TR,AnadoluAgency,20191214,4,virus transmitted to people from wild animals ...,201912
2,20191218_US_TheNewHumanitarian_GNAPI57383.txt,Raw text/USClean/20191218_US_TheNewHumanitaria...,US,TheNewHumanitarian,20191218,4,the humanitarian sector has a trust problem th...,201912
3,20191218_US_USAToday_GNAPI57390.txt,Raw text/USClean/20191218_US_USAToday_GNAPI573...,US,USAToday,20191218,7,alabama mobile researchers from the university...,201912
4,20191222_AE_KhaleejTimes_GDELT119076.txt,Raw text/AEClean/20191222_AE_KhaleejTimes_GDEL...,AE,KhaleejTimes,20191222,12,wknd inspired living kt home videos interactiv...,201912
...,...,...,...,...,...,...,...,...
12825,20200819_KW_KUNA_GDELT162457.txt,Raw text/KWClean/20200819_KW_KUNA_GDELT162457.txt,KW,KUNA,20200819,6,loc23272027 gmt kuwait april 23 kuna un secret...,202008
12826,20200819_KW_KUNA_GDELT169766.txt,Raw text/KWClean/20200819_KW_KUNA_GDELT169766.txt,KW,KUNA,20200819,3,loc15121212 gmt rome april 26 kuna the kuwaiti...,202008
12827,20200819_KW_KUNA_GDELT174322.txt,Raw text/KWClean/20200819_KW_KUNA_GDELT174322.txt,KW,KUNA,20200819,5,loc02532353 gmt kuwait march 6 kuna the kuwait...,202008
12828,20200819_KW_KUNA_GDELT176072.txt,Raw text/KWClean/20200819_KW_KUNA_GDELT176072.txt,KW,KUNA,20200819,6,loc00002100 gmt new york march 23 kuna in an a...,202008


## Topics for UAE

Running the first model for UAE, I'll do the same for the rest of the countries to get an idea of the topics.

In [13]:
# Get dataframe for UAE

countryYrList = combinedTextForCountryDf('AE') # This function combines text and returns a list
countryYrDf = pd.DataFrame(countryYrList) # convert list to dataframe
countryYrDf # Show the data frame

Unnamed: 0,country,yearmonth,text
0,AE,201912,wknd inspired living kt home videos interactiv...
1,AE,202001,wknd inspired living kt home videos interactiv...
2,AE,202002,wknd inspired living kt home videos interactiv...
3,AE,202003,international experts are questioning the scal...
4,AE,202004,over 50 deaths have been reported in iraq maki...
5,AE,202005,hamas has reportedly allowed medical teams fro...
6,AE,202006,uk pledges 160m in yemen aid before donor conf...
7,AE,202007,dubai 30th june 2020 wam as part of its keenne...
8,AE,202008,dubai 2nd august 2020 wam nasser bin thani al ...


In [14]:
# Apply parts of speech filter to filter out nouns and adjectives

dfna = pd.DataFrame(countryYrDf.text.apply(partsOfSpeechFilter))
dfna

Unnamed: 0,text
0,wknd living kt home videos kt podcast mars mis...
1,wknd living kt home videos kt podcast mars mis...
2,wknd living kt home videos kt podcast mars mis...
3,international experts scale new coronavirus ep...
4,deaths iraq worstaffected countries region wom...
5,hamas medical teams gaza coronavirusrelated tr...
6,uk pledges yemen aid donor conference un saudi...
7,dubai june wam part keenness communal solidari...
8,dubai august wam bin thani hamli minister huma...


In [15]:
# Make YearMonths the Index

dfna['ym'] = yearMonths
dfna.set_index('ym', inplace=True, drop=True)
dfna

Unnamed: 0_level_0,text
ym,Unnamed: 1_level_1
201912,wknd living kt home videos kt podcast mars mis...
202001,wknd living kt home videos kt podcast mars mis...
202002,wknd living kt home videos kt podcast mars mis...
202003,international experts scale new coronavirus ep...
202004,deaths iraq worstaffected countries region wom...
202005,hamas medical teams gaza coronavirusrelated tr...
202006,uk pledges yemen aid donor conference un saudi...
202007,dubai june wam part keenness communal solidari...
202008,dubai august wam bin thani hamli minister huma...


In [16]:
# Get the document term matrix

vectorizerNA = CountVectorizer(stop_words='english')
dataVectorizerNA = vectorizerNA.fit_transform(dfna.text)
dataDtmNA = pd.DataFrame(dataVectorizerNA.toarray(), columns = vectorizerNA.get_feature_names())
dataDtmNA.index = dfna.index
dataDtmNA # This is the document term matrix

Unnamed: 0_level_0,1000kilometrelong,10day,10th,10year,13th,14day,14th,150mile,17th,1900year,...,ومد,ونزاهة,ونقلهم,ووقاية,و١٢,يأمر,يتعب,يد,يعبر,يكل
ym,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201912,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202001,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
202002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
202003,0,0,1,0,1,2,0,0,0,0,...,1,0,1,1,0,1,0,1,2,0
202004,0,1,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
202005,3,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
202006,0,0,1,1,0,2,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
202007,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1
202008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Map ID to words

corpusNA = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(dataDtmNA.transpose()))
id2wordNA = dict((v, k) for k, v in vectorizerNA.vocabulary_.items())

In [90]:
# Run the LDA model

ldaNA = models.LdaModel(corpus=corpusNA, num_topics=5, id2word=id2wordNA, passes=80)
ldaNA.print_topics()

[(0,
  '0.015*"coronavirus" + 0.014*"uae" + 0.011*"covid19" + 0.008*"people" + 0.008*"pandemic" + 0.007*"al" + 0.007*"health" + 0.007*"world" + 0.006*"humanitarian" + 0.006*"dubai"'),
 (1,
  '0.010*"august" + 0.007*"uae" + 0.007*"india" + 0.006*"beirut" + 0.006*"air" + 0.005*"express" + 0.005*"kozhikode" + 0.005*"flight" + 0.004*"covid19" + 0.004*"lebanon"'),
 (2,
  '0.014*"uae" + 0.013*"coronavirus" + 0.011*"covid19" + 0.009*"pandemic" + 0.007*"al" + 0.007*"people" + 0.006*"humanitarian" + 0.006*"world" + 0.006*"dubai" + 0.006*"health"'),
 (3,
  '0.018*"coronavirus" + 0.011*"uae" + 0.009*"people" + 0.008*"health" + 0.007*"covid19" + 0.007*"humanitarian" + 0.006*"iran" + 0.006*"world" + 0.006*"countries" + 0.005*"country"'),
 (4,
  '0.011*"august" + 0.010*"times" + 0.010*"syria" + 0.008*"kt" + 0.007*"khaleej" + 0.006*"dubai" + 0.006*"uae" + 0.006*"coronavirus" + 0.006*"ktinsyria" + 0.005*"video"')]

### Topics for UAE:

* 0: Coronavirus Pandemic
* 1: Beirut/Lebanon in August + Flights/Travel
* 2: Coronavirus and Public Health
* 3: Coronavirus
* 4: Syria + Coronavirus

In [91]:
# Map the topics to months - each row of the list should represent the mapped topic.

transformedCorpus = ldaNA[corpusNA]
for tc in transformedCorpus:
    print(tc)

[(0, 0.023316061), (2, 0.1404689), (4, 0.83447915)]
[(0, 0.011764727), (2, 0.40063396), (3, 0.4995596), (4, 0.08792532)]
[(2, 0.99432164)]
[(3, 0.99389523)]
[(0, 0.998874)]
[(0, 0.9935542)]
[(2, 0.99228925)]
[(2, 0.99600726)]
[(0, 0.33353826), (1, 0.43542117), (2, 0.22957814)]


* Dec 2019: Syria + Coronavirus
* Jan 2020: Coronavirus and Public Health
* Feb 2020: Coronavirus and Public Health
* Mar 2020: Coronavirus
* Apr 2020: Coronavirus
* May 2020: Coronavirus
* Jun 2020: Coronavirus and Public Health
* Jul 2020: Coronavirus and Public Health
* Aug 2020: Beirut/Lebanon in August + Flights/Travel + Coronavirus

## Topics for Kuwait

I'll run the same analysis for Kuwait

In [101]:
kuwaitModel = countryTopicsOverTime('KW')
kuwaitModel[0]

[(0,
  '0.000*"kuwait" + 0.000*"health" + 0.000*"international" + 0.000*"countries" + 0.000*"coronavirus" + 0.000*"people" + 0.000*"approaches" + 0.000*"sake" + 0.000*"reflection" + 0.000*"enhanced"'),
 (1,
  '0.000*"kuwait" + 0.000*"people" + 0.000*"countries" + 0.000*"covid19" + 0.000*"health" + 0.000*"humanitarian" + 0.000*"international" + 0.000*"coronavirus" + 0.000*"government" + 0.000*"global"'),
 (2,
  '0.000*"kuwait" + 0.000*"humanitarian" + 0.000*"international" + 0.000*"unresolved" + 0.000*"fervent" + 0.000*"occupation" + 0.000*"solace" + 0.000*"feed" + 0.000*"corners" + 0.000*"akhm"'),
 (3,
  '0.016*"kuwait" + 0.009*"people" + 0.009*"health" + 0.009*"coronavirus" + 0.008*"countries" + 0.008*"covid19" + 0.007*"humanitarian" + 0.007*"government" + 0.007*"international" + 0.006*"pandemic"'),
 (4,
  '0.000*"kuwait" + 0.000*"coronavirus" + 0.000*"health" + 0.000*"international" + 0.000*"humanitarian" + 0.000*"covid19" + 0.000*"tomorrow" + 0.000*"august" + 0.000*"relentless" + 0.

### Topics for Kuwait:

* 0: Coronavirus and Public Health
* 1: Coronavirus and Public Health
* 2: Humanitarian efforts
* 3: Public Health and Safety + International efforts
* 4: Humanitarian efforts + Coronavirus

In [102]:
ldaNA = kuwaitModel[1]
corpusNA = kuwaitModel[2]
transformedCorpus = ldaNA[corpusNA]
for tc in transformedCorpus:
    print(tc)

[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(3, 0.999816)]
[(3, 0.9997828)]
[(3, 0.99979717)]
[(3, 0.9994398)]
[(3, 0.99963075)]
[(3, 0.999075)]


* Dec 2019: Coronavirus and Public Health + Humanitarian Efforts
* Jan 2020: Coronavirus and Public Health + Humanitarian Efforts
* Feb 2020: Coronavirus and Public Health + Humanitarian Efforts
* Mar 2020: Coronavirus and Public Health + Humanitarian Efforts
* Apr 2020: Coronavirus and Public Health + Humanitarian Efforts
* May 2020: Coronavirus and Public Health + Humanitarian Efforts
* Jun 2020: Coronavirus and Public Health + Humanitarian Efforts
* Jul 2020: Coronavirus and Public Health + Humanitarian Efforts
* Aug 2020: Coronavirus and Public Health + Humanitarian Efforts

## Topics for Turkey

I'll run the same analysis for Turkey

In [103]:
turkeyModel = countryTopicsOverTime('TR')
turkeyModel[0]

[(0,
  '0.013*"coronavirus" + 0.011*"cases" + 0.011*"people" + 0.010*"health" + 0.009*"virus" + 0.009*"country" + 0.008*"covid19" + 0.008*"new" + 0.008*"world" + 0.008*"countries"'),
 (1,
  '0.000*"pandemic" + 0.000*"coronavirus" + 0.000*"countries" + 0.000*"cases" + 0.000*"covid19" + 0.000*"people" + 0.000*"turkey" + 0.000*"health" + 0.000*"country" + 0.000*"world"'),
 (2,
  '0.009*"economic" + 0.007*"covid19" + 0.006*"beirut" + 0.006*"region" + 0.006*"lebanon" + 0.006*"turkey" + 0.005*"thursday" + 0.005*"pandemic" + 0.005*"tuesday" + 0.004*"coronavirus"'),
 (3,
  '0.015*"coronavirus" + 0.011*"people" + 0.010*"cases" + 0.010*"covid19" + 0.009*"pandemic" + 0.009*"countries" + 0.009*"country" + 0.009*"health" + 0.009*"turkey" + 0.008*"world"'),
 (4,
  '0.000*"coronavirus" + 0.000*"countries" + 0.000*"cases" + 0.000*"turkey" + 0.000*"people" + 0.000*"covid19" + 0.000*"health" + 0.000*"pandemic" + 0.000*"virus" + 0.000*"new"')]

### Topics for Turkey:

* 0: Coronavirus and Rising Cases
* 1: Coronavirus and Public Health
* 2: Beirut/Lebanon + Coronavirus
* 3: Coronavirus and Public Health
* 4: Coronavirus and Public Health

In [104]:
ldaNA = turkeyModel[1]
corpusNA = turkeyModel[2]
transformedCorpus = ldaNA[corpusNA]
for tc in transformedCorpus:
    print(tc)

[(0, 0.997352)]
[(0, 0.9976034)]
[(0, 0.99802905)]
[(0, 0.99777466)]
[(3, 0.99824154)]
[(3, 0.9997316)]
[(0, 0.99816126)]
[(0, 0.9895916), (3, 0.010350898)]
[(0, 0.082289845), (2, 0.6626386), (3, 0.2549784)]


* Dec 2019: Coronavirus and Rising Cases
* Jan 2020: Coronavirus and Rising Cases
* Feb 2020: Coronavirus and Rising Cases
* Mar 2020: Coronavirus and Rising Cases
* Apr 2020: Coronavirus and Public Health
* May 2020: Coronavirus and Public Health
* Jun 2020: Coronavirus and Rising Cases
* Jul 2020: Coronavirus and Rising Cases
* Aug 2020: Coronavirus and Public Health + Beirut

## Topics for US

I'll run the same analysis for US

In [105]:
usModel = countryTopicsOverTime('US')
usModel[0]

[(0,
  '0.000*"people" + 0.000*"coronavirus" + 0.000*"health" + 0.000*"new" + 0.000*"country" + 0.000*"president" + 0.000*"virus" + 0.000*"covid19" + 0.000*"government" + 0.000*"world"'),
 (1,
  '0.011*"people" + 0.007*"new" + 0.007*"health" + 0.007*"coronavirus" + 0.006*"cases" + 0.006*"pandemic" + 0.005*"covid19" + 0.005*"president" + 0.005*"country" + 0.004*"virus"'),
 (2,
  '0.012*"people" + 0.010*"coronavirus" + 0.008*"health" + 0.007*"new" + 0.005*"pandemic" + 0.005*"world" + 0.005*"covid19" + 0.005*"virus" + 0.005*"country" + 0.005*"president"'),
 (3,
  '0.012*"people" + 0.010*"coronavirus" + 0.007*"health" + 0.007*"new" + 0.006*"virus" + 0.005*"president" + 0.005*"country" + 0.005*"cases" + 0.004*"world" + 0.004*"government"'),
 (4,
  '0.013*"people" + 0.007*"president" + 0.006*"new" + 0.005*"coronavirus" + 0.005*"trump" + 0.004*"country" + 0.004*"health" + 0.004*"government" + 0.004*"world" + 0.004*"time"')]

### Topics for US:

* 0: Coronavirus and Public Health + President's messages
* 1: Coronavirus and Public Health + President's messages
* 2: Coronavirus and Public Health + President's messages
* 3: Coronavirus and Public Health
* 4: Novel/New Virus + President's messages

In [107]:
ldaNA = usModel[1]
corpusNA = usModel[2]
transformedCorpus = ldaNA[corpusNA]
for tc in transformedCorpus:
    print(tc)

[(1, 0.4213007), (2, 0.057244007), (3, 0.5008299), (4, 0.020572944)]
[(1, 0.035445776), (2, 0.34658325), (3, 0.14308782), (4, 0.47486943)]
[(3, 0.02574117), (4, 0.96920377)]
[(2, 0.03514284), (3, 0.9528601)]
[(2, 0.97344714), (3, 0.017897649)]
[(1, 0.0871119), (2, 0.90506136)]
[(1, 0.9518598), (2, 0.03857566)]
[(1, 0.9640164), (2, 0.02871547)]
[(1, 0.08472384), (2, 0.04751099), (4, 0.86682534)]


* Dec 2019: Coronavirus and Public Health + President's messages
* Jan 2020: Novel Coronavirus and Public Health + President's messages
* Feb 2020: Novel/New Virus + President's messages
* Mar 2020: Coronavirus and Public Health
* Apr 2020: Coronavirus and Public Health + President's messages
* May 2020: Coronavirus and Public Health + President's messages
* Jun 2020: Coronavirus and Public Health
* Jul 2020: Coronavirus and Public Health
* Aug 2020: Novel/New Virus + President's messages

## Topics for UK

I'll run the same analysis for UK

In [20]:
ukModel = countryTopicsOverTime('UK')
ukModel[0]

[(0,
  '0.011*"coronavirus" + 0.010*"people" + 0.009*"virus" + 0.008*"gmt" + 0.008*"china" + 0.007*"health" + 0.007*"cases" + 0.006*"new" + 0.006*"outbreak" + 0.005*"government"'),
 (1,
  '0.015*"coronavirus" + 0.012*"people" + 0.009*"health" + 0.008*"new" + 0.008*"cases" + 0.007*"bst" + 0.007*"covid19" + 0.006*"government" + 0.006*"country" + 0.006*"world"'),
 (2,
  '0.012*"coronavirus" + 0.012*"people" + 0.009*"cases" + 0.009*"new" + 0.008*"health" + 0.007*"covid19" + 0.007*"country" + 0.006*"bst" + 0.006*"government" + 0.005*"world"'),
 (3,
  '0.015*"coronavirus" + 0.013*"people" + 0.009*"cases" + 0.008*"health" + 0.008*"new" + 0.007*"government" + 0.007*"virus" + 0.007*"country" + 0.006*"gmt" + 0.005*"covid19"'),
 (4,
  '0.013*"cases" + 0.013*"coronavirus" + 0.012*"people" + 0.010*"new" + 0.009*"health" + 0.009*"covid19" + 0.007*"government" + 0.006*"country" + 0.006*"bst" + 0.006*"pandemic"')]

### Topics for UK

1. Topic 0: Coronavirus, China
2. Topic 1: Coronavirus and Public Health + Rising Cases
3. Topic 2: Coronavirus and Public Health + Rising Cases
4. Topic 3: Coronavirus and Public Health + Rising Cases
5. Topic 4: Coronavirus and Public Health + Rising Cases

In [22]:
ldaNA = ukModel[1]
corpusNA = ukModel[2]
transformedCorpus = ldaNA[corpusNA]
for tc in transformedCorpus:
    print(tc)

[(0, 0.4484221), (1, 0.050511863), (2, 0.26005825), (3, 0.04436346), (4, 0.19664435)]
[(0, 0.85653275), (1, 0.024458837), (3, 0.09251843), (4, 0.021701824)]
[(0, 0.77981746), (1, 0.06085252), (3, 0.12730305), (4, 0.02264195)]
[(1, 0.014239696), (3, 0.9804953)]
[(1, 0.9825007)]
[(1, 0.12103872), (2, 0.8390095), (3, 0.015606011), (4, 0.024345197)]
[(1, 0.029820552), (2, 0.014529348), (4, 0.94949174)]
[(4, 0.98502755)]
[(2, 0.98941207)]


* Dec 2019: Coronavirus, China
* Jan 2020: Coronavirus, China
* Feb 2020: Coronavirus, China
* Mar 2020: Coronavirus and Public Health + Rising Cases
* Apr 2020: Coronavirus and Public Health + Rising Cases
* May 2020: Coronavirus and Public Health + Rising Cases
* Jun 2020: Coronavirus and Public Health + Rising Cases
* Jul 2020: Coronavirus and Public Health + Rising Cases
* Aug 2020: Coronavirus and Public Health + Rising Cases

## Topics for China

In [27]:
cnModel = countryTopicsOverTime('CN')
cnModel[0]

[(0,
  '0.000*"china" + 0.000*"covid19" + 0.000*"countries" + 0.000*"health" + 0.000*"chinese" + 0.000*"international" + 0.000*"world" + 0.000*"pandemic" + 0.000*"global" + 0.000*"country"'),
 (1,
  '0.012*"medical" + 0.011*"coronavirus" + 0.010*"novel" + 0.010*"china" + 0.010*"chinese" + 0.010*"wuhan" + 0.008*"new" + 0.007*"humanitarian" + 0.006*"year" + 0.005*"xinhua"'),
 (2,
  '0.000*"china" + 0.000*"covid19" + 0.000*"health" + 0.000*"countries" + 0.000*"people" + 0.000*"pandemic" + 0.000*"chinese" + 0.000*"virus" + 0.000*"world" + 0.000*"coronavirus"'),
 (3,
  '0.019*"china" + 0.016*"covid19" + 0.012*"countries" + 0.011*"health" + 0.010*"pandemic" + 0.009*"people" + 0.009*"world" + 0.009*"global" + 0.008*"chinese" + 0.008*"international"'),
 (4,
  '0.022*"china" + 0.012*"chinese" + 0.011*"coronavirus" + 0.010*"covid19" + 0.010*"people" + 0.008*"health" + 0.008*"medical" + 0.008*"countries" + 0.008*"world" + 0.007*"virus"')]

### Topics for China

1. Topic 0: Coronavirus and public health
2. Topic 1: Wuhan, Novel Coronavirus
3. Topic 2: Coronavirus
4. Topic 3: Coronavirus and public health
5. Topic 4: Coronavirus and public health

In [28]:
ldaNA = cnModel[1]
corpusNA = cnModel[2]
transformedCorpus = ldaNA[corpusNA]
for tc in transformedCorpus:
    print(tc)

[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(1, 0.6419108), (3, 0.15890743), (4, 0.19911174)]
[(4, 0.99812967)]
[(3, 0.072167195), (4, 0.9278218)]
[(3, 0.9740071), (4, 0.025986535)]
[(3, 0.99956405)]
[(3, 0.999438)]
[(3, 0.9998687)]
[(3, 0.9998798)]


* Dec 2019: Coronavirus and public health
* Jan 2020: Wuhan, Novel Coronavirus
* Feb 2020: Coronavirus and public health
* Mar 2020: Coronavirus and public health
* Apr 2020: Coronavirus and public health
* May 2020: Coronavirus and public health
* Jun 2020: Coronavirus and public health
* Jul 2020: Coronavirus and public health
* Aug 2020: Coronavirus and public health

## Topics for Germany

In [25]:
deModel = countryTopicsOverTime('DE')
deModel[0]

[(0,
  '0.015*"coronavirus" + 0.010*"people" + 0.009*"new" + 0.008*"country" + 0.008*"health" + 0.008*"cases" + 0.007*"government" + 0.006*"covid19" + 0.006*"pandemic" + 0.005*"world"'),
 (1,
  '0.000*"coronavirus" + 0.000*"people" + 0.000*"cases" + 0.000*"country" + 0.000*"health" + 0.000*"pandemic" + 0.000*"government" + 0.000*"new" + 0.000*"world" + 0.000*"virus"'),
 (2,
  '0.003*"____" + 0.003*"talks" + 0.002*"merkel" + 0.002*"idlib" + 0.002*"syria" + 0.002*"resolution" + 0.002*"court" + 0.002*"internationaldpacom" + 0.002*"syrian" + 0.002*"recovery"'),
 (3,
  '0.019*"coronavirus" + 0.012*"people" + 0.010*"cases" + 0.009*"country" + 0.009*"health" + 0.008*"new" + 0.008*"covid19" + 0.007*"government" + 0.007*"germany" + 0.006*"pandemic"'),
 (4,
  '0.000*"coronavirus" + 0.000*"people" + 0.000*"health" + 0.000*"cases" + 0.000*"covid19" + 0.000*"country" + 0.000*"new" + 0.000*"government" + 0.000*"virus" + 0.000*"germany"')]

### Topics for Germany

1. Topic 0: Coronavirus, Public Health and New Cases
2. Topic 1: Coronavirus, Public Health and New Cases
3. Topic 2: Syrian Crisis Discourse
4. Topic 3: Coronavirus, Public Health and New Cases
5. Topic 4: Coronavirus, Public Health and New Cases

In [26]:
ldaNA = deModel[1]
corpusNA = deModel[2]
transformedCorpus = ldaNA[corpusNA]
for tc in transformedCorpus:
    print(tc)

[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(0, 0.99635774)]
[(3, 0.99998087)]
[(3, 0.99992)]
[(0, 0.994165)]
[(0, 0.012681768), (3, 0.9871532)]
[(2, 0.450205), (3, 0.5494175)]
[(2, 0.7785722), (3, 0.21946736)]


* Dec 2019: Coronavirus, Public Health and New Cases
* Jan 2020: Coronavirus, Public Health and New Cases
* Feb 2020: Coronavirus, Public Health and New Cases
* Mar 2020: Syrian Crisis Discourse
* Apr 2020: Syrian Crisis Discourse
* May 2020: Coronavirus, Public Health and New Cases
* Jun 2020: Syrian Crisis Discourse
* Jul 2020: Coronavirus, Public Health and New Cases
* Aug 2020: Coronavirus, Public Health and New Cases

## Topics for Iran

In [30]:
irModel = countryTopicsOverTime('IR')
irModel[0]

[(0,
  '0.019*"coronavirus" + 0.017*"iran" + 0.015*"country" + 0.012*"health" + 0.012*"people" + 0.010*"covid19" + 0.009*"iranian" + 0.008*"countries" + 0.008*"virus" + 0.008*"patients"'),
 (1,
  '0.033*"iran" + 0.027*"coronavirus" + 0.022*"sanctions" + 0.015*"health" + 0.015*"people" + 0.014*"country" + 0.013*"iranian" + 0.011*"medical" + 0.010*"world" + 0.010*"virus"'),
 (2,
  '0.000*"iran" + 0.000*"coronavirus" + 0.000*"country" + 0.000*"sanctions" + 0.000*"health" + 0.000*"people" + 0.000*"iranian" + 0.000*"ministry" + 0.000*"covid19" + 0.000*"countries"'),
 (3,
  '0.025*"iran" + 0.024*"coronavirus" + 0.015*"health" + 0.014*"sanctions" + 0.014*"country" + 0.012*"virus" + 0.012*"people" + 0.010*"iranian" + 0.010*"countries" + 0.009*"covid19"'),
 (4,
  '0.000*"iran" + 0.000*"coronavirus" + 0.000*"health" + 0.000*"sanctions" + 0.000*"country" + 0.000*"iranian" + 0.000*"people" + 0.000*"medical" + 0.000*"world" + 0.000*"countries"')]

### Topics for Iran

1. Topic 0: Coronavirus Patients, Public Health
2. Topic 1: Coronavirus, Public Health and Sanctions
3. Topic 2: Coronavirus, Government Sanctions
4. Topic 3: Coronavirus
5. Topic 4: Coronavirus and Public Health

In [31]:
ldaNA = irModel[1]
corpusNA = irModel[2]
transformedCorpus = ldaNA[corpusNA]
for tc in transformedCorpus:
    print(tc)

[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(1, 0.9982951)]
[(1, 0.9925809)]
[(1, 0.011291613), (3, 0.98749375)]
[(0, 0.75663614), (1, 0.03602219), (3, 0.20733318)]
[(0, 0.97995734), (3, 0.014087068)]
[(0, 0.9963793)]
[(0, 0.99472284)]


* Dec 2019: Coronavirus and Public Health
* Jan 2020: Coronavirus and Public Health
* Feb 2020: Coronavirus Patients, Public Health
* Mar 2020: Coronavirus Patients, Public Health
* Apr 2020: Coronavirus, Government Sanctions
* May 2020: Coronavirus Patients, Public Health
* Jun 2020: Coronavirus Patients, Public Health
* Jul 2020: Coronavirus Patients, Public Health
* Aug 2020: Coronavirus Patients, Public Health

## Topics for Qatar

In [32]:
qaModel = countryTopicsOverTime('QA')
qaModel[0]

[(0,
  '0.021*"coronavirus" + 0.012*"cases" + 0.012*"gmt" + 0.010*"new" + 0.010*"people" + 0.009*"health" + 0.008*"covid19" + 0.008*"country" + 0.007*"pandemic" + 0.006*"government"'),
 (1,
  '0.000*"coronavirus" + 0.000*"cases" + 0.000*"health" + 0.000*"people" + 0.000*"new" + 0.000*"virus" + 0.000*"china" + 0.000*"gmt" + 0.000*"country" + 0.000*"government"'),
 (2,
  '0.021*"coronavirus" + 0.016*"cases" + 0.013*"gmt" + 0.011*"new" + 0.010*"covid19" + 0.009*"people" + 0.009*"health" + 0.009*"country" + 0.007*"pandemic" + 0.006*"deaths"'),
 (3,
  '0.028*"coronavirus" + 0.017*"cases" + 0.015*"gmt" + 0.013*"new" + 0.012*"people" + 0.012*"health" + 0.009*"country" + 0.008*"china" + 0.008*"virus" + 0.008*"government"'),
 (4,
  '0.018*"beirut" + 0.015*"gmt" + 0.011*"lebanon" + 0.010*"people" + 0.010*"blast" + 0.010*"lebanese" + 0.010*"explosion" + 0.006*"port" + 0.006*"aid" + 0.005*"country"')]

### Topics for Qatar

1. Topic 0: Coronavirus and new cases
2. Topic 1: Coronavirus and Public Health, China
3. Topic 2: Coronavirus and Public Health
4. Topic 3: Coronavirus and Public Health
5. Topic 4: Lebanon Blast

In [33]:
ldaNA = qaModel[1]
corpusNA = qaModel[2]
transformedCorpus = ldaNA[corpusNA]
for tc in transformedCorpus:
    print(tc)

[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(3, 0.99988484)]
[(3, 0.99983335)]
[(3, 0.9943365)]
[(0, 0.99678725)]
[(0, 0.99585813)]
[(2, 0.9893172)]
[(0, 0.9937935)]
[(0, 0.044973157), (3, 0.018431747), (4, 0.93522096)]


* Dec 2019: Coronavirus and Public Health
* Jan 2020: Coronavirus and Public Health
* Feb 2020: Coronavirus and Public Health
* Mar 2020: Coronavirus and Public Health
* Apr 2020: Coronavirus and new cases
* May 2020: Coronavirus and new cases
* Jun 2020: Coronavirus and Public Health, China
* Jul 2020: Coronavirus and new cases
* Aug 2020: Lebanon Blast

## Topics for Russia

In [34]:
ruModel = countryTopicsOverTime('RU')
ruModel[0]

[(0,
  '0.017*"coronavirus" + 0.013*"russian" + 0.013*"covid19" + 0.009*"pandemic" + 0.009*"russia" + 0.009*"humanitarian" + 0.008*"people" + 0.007*"health" + 0.006*"cases" + 0.006*"medical"'),
 (1,
  '0.014*"russian" + 0.014*"coronavirus" + 0.014*"russia" + 0.008*"covid19" + 0.007*"cases" + 0.007*"humanitarian" + 0.006*"july" + 0.005*"new" + 0.005*"people" + 0.005*"ministry"'),
 (2,
  '0.028*"coronavirus" + 0.012*"russian" + 0.012*"cases" + 0.012*"russia" + 0.010*"new" + 0.010*"country" + 0.009*"number" + 0.008*"ussia" + 0.008*"moscow" + 0.007*"june"'),
 (3,
  '0.019*"coronavirus" + 0.014*"russian" + 0.010*"russia" + 0.008*"people" + 0.007*"humanitarian" + 0.006*"cases" + 0.006*"china" + 0.006*"covid19" + 0.006*"president" + 0.006*"health"'),
 (4,
  '0.016*"coronavirus" + 0.012*"covid19" + 0.012*"russian" + 0.010*"pandemic" + 0.009*"russia" + 0.008*"humanitarian" + 0.007*"health" + 0.006*"united" + 0.006*"people" + 0.006*"country"')]

### Topics for Russia

* Topic 0: Coronavirus, new cases, humanitarian and medical support
* Topic 1: Coronavirus, new cases, humanitarian support
* Topic 2: Coronavirus, new cases
* Topic 3: Coronavirus, new cases, China, President
* Topic 4: Coronavirus, US, Humanitarian

In [35]:
ldaNA = ruModel[1]
corpusNA = ruModel[2]
transformedCorpus = ldaNA[corpusNA]
for tc in transformedCorpus:
    print(tc)

[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(2, 0.36665168), (3, 0.63090914)]
[(3, 0.99995697)]
[(0, 0.13252148), (3, 0.86708075)]
[(0, 0.9998452)]
[(4, 0.991742)]
[(0, 0.42017758), (2, 0.56834036)]
[(1, 0.9967667)]
[(2, 0.9999439)]


* Dec 2019: Coronavirus, new cases
* Jan 2020: Coronavirus, new cases, China, President
* Feb 2020: Coronavirus, new cases, China, President
* Mar 2020: Coronavirus, new cases, China, President
* Apr 2020: Coronavirus, new cases, humanitarian and medical support
* May 2020: Coronavirus, US, Humanitarian
* Jun 2020: Coronavirus, new cases, humanitarian and medical support
* Jul 2020: Coronavirus, new cases, humanitarian support
* Aug 2020: Coronavirus, new cases

## Topics for Saudi Arabia

In [36]:
saModel = countryTopicsOverTime('SA')
saModel[0]

[(0,
  '0.010*"humanitarian" + 0.009*"yemen" + 0.009*"people" + 0.009*"health" + 0.008*"coronavirus" + 0.008*"covid19" + 0.008*"saudi" + 0.007*"aid" + 0.007*"pandemic" + 0.007*"international"'),
 (1,
  '0.000*"coronavirus" + 0.000*"covid19" + 0.000*"health" + 0.000*"humanitarian" + 0.000*"yemen" + 0.000*"people" + 0.000*"pandemic" + 0.000*"international" + 0.000*"countries" + 0.000*"world"'),
 (2,
  '0.012*"coronavirus" + 0.011*"health" + 0.009*"people" + 0.008*"covid19" + 0.008*"humanitarian" + 0.007*"government" + 0.006*"yemen" + 0.006*"virus" + 0.006*"world" + 0.006*"saudi"'),
 (3,
  '0.010*"coronavirus" + 0.009*"covid19" + 0.009*"people" + 0.008*"health" + 0.008*"pandemic" + 0.007*"humanitarian" + 0.006*"world" + 0.006*"countries" + 0.006*"international" + 0.006*"medical"'),
 (4,
  '0.000*"coronavirus" + 0.000*"health" + 0.000*"people" + 0.000*"covid19" + 0.000*"country" + 0.000*"medical" + 0.000*"humanitarian" + 0.000*"pandemic" + 0.000*"international" + 0.000*"countries"')]

### Topics for Saudi Arabia

* Topic 0: Yemen, Coronavirus, Pandemic, Humanitarian Aid
* Topic 1: Coronavirus, Yemen
* Topic 2: Coronavirus, Yemen
* Topic 3: Coronavirus, Public Health
* Topic 4: Coronavirus, Public Health, Medical and Humanatarian Support

In [37]:
ldaNA = saModel[1]
corpusNA = saModel[2]
transformedCorpus = ldaNA[corpusNA]
for tc in transformedCorpus:
    print(tc)

[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(0, 0.2), (1, 0.2), (2, 0.2), (3, 0.2), (4, 0.2)]
[(2, 0.99932766)]
[(2, 0.9994958)]
[(3, 0.99836344)]
[(2, 0.99792176)]
[(0, 0.9988792)]
[(3, 0.9999199)]
[(0, 0.99978703)]


* Dec 2019: Coronavirus, Yemen
* Jan 2020: Coronavirus, Yemen
* Feb 2020: Coronavirus, Yemen
* Mar 2020: Coronavirus, Yemen
* Apr 2020: Coronavirus, Public Health
* May 2020: Coronavirus, Yemen
* Jun 2020: Yemen, Coronavirus, Pandemic, Humanitarian Aid
* Jul 2020: Coronavirus, Public Health
* Aug 2020: Yemen, Coronavirus, Pandemic, Humanitarian Aid

## Conclusion

The topics analysed using LDA topic modelling will be presented on a timeline. The limitation of this exercise is the consolidation of articles from entire month into a single article. Obviously, articles talk about different topics and LDA will give us the most prominent topic. 

This data is represented on the website using a timeline.