## CorEx Topic Modeling to detect recurrent themes in the claims text

## Imports

In [11]:
import pandas as pd
import sys
import spacy
# !{sys.executable} -m spacy download en

import gensim
from corextopic import corextopic as ct

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text


## Create claims dataset 

In [12]:
# CoAID publishes its claim CSVs as dated snapshot folders in the project's GitHub repository.
COAID_BASE_URL = "https://raw.githubusercontent.com/cuilimeng/CoAID/master"

# true claims
REAL_CLAIM_SNAPSHOTS = ["05-01-2020", "07-01-2020", "09-01-2020", "11-01-2020"]

# fake claims- none for 9/1 and 11/1 additions
FAKE_CLAIM_SNAPSHOTS = ["05-01-2020", "07-01-2020"]


def load_coaid_claims(kind, snapshots):
    """Download and stack the CoAID claim files of one kind.

    Parameters
    ----------
    kind : str
        Either "Real" or "Fake"; inserted into the file name
        ``Claim<kind>COVID-19.csv``.
    snapshots : iterable of str
        Snapshot folder names (e.g. "05-01-2020") to download.

    Returns
    -------
    pandas.DataFrame
        All requested snapshots concatenated row-wise with a fresh
        0..n-1 index, so no index label is duplicated.
    """
    frames = [
        pd.read_csv("{}/{}/Claim{}COVID-19.csv".format(COAID_BASE_URL, snapshot, kind))
        for snapshot in snapshots
    ]
    return pd.concat(frames, ignore_index = True)


# Tag every row with its ground-truth label before the two sets are merged.
real_claims = load_coaid_claims("Real", REAL_CLAIM_SNAPSHOTS).assign(eval = 'real')
fake_claims = load_coaid_claims("Fake", FAKE_CLAIM_SNAPSHOTS).assign(eval = 'fake')

# ignore_index avoids repeated row labels in the combined frame
claims = pd.concat([real_claims, fake_claims], axis = 0, ignore_index = True)


## Text Pre-Processing

In [13]:
# Pull the claim titles out of the frame as a plain Python list
data = claims["title"].tolist()


# Tokenize each entry- every claim becomes a list of word tokens
def sent_to_words(sentences):
    """Lazily yield one token list per entry of ``sentences``.

    Each entry is coerced to ``str`` and passed through
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (per gensim's
    documentation this strips accent marks from the tokens).
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(raw), deacc = True) for raw in sentences)

# Materialize the tokenizer's generator: one token list per claim
data_words = [tokens for tokens in sent_to_words(data)]


# Only keep certain parts of speech
def lemmatization(texts, allowed_postags = ('NOUN', 'ADJ', 'VERB', 'ADV'), nlp_model = None):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; each token list is re-joined with spaces
        before being handed to the pipeline.
    allowed_postags : collection of str
        Coarse part-of-speech tags (``token.pos_``) to keep.
    nlp_model : callable, optional
        Pipeline that maps a string to an iterable of tokens exposing
        ``lemma_`` and ``pos_``. When omitted, the notebook-level ``nlp``
        object is looked up at call time.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document.
    """
    # Resolve the pipeline lazily so the function can be defined before `nlp` exists.
    pipeline = nlp if nlp_model is None else nlp_model

    texts_out = []
    for sent in texts:
        doc = pipeline(' '.join(sent))
        # '-PRON-' is spaCy 2.x's placeholder lemma for pronouns; drop it outright
        # instead of emitting an empty string, which left stray double spaces.
        lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and token.lemma_ != '-PRON-'
        ]
        texts_out.append(' '.join(lemmas))
    return texts_out


# Load the small English pipeline without the parser and NER components,
# which the lemmatization step does not use.
# The 'en' shortcut name was removed in spaCy v3; the full package name
# 'en_core_web_sm' is used so the cell loads on current spaCy releases.
nlp = spacy.load('en_core_web_sm', disable = ['parser', 'ner'])

# Keep nouns, verbs, adjectives and adverbs
data_lemmatized = lemmatization(data_words, allowed_postags = ['NOUN', 'VERB', 'ADJ', 'ADV'])

print(data_lemmatized[:2])


['how large meeting event need order mass gathering', 'recommend international mass gathering cancel']


## Vectorizing the text

In [14]:
# Vectorizing
# Extra stop words on top of scikit-learn's built-in English list.
my_words = set(['covid', 'covid-19', 'coronavirus', 'virus', 'patient', 'sick'])

my_stop_words = text.ENGLISH_STOP_WORDS.union(my_words)

vectorizer = TfidfVectorizer(
    analyzer = 'word',
    max_df = .95, # ignore terms that appear in more than 95% of the documents
    #min_df = 0.05, # ignore terms that appear in less than 5% of the documents
    max_features = None, # default
    ngram_range = (2, 4), # features are 2- to 4-word n-grams; single words are not features
    #norm = None, # default = l2
    binary = False, # default = False
    use_idf = True, # make sure to set to True so that it's actually used
    sublinear_tf = False, # default
    # scikit-learn >= 1.2 validates this parameter and accepts only 'english',
    # a list, or None, so the frozenset is converted to a (sorted) list.
    stop_words = sorted(my_stop_words)
)


# Fit on the text of the claims and build the document-term matrix in one pass
tfidf = vectorizer.fit_transform(data_lemmatized)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()

print(len(vocab))


3682




## CorEx Topic Modeling- Unseeded Model

In [6]:
# Unseeded CorEx model with 15 hidden topics
model_15 = ct.Corex(n_hidden = 15, seed = 123, n_jobs = -1).fit(tfidf, words = vocab)

# List up to 5 n-grams per topic (no anchors), keeping entries whose score
# (second tuple field) is positive
for topic_no, ranked in enumerate(model_15.get_topics(n_words = 5), start = 1):
    kept = [entry[0] for entry in ranked if entry[1] > 0]
    print('Topic #{}: {}'.format(topic_no, ', '.join(kept)))
 

Topic #1: hand dryer, member international health regulation, member international, member international health, mask general
Topic #2: family planning, contraception family, contraception family planning, drug license treatment prevention, license treatment prevention
Topic #3: self care, position use chloroquine context, position use chloroquine, chloroquine context, chloroquine context response
Topic #4: develop severe, risk develop severe, risk develop, contraceptive method, change guidance respect malaria
Topic #5: live surface food, live surface, surface food, surface food packaging, live surface food packaging
Topic #6: drink alcohol protect, drink alcohol protect dangerous, alcohol protect dangerous, alcohol protect, spread coin banknote
Topic #7: malaria affect country, affect country, malaria affect, good household disinfectant surface, good household disinfectant
Topic #8: wash fruit vegetable, wash fruit, prevent treat, vegetable time, wash fruit vegetable time
Topic #9: dr

In [15]:
# Unseeded CorEx model with 20 hidden topics (the count judged to work well)
model_20 = ct.Corex(n_hidden = 20, seed = 123, n_jobs = -1).fit(tfidf, words = vocab)

# List up to 5 n-grams per topic, keeping entries whose score
# (second tuple field) is positive
for topic_no, ranked in enumerate(model_20.get_topics(n_words = 5), start = 1):
    kept = [entry[0] for entry in ranked if entry[1] > 0]
    print('Topic #{}: {}'.format(topic_no, ', '.join(kept)))
    

Topic #1: address ventilation, address ventilation context, ventilation context, food grocery delivery, food grocery delivery safe
Topic #2: good household disinfectant, household disinfectant surface, good household disinfectant surface, good household, disinfectant surface
Topic #3: develop severe, risk develop, risk develop severe, severe symptom, concerned spread
Topic #4: safely time, grocery shop safely time, shop safely, shop safely time, grocery shop safely
Topic #5: chloroquine context response, use chloroquine, chloroquine context, use chloroquine context, use chloroquine context response
Topic #6: right duty, medical mask, old people, surface food packaging, food packaging
Topic #7: preventive therapy maintain, preventive therapy, therapy maintain, recommend preventive, recommend preventive therapy maintain
Topic #8: incubation period, mask general public, general public, mask general, long incubation period
Topic #9: affect country, malaria affect country, malaria affect, c

In [51]:
# Unseeded CorEx model with 21 hidden topics
model_21 = ct.Corex(n_hidden = 21, seed = 123, n_jobs = -1).fit(tfidf, words = vocab)

# List up to 5 n-grams per topic, keeping entries whose score
# (second tuple field) is positive
for topic_no, ranked in enumerate(model_21.get_topics(n_words = 5), start = 1):
    kept = [entry[0] for entry in ranked if entry[1] > 0]
    print('Topic #{}: {}'.format(topic_no, ', '.join(kept)))
    

Topic #1: assess risk, tuberculosis spread way, tuberculosis spread, spread way, medical face mask
Topic #2: likelihood shoe spread low, likelihood shoe spread, likelihood shoe, shoe spread, shoe spread low
Topic #3: thermal scanner detect, scanner detect, medical mask, antiretroviral use, thermal scanner
Topic #4: contraceptive method, respect malaria diagnosis treatment, respect malaria diagnosis, guidance respect, guidance respect malaria
Topic #5: wash fruit vegetable, wash fruit, fruit vegetable, fruit vegetable time, vegetable time
Topic #6: risk develop severe, develop severe, risk develop, vaccine drug treatment, vaccine drug
Topic #7: newborn baby, international health regulation, health regulation, alcohol protect, alcohol protect dangerous
Topic #8: pregnant woman, woman test, pregnant woman test, affect country report, affect country report case
Topic #9: help prevent, eat garlic, eat garlic help, eat garlic help prevent, garlic help
Topic #10: preventive therapy maintain, 

In [52]:
# Unseeded CorEx model with 22 hidden topics
model_22 = ct.Corex(n_hidden = 22, seed = 123, n_jobs = -1).fit(tfidf, words = vocab)

# List up to 5 n-grams per topic, keeping entries whose score
# (second tuple field) is positive
for topic_no, ranked in enumerate(model_22.get_topics(n_words = 5), start = 1):
    kept = [entry[0] for entry in ranked if entry[1] > 0]
    print('Topic #{}: {}'.format(topic_no, ', '.join(kept)))
    

Topic #1: safely time, grocery shop safely time, grocery shop safely, grocery shop, shop safely time
Topic #2: health authority, assess risk, kill new, use pandemic, measure people
Topic #3: household disinfectant surface, household disinfectant, good household disinfectant, good household disinfectant surface, disinfectant surface
Topic #4: live surface food, live surface, additional special, special measure need context, special measure need
Topic #5: support malaria affect country, affect country context, support malaria affect, country context, malaria affect country context
Topic #6: prevent cure, survive surface, respect malaria, change guidance respect malaria, change guidance respect
Topic #7: pregnant woman, woman test, pregnant woman test, vaccine pneumonia, vaccine pneumonia protect
Topic #8: pregnancy childbirth, available pregnancy, care available, care available pregnancy, available pregnancy childbirth
Topic #9: preventive therapy, preventive therapy maintain, recommend 

In [53]:
# Unseeded CorEx model with 23 hidden topics
model_23 = ct.Corex(n_hidden = 23, seed = 123, n_jobs = -1).fit(tfidf, words = vocab)

# List up to 5 n-grams per topic, keeping entries whose score
# (second tuple field) is positive
for topic_no, ranked in enumerate(model_23.get_topics(n_words = 5), start = 1):
    kept = [entry[0] for entry in ranked if entry[1] > 0]
    print('Topic #{}: {}'.format(topic_no, ', '.join(kept)))
    

Topic #1: risk transportation, risk transportation school, transportation school, smokeless tobacco, sport tournament
Topic #2: contraceptive method, grocery delivery, shop safely, shop safely time, delivery safe
Topic #3: risk develop, risk develop severe, develop severe, likelihood shoe spread, spread low
Topic #4: malaria affect, malaria affect country, affect country, malaria affect country report, support malaria
Topic #5: recommend preventive, recommend preventive therapy, therapy maintain, preventive therapy, preventive therapy maintain
Topic #6: chloroquine context, position use chloroquine, position use chloroquine context, context response, chloroquine context response
Topic #7: guidance respect malaria diagnosis, guidance respect, guidance respect malaria, change guidance, change guidance respect
Topic #8: prevent treat, treat new, prevent treat new, antibiotic effective, antibiotic effective prevent
Topic #9: household disinfectant, good household disinfectant surface, good

In [54]:
# Unseeded CorEx model with 24 hidden topics
model_24 = ct.Corex(n_hidden = 24, seed = 123, n_jobs = -1).fit(tfidf, words = vocab)

# List up to 5 n-grams per topic (no anchors), keeping entries whose score
# (second tuple field) is positive
for topic_no, ranked in enumerate(model_24.get_topics(n_words = 5), start = 1):
    kept = [entry[0] for entry in ranked if entry[1] > 0]
    print('Topic #{}: {}'.format(topic_no, ', '.join(kept)))
    

Topic #1: garlic help, garlic help prevent, eat garlic, eat garlic help prevent, eat garlic help
Topic #2: protect dangerous, alcohol protect dangerous, alcohol protect, drink alcohol protect, drink alcohol protect dangerous
Topic #3: key consideration, workplace risk assessment, risk assessment, workplace risk, country work
Topic #4: prevent treat, treat new, prevent treat new, antibiotic effective prevent treat, antibiotic effective
Topic #5: position use, position use chloroquine, use chloroquine context response, use chloroquine context, use chloroquine
Topic #6: wash fruit vegetable time, fruit vegetable time, vegetable time, good household disinfectant, disinfectant surface
Topic #7: particularly concerned spread malaria, particularly concerned, spread malaria, spread malaria affect, spread malaria affect area
Topic #8: risk develop, develop severe, risk develop severe, severe symptom, vitamin mineral supplement cure
Topic #9: likelihood shoe, likelihood shoe spread low, spread l

In [55]:
# Unseeded CorEx model with 25 hidden topics
model_25 = ct.Corex(n_hidden = 25, seed = 123, n_jobs = -1).fit(tfidf, words = vocab)

# List up to 5 n-grams per topic (no anchors), keeping entries whose score
# (second tuple field) is positive
for topic_no, ranked in enumerate(model_25.get_topics(n_words = 5), start = 1):
    kept = [entry[0] for entry in ranked if entry[1] > 0]
    print('Topic #{}: {}'.format(topic_no, ', '.join(kept)))
    

Topic #1: spread coin banknote, coin banknote, spread coin, food business, health condition
Topic #2: right duty, breastfeed baby directly, baby directly, breastfeed baby, unwell breastfeed
Topic #3: healthcare worker, planning information, family planning information, family planning information service, people access contraception family
Topic #4: health regulation, international health regulation, woman test, pregnant woman test, international health
Topic #5: live surface food, live surface, incubation period, surface food, guidance respect malaria diagnosis
Topic #6: contact tracing, context response, position use chloroquine context, use chloroquine context response, use chloroquine context
Topic #7: wash fruit, wash fruit vegetable, fruit vegetable time, vegetable time, wash fruit vegetable time
Topic #8: household disinfectant surface, good household disinfectant surface, disinfectant surface, household disinfectant, good household disinfectant
Topic #9: prevent treat, prevent 

### Checking total correlation (TC)
- Although TC tends to rise as the number of topics increases (it is highest for the 25-topic model), the interpretability of the topics suffers. A topic count of 20 seems to offer a good balance between TC and topic interpretability.

In [56]:
# A higher overall tc means a better model: its topics are more informative about the claims.
# Compare tc across the candidate topic counts (15, then 20 through 25) to find a count
# where tc is high but the topics are still reasonable to interpret.
for fitted_model in (model_15, model_20, model_21, model_22, model_23, model_24, model_25):
    print(fitted_model.tc)


3.0800399088135504
3.46576662139439
3.3357408892366074
3.3674525721472532
3.3050751905246285
3.420543984571386
3.5740708656114055


## Topic probabilities for each claim using topic count of 20

In [16]:
# model_20.p_y_given_x is laid out as n_docs x k_topics: one row per claim, one column per topic.
# NOTE(review): the displayed rows do not sum to 1, so each column looks like an independent
# per-topic probability rather than a softmax over topics - confirm before labelling claims.
claim_topic_probs = pd.DataFrame(data = model_20.p_y_given_x)
claim_topic_probs


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.003132,0.001405,0.010406,0.999999,0.004620,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.169631
1,0.003132,0.001405,0.999999,0.009513,0.004620,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.000522,0.001823,0.002395,0.002773,0.173437,0.169430
2,0.999999,0.001405,0.010406,0.009513,0.004620,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.001034,0.001228,0.002395,0.002773,0.175692,0.170225
3,0.005846,0.001405,0.010406,0.009513,0.999999,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.169681
4,0.999999,0.001405,0.010406,0.009513,0.006246,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.000641,0.001823,0.002395,0.002773,0.174114,0.169673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,0.003132,0.001405,0.010406,0.009513,0.999999,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.170237
514,0.003132,0.001405,0.010406,0.009513,0.004620,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.170237
515,0.003132,0.001405,0.999999,0.009513,0.007108,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.028109,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.170237
516,0.003132,0.001405,0.020789,0.999999,0.004620,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.170237


## Create full claims dataset

In [20]:
# Display the per-claim topic probabilities again before they are joined onto the claims frame.
claim_topic_probs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.003132,0.001405,0.010406,0.999999,0.004620,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.169631
1,0.003132,0.001405,0.999999,0.009513,0.004620,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.000522,0.001823,0.002395,0.002773,0.173437,0.169430
2,0.999999,0.001405,0.010406,0.009513,0.004620,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.001034,0.001228,0.002395,0.002773,0.175692,0.170225
3,0.005846,0.001405,0.010406,0.009513,0.999999,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.169681
4,0.999999,0.001405,0.010406,0.009513,0.006246,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.000641,0.001823,0.002395,0.002773,0.174114,0.169673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,0.003132,0.001405,0.010406,0.009513,0.999999,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.170237
514,0.003132,0.001405,0.010406,0.009513,0.004620,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.170237
515,0.003132,0.001405,0.999999,0.009513,0.007108,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.028109,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.170237
516,0.003132,0.001405,0.020789,0.999999,0.004620,0.011083,0.010577,0.00106,0.002529,0.000719,0.000566,0.001685,0.008207,0.00138,0.001034,0.001823,0.002395,0.002773,0.175692,0.170237


In [22]:
### Raw Claims data, real and fake

# Give both frames a fresh 0..n-1 index so the column-wise concat below pairs
# row i of the claims with row i of the topic probabilities.
claim_topic_probs = claim_topic_probs.reset_index(drop = True)
claims = claims.reset_index(drop = True)

# The join is purely positional; a row-count mismatch would otherwise pad the
# shorter frame with NaN rows silently.
assert len(claims) == len(claim_topic_probs), (
    "claims and claim_topic_probs must have the same number of rows"
)

# Whitespace-separated word count of each title (vectorized; a missing title
# yields NaN instead of raising AttributeError)
claims['word_count'] = claims['title'].str.split().str.len()

claims_data = pd.concat([claims, claim_topic_probs], axis = 1)

#claims_data.to_csv('claims_data_v2.csv')



In [24]:
# Sums
#sums = claims_data.sum()
#sums.to_csv('sums.csv')