# Correlated Topic Model (CTM) on the Web of Science dataset

In [86]:
# import libraries
import os
from time import time

import matplotlib.pyplot as plt
import nltk
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('words')
import pandas as pd
import tomotopy as tp
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from pyvis.network import Network
from sklearn.feature_extraction.text import CountVectorizer

In [87]:
import warnings
warnings.filterwarnings('ignore')

In [88]:
# Location of the Web of Science spreadsheet.
# The default is the original author's local path; set the WOS_DATA_PATH
# environment variable to run the notebook on another machine without editing it.
WOS_DATA_PATH = os.environ.get(
    'WOS_DATA_PATH',
    'C:/Users/micky/OneDrive/Desktop/Tesi DS/data/wos_data.xlsx',
)
data = pd.read_excel(WOS_DATA_PATH)

In [89]:
# Display the (rows, columns) dimensions of the loaded frame.
data.shape

(46985, 7)

In [90]:
# Preview the first rows to check the columns (labels, domain, area, keywords, abstract).
data.head()

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,0,12,12,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,(2 + 1)-dimensional non-linear optical waves t...
1,5,2,74,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,(beta-amyloid (A beta) and tau pathology becom...
2,4,7,68,Civil,Green Building,LED lighting system; PV system; Distributed l...,(D)ecreasing of energy consumption and environ...
3,1,10,26,ECE,Electric motor,NdFeB magnets; Electric motor; Electric vehic...,(Hybrid) electric vehicles are assumed to play...
4,5,43,115,Medical,Parkinson's Disease,Parkinson's disease; dyskinesia; adenosine A(...,"(L)-3,4-Dihydroxyphenylalanine ((L)-DOPA) rema..."


In [91]:
# Keep a reproducible 10% random subset of the documents (drawn without replacement).
SAMPLE_FRAC = 0.1
data = data.sample(
    frac=SAMPLE_FRAC,
    replace=False,
    random_state=0,
)
data.shape

(4698, 7)

In [92]:
# The corpus is the 'Abstract' column of the sampled frame; the other columns are not used below.
corp = data.loc[:, 'Abstract']

In [93]:
# Display the raw abstracts (pandas truncates the output to the first/last rows).
corp

34021    The clinical importance of the thyroid nodules...
29323    Purpose: The purpose of the study is to examin...
379      A double-sided liquid cooling Nd: YAG disk osc...
4070     As a fundamental optimization problem, the veh...
35682    The increasing air pollution of urban areas ca...
                               ...                        
17400    In terms of the theories and techniques of net...
42607    This study examines women's media selections w...
28117    Primary headache disorders, including migraine...
11591    Cortisol is one of the most important glucocor...
35855    The interpretation of regression models result...
Name: Abstract, Length: 4698, dtype: object

In [94]:
# lowercase
# Convert every abstract to lower case, then preview the first rows as a sanity check.
corp = corp.str.lower()
corp.head()

34021    the clinical importance of the thyroid nodules...
29323    purpose: the purpose of the study is to examin...
379      a double-sided liquid cooling nd: yag disk osc...
4070     as a fundamental optimization problem, the veh...
35682    the increasing air pollution of urban areas ca...
Name: Abstract, dtype: object

In [95]:
# Remove missing abstracts, then make sure every remaining entry is a plain str.
# Dropping *before* the cast avoids turning NaN into the literal text 'nan' and
# filtering by string comparison, which would also discard a genuine document
# whose whole text happens to be "nan".
corp = corp.dropna().astype(str)
corp

34021    the clinical importance of the thyroid nodules...
29323    purpose: the purpose of the study is to examin...
379      a double-sided liquid cooling nd: yag disk osc...
4070     as a fundamental optimization problem, the veh...
35682    the increasing air pollution of urban areas ca...
                               ...                        
17400    in terms of the theories and techniques of net...
42607    this study examines women's media selections w...
28117    primary headache disorders, including migraine...
11591    cortisol is one of the most important glucocor...
35855    the interpretation of regression models result...
Name: Abstract, Length: 4698, dtype: object

In [96]:
# remove numbers
# Replace every run of digits with a single space.
# regex=True is required: since pandas 2.0 `Series.str.replace` treats the pattern
# as a literal string by default, which would leave the text unchanged.
corp = corp.str.replace(r'[0-9]+', ' ', regex=True)
corp.head()

34021    the clinical importance of the thyroid nodules...
29323    purpose: the purpose of the study is to examin...
379      a double-sided liquid cooling nd: yag disk osc...
4070     as a fundamental optimization problem, the veh...
35682    the increasing air pollution of urban areas ca...
Name: Abstract, dtype: object

In [97]:
# remove link
# Replace anything starting with "http" up to the next whitespace with a space.
# regex=True is required: since pandas 2.0 the pattern is literal by default.
# NOTE(review): digits were already replaced by spaces in the "remove numbers" step,
# so a URL containing digits is cut at its first digit and only that leading part is
# removed here; consider running this step before the digit removal.
corp = corp.str.replace(r'http\S+', ' ', regex=True)

In [99]:
# remove special characters
# Replace every character that is not an ASCII letter, digit or space with a space.
# regex=True is required: since pandas 2.0 the pattern is literal by default,
# so the character class would never match.
corp = corp.str.replace(r'[^a-zA-Z0-9 ]', ' ', regex=True)
corp.head()

34021    the clinical importance of the thyroid nodules...
29323    purpose  the purpose of the study is to examin...
379      a double sided liquid cooling nd  yag disk osc...
4070     as a fundamental optimization problem  the veh...
35682    the increasing air pollution of urban areas ca...
Name: Abstract, dtype: object

In [100]:
# remove single and double letters
# Delete words of one or two characters together with the whitespace that follows them.
# regex=True is required: since pandas 2.0 the pattern is literal by default.
# NOTE(review): the pattern needs a trailing whitespace character, so a short word at
# the very end of an abstract is not removed.
corp = corp.str.replace(r'\b\w{1,2}\s', '', regex=True)
corp.head()

34021    the clinical importance the thyroid nodules pa...
29323    purpose  the purpose the study examine the ass...
379      double sided liquid cooling  yag disk oscillat...
4070     fundamental optimization problem  the vehicle ...
35682    the increasing air pollution urban areas cause...
Name: Abstract, dtype: object

In [101]:
# remove extra spaces
# Collapse every run of spaces into a single space.
# regex=True is required: since pandas 2.0 the pattern ' +' would be searched
# literally (a space followed by a plus sign) and never match.
corp = corp.str.replace(r' +', ' ', regex=True)
corp.head()

34021    the clinical importance the thyroid nodules pa...
29323    purpose the purpose the study examine the asso...
379      double sided liquid cooling yag disk oscillato...
4070     fundamental optimization problem the vehicle r...
35682    the increasing air pollution urban areas cause...
Name: Abstract, dtype: object

In [102]:
# Second-level subsample: a reproducible 10% draw (without replacement) from the cleaned corpus.
corp_subsub = corp.sample(
    frac=0.1,
    random_state=0,
    replace=False,
)

In [104]:
# stopwords + tokenization
# Split each abstract on whitespace and drop English stopwords.
stop = stopwords.words('english')
# Set lookup is O(1) per token; testing membership in the list scanned it for every word.
stop_lookup = frozenset(stop)
# NOTE: this cell is not idempotent; re-running it would tokenize the string form of the lists.
corp_subsub = corp_subsub.apply(
    lambda text: [token for token in str(text).split() if token not in stop_lookup]
)

In [105]:
# stemming
# Reduce every token of every document to its Snowball (English) stem.
stemmer = SnowballStemmer("english")
corp_subsub = corp_subsub.apply(lambda tokens: list(map(stemmer.stem, tokens)))

In [106]:
# Load the stemmed token lists of the sub-subsample into a tomotopy corpus, one document each.
corpus_sub = tp.utils.Corpus()
for tokens in corp_subsub.tolist():
    corpus_sub.add_doc(tokens)

In [107]:
# stopwords + tokenization
# Split each abstract on whitespace and drop English stopwords.
stop = stopwords.words('english')
# Set lookup is O(1) per token; testing membership in the list scanned it for every word.
stop_lookup = frozenset(stop)
# NOTE: this cell is not idempotent; re-running it would tokenize the string form of the lists.
corp = corp.apply(
    lambda text: [token for token in str(text).split() if token not in stop_lookup]
)

In [108]:
# stemming
# Reduce every token of every document to its Snowball (English) stem.
stemmer = SnowballStemmer("english")
corp = corp.apply(lambda tokens: list(map(stemmer.stem, tokens)))

In [109]:
# Load the stemmed token lists of the full sample into a tomotopy corpus, one document each.
corpus = tp.utils.Corpus()
for tokens in corp.tolist():
    corpus.add_doc(tokens)

### Run the CTM Model

In [131]:
# Correlated Topic Model with 10 topics on the full sample.
# min_df=5 and rm_top=5 are passed to tomotopy to prune the vocabulary
# (the five removed top words are printed below).
model = tp.CTModel(
    tw=tp.TermWeight.ONE,
    min_df=5,
    rm_top=5,
    k=10,
    corpus=corpus,
    seed=999,
)
model.burn_in = 10
# Zero training iterations: presumably only initialises the model so the
# statistics printed below are available - confirm against the tomotopy docs.
model.train(0)

summary = 'Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(model.docs),
    len(model.used_vocabs),
    model.num_words,
)
print(summary)
print('Removed Top words: ', *model.removed_top_words)

Num docs:4698, Num Vocabs:5738, Total Words:512493
Removed Top words:  use studi patient result system


In [132]:
# Model training
# Run 100 training iterations and report the elapsed wall-clock time.
start = time()
model.train(100)
elapsed = time() - start
print("done in %0.3fs." % elapsed)

done in 22.152s.


In [136]:
# visualize top 20 words per topic
# The heading is built from the same constant passed to get_topic_words, so the
# printed label can no longer disagree with the number of words actually shown
# (it previously said "Top 30" while listing 20).
TOP_WORDS_PER_TOPIC = 20
for k in range(model.k):
    print('Top {} words of topic #{}'.format(TOP_WORDS_PER_TOPIC, k))
    print(model.get_topic_words(k, top_n=TOP_WORDS_PER_TOPIC))
    print('\n')

Top 30 words of topic #0
[('associ', 0.03428752347826958), ('health', 0.028298988938331604), ('risk', 0.023625586181879044), ('gene', 0.022216519340872765), ('age', 0.018787790089845657), ('children', 0.018294617533683777), ('particip', 0.018083257600665092), ('protein', 0.016063595190644264), ('social', 0.015946172177791595), ('factor', 0.014537105336785316), ('intervent', 0.014043932780623436), ('disord', 0.014020447619259357), ('care', 0.013832572847604752), ('examin', 0.013644697144627571), ('cognit', 0.011718972586095333), ('year', 0.011437159031629562), ('individu', 0.011413674801588058), ('role', 0.01108489278703928), ('self', 0.011037923395633698), ('parent', 0.0108970170840621)]


Top 30 words of topic #1
[('propos', 0.027388697490096092), ('comput', 0.0214783176779747), ('paper', 0.019560232758522034), ('problem', 0.018511977046728134), ('design', 0.018333550542593002), ('model', 0.017664451152086258), ('algorithm', 0.017017655074596405), ('simul', 0.014831929467618465), ('ba

### CTM visualization

In [134]:
# Visualize results and correlations between topics in a graph
g = Network(width=800, height=800, font_color="#333")

# Edge threshold: flatten the topic-correlation matrix and sort it ascending.
correl = model.get_correlations().reshape([-1])
correl.sort()
# Keep roughly a tenth of the k*(k-1) off-diagonal entries ...
top_tenth = model.k * (model.k - 1) // 10
# ... after skipping the model.k largest values (presumably the diagonal
# self-correlations - confirm). `top_tenth` now holds the cut-off correlation.
top_tenth = correl[-model.k - top_tenth]

for k in range(model.k):
    label = "#{}".format(k)
    title = ' '.join(word for word, _ in model.get_topic_words(k, top_n=20))
    print('Topic', label, title)
    g.add_node(k, label=label, title=title, shape='ellipse')
    # Compare topic k with every earlier topic 0..k-1 (their nodes already exist).
    # range(k) fixes an off-by-one: range(k - 1) never examined the pair (k, k-1),
    # so an edge between neighbouring topic ids could never be drawn.
    for other, correlation in zip(range(k), model.get_correlations(k)):
        if correlation < top_tenth:
            continue
        g.add_edge(k, other, value=float(correlation), title='{:.02}'.format(correlation))

g.barnes_hut(gravity=-1000, spring_length=20)
g.show_buttons()
g.show("topic_network.html")

Topic #0 associ health risk gene age children particip protein social factor intervent disord care examin cognit year individu role self parent
Topic #1 propos comput paper problem design model algorithm simul base structur solut applic techniqu data effici injuri oper field order complex
Topic #2 present two water differ approach imag condit high area construct reserv technolog right ltd distribut measur project compon pattern properti
Topic #3 base model process power manag materi design perform data cost work obtain network consid new practic state tool achiev paper
Topic #4 cell treatment diseas group express clinic conclus symptom day therapi cancer signific report regul infect drug acid depress outcom adult
Topic #5 control show perform energi detect generat optim paramet analysi select larg electr rang flow test process determin various build novel
Topic #6 level effect activ howev may posit women increas test year specif experi total stress common influenc relationship subject 

### CTM on Sub-Subsampled data

In [112]:
# Same CTM configuration as for the full sample, fitted on the sub-subsampled corpus.
# NOTE: this rebinds `model`, so the model trained on the full sample is no longer
# reachable after this cell runs.
model = tp.CTModel(
    tw=tp.TermWeight.ONE,
    min_df=5,
    rm_top=5,
    k=10,
    corpus=corpus_sub,
    seed=999,
)
model.burn_in = 10
# Zero training iterations: presumably only initialises the model so the
# statistics printed below are available - confirm against the tomotopy docs.
model.train(0)

summary = 'Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(model.docs),
    len(model.used_vocabs),
    model.num_words,
)
print(summary)
print('Removed Top words: ', *model.removed_top_words)

Num docs:470, Num Vocabs:1491, Total Words:42176
Removed Top words:  use studi patient result control


In [113]:
# Model training
# Run 100 training iterations on the sub-subsample model and report the elapsed wall-clock time.
start = time()
model.train(100)
elapsed = time() - start
print("done in %0.3fs." % elapsed)

done in 2.045s.
