In [1]:
import os
from top2vec import Top2Vec
import pandas as pd
import nltk
stop=nltk.corpus.stopwords.words('english')
import spacy
nlp=spacy.load('en_core_web_sm')
from nltk import word_tokenize
import re
from collections import Counter
import umap
from sklearn.datasets import load_digits
import numpy as np
import plotly.express as px

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


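To run the code, install the following packages:
- `pip install pandas`
- `pip install numpy`
- `pip install top2vec`
- `pip install nltk`
- `pip install spacy`
- `pip install umap-learn` (provides the `umap` module)
- `pip install scikit-learn` (provides the `sklearn` module)
- `pip install plotly`

`os`, `re` and `collections` are part of the Python standard library and do not need to be installed. The notebook also needs the NLTK stop-word list (`nltk.download('stopwords')`) and the spaCy English model (`python -m spacy download en_core_web_sm`).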

The code is written for Python 3.10; some of the packages may already be installed if you use the Anaconda distribution.

The dataset used in this class comes from https://github.com/owid/covid-19-data; I sampled 15,000 rows from it.

In [2]:
# Load the sampled dataset from a CSV file in the working directory into a pandas DataFrame.
data=pd.read_csv('small-covid.csv')

In [3]:
# (number of rows, number of columns) of the loaded frame
data.shape

(15000, 4)

In [4]:
# Column labels available in the loaded frame
data.columns

Index(['title', 'abstract', 'doi', 'authors'], dtype='object')

In [5]:
# Preview the first five rows
data.head(5)

Unnamed: 0,title,abstract,doi,authors
0,Evaluation of a Multiplex Real-Time Reverse Tr...,The ability to rapidly diagnose influenza viru...,http://dx.doi.org/10.1128/JCM.43.2.589-595.2005,"['Hindiyeh, Musa', 'Levy, Virginia', 'Azar, Ro..."
1,Genomics of Emerging Infectious Disease: A PLo...,,http://dx.doi.org/10.1371/journal.pbio.1000224,"['Eisen, Jonathan A.', 'MacCallum, Catriona J.']"
2,Global response to pandemic flu: more research...,If and when sustained human-to-human transmiss...,http://dx.doi.org/10.1186/1478-4505-4-8,"Lim, Meng-Kin"
3,Murine Norovirus 1 (MNV1) Replication Induces ...,Protein synthesis is a tightly controlled proc...,http://dx.doi.org/10.1074/jbc.M114.602649,"['Royall, Elizabeth', 'Doyle, Nicole', 'Abdul-..."
4,"Cicadidae Periostracum, the Cast-Off Skin of C...",Parkinson's disease (PD) is characterized by d...,http://dx.doi.org/10.1155/2019/5797512,"['Lim, Hye-Sun', 'Kim, Joong-Sun', 'Moon, Byeo..."


In [6]:
# Replace missing titles/abstracts with empty strings so that the
# concatenation below never produces NaN.
data[['title', 'abstract']] = data[['title', 'abstract']].fillna('')
# One lower-cased string per paper: "<title> <abstract>".
data['text'] = (data['title'] + " " + data['abstract']).str.lower()

In [7]:
# Inspect the combined, lower-cased title + abstract column
data['text']

0        evaluation of a multiplex real-time reverse tr...
1        genomics of emerging infectious disease: a plo...
2        global response to pandemic flu: more research...
3        murine norovirus 1 (mnv1) replication induces ...
4        cicadidae periostracum, the cast-off skin of c...
                               ...                        
14995    prediction of invasion from the early stage of...
14996    lys-315 at the interfaces of diagonal subunits...
14997    a habitat-based model for the spread of hantav...
14998    smallpox virus resequencing genechips can also...
14999    impact of imitation processes on the effective...
Name: text, Length: 15000, dtype: object

In [8]:
def lemmatization(dataframe: pd.DataFrame, stopwords=None, nlp_model=None) -> pd.DataFrame:
    """Add stop-word-free, letters-only and lemmatised versions of the ``text`` column.

    Three columns are written to ``dataframe`` in place, each derived from
    the previous one:

    * ``stopword`` - ``text`` with whitespace-separated tokens that are in the
      stop-word collection removed.
    * ``punct``    - ``stopword`` with every character outside ``a-zA-Z``
      replaced by a space and whitespace collapsed.
    * ``lemma``    - ``punct`` with every token replaced by its spaCy lemma.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame with a string column named ``text``. It is modified in place.
    stopwords : iterable of str, optional
        Tokens to drop. Defaults to the notebook-level ``stop`` list.
    nlp_model : optional
        spaCy-style pipeline exposing ``pipe(texts)`` whose tokens carry a
        ``lemma_`` attribute. Defaults to the notebook-level ``nlp`` object.

    Returns
    -------
    pd.DataFrame
        The same object that was passed in, so callers may either use the
        return value or keep using their original reference.
    """
    # A frozenset makes each membership test O(1) instead of scanning a list.
    stopword_set = frozenset(stop if stopwords is None else stopwords)
    model = nlp if nlp_model is None else nlp_model

    # Stop words are matched on raw whitespace tokens, i.e. before punctuation
    # is stripped, so a token such as "the," is not treated as a stop word.
    dataframe['stopword'] = dataframe['text'].apply(
        lambda text: ' '.join(word for word in text.split() if word not in stopword_set)
    )
    dataframe['punct'] = dataframe['stopword'].apply(
        lambda text: ' '.join(re.sub(r'[^a-zA-Z]', ' ', text).split())
    )
    # pipe() streams the texts through the pipeline in batches rather than
    # invoking the model once per row.
    dataframe['lemma'] = [
        ' '.join(token.lemma_ for token in doc)
        for doc in model.pipe(dataframe['punct'].tolist())
    ]
    return dataframe

In [9]:
# lemmatization() adds its columns to the frame it receives and returns that
# same frame, so `df` and `data` refer to one object from here on.
df=lemmatization(data)

In [14]:
# Fit a Top2Vec model on the lemmatised text, one string per paper.
# NOTE(review): this is the slow step (about two minutes according to the log
# below) — consider caching the fitted model to make re-runs cheap.
model=Top2Vec(documents=data['lemma'].tolist(), speed="learn", workers=4)

2024-02-12 14:03:31,862 - top2vec - INFO - Pre-processing documents for training
2024-02-12 14:03:34,566 - top2vec - INFO - Creating joint document/word embedding
2024-02-12 14:05:17,226 - top2vec - INFO - Creating lower dimension embedding of documents
2024-02-12 14:05:27,072 - top2vec - INFO - Finding dense areas of documents
2024-02-12 14:05:27,341 - top2vec - INFO - Finding topics


In [15]:
# Parallel sequences: topic_nums holds the topic numbers, topic_sizes the
# number of documents in the corresponding topic.
topic_sizes, topic_nums = model.get_topic_sizes()

In [16]:
# Sanity check: both sequences should have the same length (one entry per topic).
print(len(topic_sizes), len(topic_nums)) #186 186

186 186


In [17]:
# Build two lookups keyed by document id:
#   id_dic   : document id -> score returned for the document within its topic
#   topic_id : document id -> topic number
id_dic = {}
topic_id = {}
for topic_num, topic_size in zip(topic_nums, topic_sizes):
    # Requesting `topic_size` documents is intended to retrieve every document
    # assigned to this topic. The returned document texts are not needed here.
    _documents, document_scores, document_ids = model.search_documents_by_topic(
        topic_num=topic_num, num_docs=topic_size
    )
    # `doc_id` rather than `id`: the loop variable lives at notebook scope and
    # would otherwise shadow the builtin id() for all later cells.
    for doc_id, score in zip(document_ids, document_scores):
        id_dic[doc_id] = score
        topic_id[doc_id] = topic_num

In [18]:
# Fetch words and scores for len(topic_sizes) topics, i.e. one entry per topic
# reported by get_topic_sizes().
topic_words, word_scores, topic_scores = model.get_topics(len(topic_sizes))

In [19]:
# Transpose so that each topic becomes a column and each row is a word
# position; columns are labelled with the topic number as a string.
df_words = pd.DataFrame(topic_words).T
df_words.columns = pd.Index(topic_nums).astype(str)

In [30]:
df_words[['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']].iloc[:20] #show topics 0-9 (ten topics) and the first 20 rows of words for each

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,health,chest,jat,metapneumovirus,wildlife,subgenomic,server,neutralize,irf,stochastic
1,public,patient,hubei,hospitalize,ecological,rna,alignment,mab,interferon,spatial
2,national,consolidation,wuhan,illness,ecology,minus,http,monoclonal,ifn,empirical
3,policy,radiograph,covid,parainfluenza,ecosystem,synthesis,annotation,epitope,rig,epidemic
4,security,icu,jats,child,habitat,rnas,algorithm,neutralizing,antagonize,simulation
5,stakeholder,ct,mainland,outpatient,spillover,untranslated,dataset,antibodie,tbk,network
6,collaboration,bilateral,estimate,enrol,zoonosis,replicase,computational,mabs,innate,assume
7,strengthen,admission,feb,rhinovirus,zoonotic,strand,www,fab,ifns,estimate
8,international,pneumonia,february,sari,trade,di,web,immunogen,signal,probability
9,agenda,radiological,china,ili,reservoir,template,freely,neutralization,isgs,dynamic


Top2Vec calculates the cosine similarity between each topic vector and each document vector. Document and word vectors are learned jointly in the same embedding space (see the "Creating joint document/word embedding" log line above), and each topic vector is the centroid of a dense cluster of document vectors; a topic's words are the words whose vectors lie closest to that centroid. Because topics and documents live in the same space, every document can in principle be compared with multiple topics, and the cosine similarity between a topic vector and a document vector shows how relevant the document is to that topic. Cosine similarity is chosen because it is a normalized measure of the angle between two vectors in a multidimensional space, i.e. a geometric similarity that does not depend on vector length. (Jaccard similarity is another similarity measure, but it is designed for sets or binary/boolean data and is hard to apply to dense real-valued vectors. KL-divergence cannot be applied either, because it is defined for probability distributions and the vector components can be negative.)

In [21]:
# The keys of both dictionaries are document ids, which are looked up against
# the row index of df.
df['topic_id']=df.index.map(topic_id) #map topic id to each document
df['topic_score']=df.index.map(id_dic) #map topic score to each document

In [22]:
# (topic id, number of documents) pairs for the ten most frequent topic ids
Counter(df['topic_id']).most_common(10) #top 10 topics

[(0, 457),
 (1, 317),
 (2, 316),
 (3, 272),
 (4, 264),
 (5, 260),
 (6, 258),
 (7, 252),
 (8, 251),
 (9, 239)]

For each paper, the topic with the highest cosine similarity is selected as its academic discipline. The selected topic and its similarity are recorded in the 'topic_id' and 'topic_score' columns, respectively.

In [23]:
# Preview the first ten papers with their assigned topic id and score
df[['text', 'topic_id', 'topic_score']].head(10)

Unnamed: 0,text,topic_id,topic_score
0,evaluation of a multiplex real-time reverse tr...,16,0.586479
1,genomics of emerging infectious disease: a plo...,6,0.275124
2,global response to pandemic flu: more research...,66,0.425031
3,murine norovirus 1 (mnv1) replication induces ...,94,0.614587
4,"cicadidae periostracum, the cast-off skin of c...",44,0.478137
5,recurrent wheezy bronchitis and viral respirat...,30,0.494299
6,in vivo transmission studies of ‘candidatus my...,59,0.492907
7,clinical utility of a near patient care microa...,83,0.477987
8,hantaviruses and tnf-alpha act synergistically...,160,0.733135
9,canine respiratory coronavirus employs caveoli...,54,0.548842


This analysis relies on the vector representation of the documents; the topic vectors are computed in the same embedding space. For the visualization, I reduce the 300-dimensional document vectors to two dimensions so that each paper can be plotted as a point in 2-D space. I use the Uniform Manifold Approximation and Projection (UMAP) algorithm for this dimensionality reduction.

In [24]:
# Store each document's embedding as a plain Python list, one list per row.
df['vector']=model.document_vectors.tolist()

In [26]:
df['vector'] #inspect the stored vector representation of each document (one list of floats per row)

0        [-0.05479038879275322, -0.010598486289381981, ...
1        [0.03230651095509529, 0.011665897443890572, 0....
2        [0.014082466252148151, -0.03321152925491333, 0...
3        [-0.012030362151563168, -0.09689673781394958, ...
4        [0.09165066480636597, 0.02311382256448269, -0....
                               ...                        
14995    [0.0033561831805855036, -0.03329373523592949, ...
14996    [0.023134496062994003, 0.05239043012261391, -0...
14997    [0.05148150399327278, 0.02505842037498951, -0....
14998    [-0.00686113815754652, -0.009262137115001678, ...
14999    [0.03549158573150635, 0.011437198147177696, -0...
Name: vector, Length: 15000, dtype: object

In [27]:
# Reduce the document vectors with UMAP's default settings.
# random_state is pinned (same seed as the second UMAP pass) so that a fresh
# Restart & Run All reproduces the same coordinates; without it every run
# yields a different layout.
embedding = umap.UMAP(random_state=42).fit_transform(df['vector'].tolist()) #reduce dimensionality of vector representation

In [28]:
# Second UMAP pass with clustering-oriented settings (larger neighbourhood,
# min_dist=0.0 so points may pack tightly) and a fixed seed.
# `embedding` is already the array returned by fit_transform, so it is passed
# directly: `.data` on a NumPy array is the raw memory buffer, not a dataset
# attribute.
# NOTE(review): this pass is fitted on the output of the previous UMAP rather
# than on the original document vectors — confirm that the double projection
# is intended.
clusterable_embedding = umap.UMAP(
    n_neighbors=30,
    min_dist=0.0,
    n_components=2,
    random_state=42,
).fit_transform(embedding)

In [29]:
# Interactive scatter plot: one marker per paper, positioned by its 2-D UMAP
# coordinates and coloured by the assigned topic id.
fig = px.scatter(
    data_frame=df,
    x=clusterable_embedding[:, 0],
    y=clusterable_embedding[:, 1],
    color="topic_id",
    hover_data=["topic_id", "doi"],  # shown next to the x/y coordinates on hover
)

# Fixed square canvas with a title.
fig.update_layout(
    title="UMAP projection of the document vectors",
    autosize=False,
    width=1000,
    height=1000,
)

# Render inline and also save a standalone HTML copy of the figure.
fig.show()
fig.write_html("interactive-plot.html")