# Hierarchical topic modelling

Here we fit a hierarchical topic model to the COVID-19 paper data. We are especially interested in clustering papers into groups and then analysing the topics of each group.

Steps:

* Pre-process the data
* Train the model

## Preamble

In [None]:
%run ../notebook_preamble.ipy

In [None]:
from cord19.hSBM_Topicmodel.sbmtm import sbmtm
from cord19.transformers.nlp_2 import *

import gensim
from gensim.models.phrases import Phrases, Phraser

from toolz.curried import *
import altair as alt


In [None]:
def preview(x):
    """Print a short summary of *x* and hand *x* back unchanged.

    Prints ``x.head()``, a blank separator and ``x.shape``. Because the
    argument itself is returned, the function can be used inside a
    ``.pipe(...)`` chain without interrupting it.
    """
    first_rows = x.head()
    print(first_rows)
    print('\n')
    dimensions = x.shape
    print(dimensions)
    return x

## 1. Read data

In [None]:
# Load the processed covid article table and print a quick preview of it
cov_ = preview(pd.read_csv(f"{project_dir}/data/processed/covid_df.csv"))

In [None]:
# Ids of the papers flagged as AI in a previous analysis, kept as a set
# for fast membership tests
ai_ids = set(
    pd.read_csv(f"{project_dir}/data/raw/ai_research/ai_paper_ids.csv")['id'].tolist()
)

## 2. Process data

In [None]:
# Keep only the papers that have an abstract
cov_ = preview(cov_.dropna(axis=0, subset=['abstract']))

# ...and, of those, only the ones whose abstract is longer than 300 characters.
# The row index is rebuilt afterwards so it runs 0..n-1 again.
cov = preview(cov_.loc[cov_['abstract'].map(len) > 300]).reset_index(drop=True)

In [None]:
# Lookup from article id to MAG id.
# Built straight from the two columns: iterating with iterrows() creates a
# Series per row, which is slow and may upcast the values' dtypes.
id_magid_lookup = dict(zip(cov['id'], cov['mag_id']))

In [None]:
# #Clean and tokenise the data

# abst = cov['abstract']

# abst = [re.sub("\n"," ",x) for x in abst]

# ct = CleanTokenize(abst)

In [None]:
# ct.clean().bigram(threshold=20).bigram(threshold=20)

## 3. Model

In [None]:
# docs = ct.tokenised
# titles = list(cov['id'])

In [None]:
# %%time
# model = sbmtm()
# model.make_graph(docs,documents=titles)
# model.fit()

In [None]:
# #Save model

# with open(f"{project_dir}/models/top_sbm/top_sbm.p",'wb') as outfile:
#     pickle.dump(model,outfile)

In [None]:
# Load the fitted model from disk instead of re-training it.
# NOTE: unpickling runs arbitrary code, so only load model files you trust.
top_sbm_path = f"{project_dir}/models/top_sbm/top_sbm.p"
with open(top_sbm_path, 'rb') as model_file:
    model = pickle.load(model_file)

### Extract relevant information

In [None]:
# Extract the word mix (the word components of each topic) at level l=0.
# Used below as a mapping from topic key to a sequence of entries whose
# first element is the word.
# NOTE(review): l=0 is presumably the finest level of the hierarchy - confirm.
word_mix = model.topics(l=0)

In [None]:
# Give every topic a readable name: the first element of its first five
# word entries, joined with underscores
topic_name_lookup = {
    topic: '_'.join(entry[0] for entry in entries[:5])
    for topic, entries in word_mix.items()
}
topic_names = [*topic_name_lookup.values()]

In [None]:
# Topic mix table: one row per paper id, one column per topic name.
# The index is passed as a plain list so it stays unnamed; reset_index()
# will then produce a column called 'index'.
# NOTE(review): this assumes the model's documents are in the same order as
# cov['id'] - confirm.
topic_mix_ = pd.DataFrame(
    data=model.get_groups(l=0)['p_tw_d'].T,
    index=cov['id'].tolist(),
    columns=topic_names,
)

In [None]:
#Remove highly uninformative / generic topics

# Share of papers in which each topic has a non-zero weight. A vectorised
# comparison replaces the deprecated, element-wise DataFrame.applymap.
topic_prevalence = (topic_mix_ > 0).mean().sort_values(ascending=False)

# Show the generic topics that are about to be dropped. A bare expression in
# the middle of a cell is evaluated and discarded, so print it explicitly.
print(topic_prevalence.loc[topic_prevalence > 0.4])

# Keep the topics present in fewer than 40% of the papers
filter_topics = topic_prevalence.index[topic_prevalence < 0.4]

# Take an explicit copy: new columns are assigned to topic_mix further down,
# and assigning into a slice of topic_mix_ raises SettingWithCopyWarning.
topic_mix = topic_mix_[filter_topics].copy()

In [None]:
# Document clusters at level 1. Asking for as many documents per cluster as
# there are papers in total is meant to force every document into a cluster.
cluster_assigment = model.clusters(l=1, n=len(cov))

# For each cluster keep the set of first elements of its entries
# (presumably the paper ids - confirm against model.clusters)
cluster_sets = {
    cluster: {entry[0] for entry in entries}
    for cluster, entries in cluster_assigment.items()
}

In [None]:
#Assign topics to their clusters
#Add AI dummy and cluster dummy
topic_mix['is_ai'] = [paper_id in ai_ids for paper_id in topic_mix.index]

# Invert cluster -> papers into paper -> cluster label once, instead of
# scanning every cluster again for every paper.
paper_cluster = {}
for cluster_id, members in cluster_sets.items():
    for paper_id in members:
        # setdefault keeps the first cluster that contains the paper, which is
        # the one the previous per-paper scan selected
        paper_cluster.setdefault(paper_id, f'cluster_{cluster_id}')

# A paper without a cluster now fails with a KeyError naming that paper
# rather than an opaque "list index out of range".
topic_mix['cluster'] = [paper_cluster[paper_id] for paper_id in topic_mix.index]

In [None]:
# Reshape to long format: one row per (paper, topic) pair carrying the topic
# weight, with the paper id ('index'), AI flag and cluster as identifiers
topic_mix_long = pd.melt(
    topic_mix.reset_index(),
    id_vars=['index', 'is_ai', 'cluster'],
    var_name='topic',
    value_name='weight',
)

In [None]:
# Attach the MAG id of every paper, looked up from its article id
topic_mix_long = topic_mix_long.assign(
    mag_id=topic_mix_long['index'].map(id_magid_lookup)
)

In [None]:
# The 'index' column holds the article ids, so name it accordingly
topic_mix_long = topic_mix_long.rename(columns={'index': 'article_id'})

In [None]:
# Save the tidy paper-topic table
tidy_topics_path = f"{project_dir}/data/processed/ai_research/tidy_paper_topics_ai_2.csv"
topic_mix_long.to_csv(tidy_topics_path)

In [None]:
# Save the cluster label of every paper (indexed by paper id)
paper_cluster_path = f"{project_dir}/data/processed/ai_research/paper_cluster.csv"
topic_mix['cluster'].to_csv(paper_cluster_path)