<a href="https://colab.research.google.com/github/nice-digital/nice-ds-literatureprocessing/blob/master/code/breast_cancer/topicmodel_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Topic modelling on pre-processed dataset**

This Colab notebook creates topic models on pre-processed datasets


**Inputs**
- Data file: The input data file must be the output of the preprocessing Colab or the categorisation Colab. Name your input file *input_data.csv*.

**Outputs**

- The interactive plots are the key output, together with the CSV file *processed_model.csv*, which is written to the *RESULTS* folder and lists the best-matching topic for each study.

Upload the input file by pressing the upload button at the top left of the left sidebar. The results will appear in a folder named *RESULTS*, which the code creates automatically.

In [None]:
#@title Install Python packages { form-width: "20%" }

#@markdown Please execute this cell by pressing the _Play_ button 
#@markdown on the left to download and import third-party software 
#@markdown in this Colab notebook. 

#@markdown This installs the software on the Colab 
#@markdown notebook in the cloud and not on your computer.
from IPython.utils import io
try:
  with io.capture_output() as captured:
    %shell pip install pyLDAvis==3.3.1
    %shell pip install import-ipynb
    %shell pip install pandas
    %shell pip install shutup
   
except subprocess.CalledProcessError:
  print(captured)
  raise
import shutup
shutup.please()

import os
import numpy as np
import pandas as pd
from pathlib import Path

from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()


# Do not truncate long cell contents when DataFrames are displayed.
pd.options.display.max_colwidth = None


In [None]:
#@title File settings to get started { form-width: "20%" }

#@markdown Please ensure the input_data.csv is uploaded and execute this cell by pressing the _Play_ button 
#@markdown on the left 
# Name of the uploaded input file.
DATA_PATH = 'input_data.csv'

results_folder = 'RESULTS'
RESULTS_FOLDER = results_folder     #***user input
RESULTS_PATH = Path(RESULTS_FOLDER)
# Create the output folder on first run; nothing happens if it already exists.
os.makedirs(RESULTS_FOLDER, exist_ok=True)

In [None]:
#@title Read input file  { form-width: "20%" }

def handle_listStrings(list_tokens):
  """Parse the text form of a token list, as stored in the CSV, back into a Python object.

  Args:
    list_tokens: String holding a Python literal, e.g. "['a', 'b']".

  Returns:
    The object described by the literal (a list for the token columns).

  Raises:
    ValueError, SyntaxError: If the string is not a valid Python literal.
  """
  import ast

  # literal_eval accepts literals only, so a crafted cell in the uploaded CSV
  # cannot execute arbitrary code the way the previous eval() call could.
  return ast.literal_eval(list_tokens)

# Read the file configured in the settings cell rather than repeating its name.
df = pd.read_csv(DATA_PATH)

# Keep only the rows flagged for inclusion. The explicit copy makes lit_data an
# independent frame, so the column assignments below act on lit_data itself and
# do not trigger pandas' SettingWithCopyWarning.
lit_data = df.loc[df['include'] > 0].copy()

# The token columns hold the text form of Python lists; turn them back into lists.
for token_column in ('medTokens', 'cancerTokens', 'entTokens'):
  lit_data[token_column] = lit_data[token_column].apply(handle_listStrings)

In [None]:
# Number of rows kept after the `include > 0` filter.
len(lit_data)

822

In [None]:
#@title Function definitions for LDA model results { form-width: "20%" }

#@markdown Please execute this cell by pressing the _Play_ button on the left 

from matplotlib import pyplot as plt
from wordcloud import WordCloud
import matplotlib.colors as mcolors
from math import ceil
from collections import Counter

def get_document_topic_table(lda_model, corpus):
  """Build a table holding the dominant topic of every document in a corpus.

  Args:
    lda_model: Trained topic model. ``lda_model[corpus]`` must yield, for each
      document, a list of ``(topic_id, probability)`` pairs, and
      ``lda_model.show_topic(topic_id)`` a list of ``(word, weight)`` pairs.
    corpus: Corpus handed to ``lda_model[...]``.

  Returns:
    DataFrame with one row per document and the columns ``best_topic``,
    ``prop_topic``, ``topic_keywords``, ``document_num`` and ``topic_dist``
    (text form of the document's topic list, most probable first).
  """
  columns = ['best_topic', 'prop_topic', 'topic_keywords', 'document_num', 'topic_dist']
  keywords_by_topic = {}   # topic id -> keyword string, so show_topic runs once per topic
  records = []

  for doc_num, topic_probs in enumerate(lda_model[corpus]):
    # Most probable topic first; the whole ranking is kept as text in topic_dist.
    ranked = sorted(topic_probs, key=lambda pair: pair[1], reverse=True)

    if not ranked:
      # No topic reported for this document: keep a placeholder row so the
      # table stays aligned one-to-one with the documents.
      records.append((float('nan'), float('nan'), '', float(doc_num), str(ranked)))
      continue

    topic_num = ranked[0][0]
    prop_topic = ranked[0][1]
    if topic_num not in keywords_by_topic:
      keywords_by_topic[topic_num] = ", ".join(
          word for word, _ in lda_model.show_topic(topic_num))

    # Numeric values are stored as floats, which is the dtype the previous
    # cell-by-cell construction produced, so exported CSVs keep their format.
    records.append((float(topic_num), float(prop_topic),
                    keywords_by_topic[topic_num], float(doc_num), str(ranked)))

  # Build the frame once instead of enlarging it cell by cell, which
  # reallocates the whole frame for every new row.
  return pd.DataFrame(records, columns=columns)

def model_lda(dict, corpus):
  """Train an LDA model with NUM_TOPICS topics and print its fit statistic and topics.

  Args:
    dict: Vocabulary passed to LdaModel as ``id2word``. The name shadows the
      builtin but is kept so that existing keyword callers keep working.
    corpus: Corpus the model is trained on (the caller in this notebook passes
      bag-of-words vectors built with ``Dictionary.doc2bow``).

  Returns:
    The trained LdaModel.
  """
  # Build LDA model; random_state is fixed so repeated runs give the same topics.
  lda_model = LdaModel(corpus=corpus, id2word=dict, num_topics=NUM_TOPICS, random_state=100,
                chunksize=200, passes=100)

  # NOTE: gensim documents log_perplexity as the per-word likelihood bound
  # (perplexity = 2 ** -bound), so a HIGHER value (closer to zero) indicates
  # a better fit, not a lower one.
  print('\nPerplexity: ', lda_model.log_perplexity(corpus))

  print("***Word distribution in topics***")
  # The return value of print_topics() was previously discarded (and sliced one
  # topic short), so nothing appeared under the header. Print every topic.
  for topic_id, topic_words in lda_model.print_topics(num_topics=NUM_TOPICS):
    print(topic_id, topic_words)

  return lda_model

def show_wordcloud(lda_model):
  """Draw one word cloud per topic on a two-column grid.

  Up to NUM_TOPICS topics are shown; word sizes follow the word weights
  returned by ``lda_model.show_topics``.

  Args:
    lda_model: Trained model providing ``show_topics(formatted=False)``.
  """
  palette = list(mcolors.TABLEAU_COLORS.values())  # more colors: 'mcolors.XKCD_COLORS'

  # num_topics=-1 asks gensim for every topic in id order; the default call
  # returns at most 10 topics and, for larger models, not in id order.
  topics = lda_model.show_topics(num_topics=-1, formatted=False)
  shown = min(NUM_TOPICS, len(topics))

  ncol = 2
  nrow = ceil(NUM_TOPICS/ncol)
  fig, axes = plt.subplots(nrow, ncol, figsize=(10,10), sharex=True, sharey=True)

  for i, ax in enumerate(axes.flatten()):
    ax.set_axis_off()
    if i >= shown:
      # Spare grid cell: leave it blank.
      continue

    topic_id, word_weights = topics[i]
    # Wrap around the palette so more topics than colours does not fail.
    topic_colour = palette[i % len(palette)]
    cloud = WordCloud(background_color='white',
                      width=2500,
                      height=1800,
                      max_words=30,
                      colormap='tab10',
                      # Bind the colour now rather than reading a loop variable later.
                      color_func=lambda *args, colour=topic_colour, **kwargs: colour,
                      prefer_horizontal=1.0)
    cloud.generate_from_frequencies(dict(word_weights), max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))

  plt.subplots_adjust(wspace=0, hspace=0)
  plt.axis('off')
  plt.margins(x=0, y=0)
  plt.tight_layout()
  plt.show()


def show_wordimportance(lda_model, data):
  """Plot, for each topic, its keywords' corpus counts next to their topic weights.

  Args:
    lda_model: Trained model providing ``show_topics(formatted=False)``.
    data: Iterable of token lists (one per document); used to count how often
      each keyword occurs overall.
  """
  # Use the model that was passed in (the previous code read the notebook
  # global `lda_model_med`). num_topics=-1 asks gensim for every topic in id
  # order; the default call returns at most 10 topics.
  topics = lda_model.show_topics(num_topics=-1, formatted=False)
  counter = Counter(w for w_list in data for w in w_list)

  out = []
  for topic_id, topic in topics:
    for word, weight in topic:
      out.append([word, topic_id, weight, counter[word]])

  importance_df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
  ncol = 2
  nrow = ceil(NUM_TOPICS/ncol)

  # Plot Word Count and Weights of Topic Keywords
  fig, axes = plt.subplots(nrow, ncol, figsize=(16,20), sharey=True,dpi=160)
  cols = list(mcolors.TABLEAU_COLORS.values())
  for i, ax in enumerate(axes.flatten()):
    if i >= NUM_TOPICS:
      # Spare grid cell when NUM_TOPICS is odd: hide it instead of drawing an
      # empty chart for a topic that does not exist.
      ax.set_axis_off()
      continue

    # Wrap around the palette so more topics than colours does not fail.
    colour = cols[i % len(cols)]
    topic_df = importance_df.loc[importance_df.topic_id==i, :]

    ax.bar(x='word', height="word_count", data=topic_df, color=colour, width=0.5, alpha=0.3, label='Word Count')
    # Second y-axis so counts and weights, which differ in scale, share one chart.
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_df, color=colour, width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=colour)
    ax_twin.set_ylim(0, 0.5); ax.set_ylim(0, 1000)
    ax.set_title('Topic: ' + str(i), color=colour, fontsize=16)
    ax.tick_params(axis='y', left=False)
    ax.set_xticklabels(topic_df['word'], rotation=30, horizontalalignment= 'right')
    ax.legend(loc='upper left'); ax_twin.legend(loc='upper right')

  fig.tight_layout(w_pad=5)


In [None]:
#@title Specify number of topics  { form-width: "20%" }

#@markdown Please execute this cell by pressing the _Play_ button on the left 

# Number of topics the LDA model is trained with; the plotting helpers also
# use it to size their two-column grids.
NUM_TOPICS = 4

In [None]:
#@title Specify exclusions { form-width: "20%" }

# Terms removed from the medical-entity tokens before modelling.
# NOTE(review): several entries refer to kidney cancer, while this notebook
# sits in a breast_cancer folder; confirm the list matches the dataset in use.
MED_IRRELEVANT = [
    'Cancer',
    'Kidney cancer',
    'cancer',
    'kidney cancer',
    'cancers',
    'kidney cancers',
    'objective(s',
    'purpose(s',
    'tumors',
    'patients',
    'patient',
    'kidney cancer patients',
]

In [None]:
#@title LDA modelling on med tokens  { form-width: "20%" }
# Medical-entity tokens per study, with the excluded terms removed.
DATA = lit_data["medTokens"].apply(
    lambda tokens: [token for token in tokens if token not in MED_IRRELEVANT]
)

# Vocabulary and bag-of-words representation of the documents.
DICTIONARY = Dictionary(DATA)
CORPUS = [DICTIONARY.doc2bow(tokens) for tokens in DATA]
lda_model_med = model_lda(DICTIONARY, CORPUS)

# Put the per-document topic table next to the study data. Both frames get a
# fresh 0..n-1 index first so the column-wise concat pairs rows by position.
document_topic_df = get_document_topic_table(lda_model=lda_model_med, corpus=CORPUS)
lit_data = lit_data.reset_index(drop=True)
document_topic_df = document_topic_df.reset_index(drop=True)
out_df = pd.concat([lit_data, document_topic_df], axis=1)
out_df.to_csv(RESULTS_PATH / "processed_model.csv", index=False)

# Last expression of the cell: the notebook displays the topics.
lda_model_med.print_topics()[:NUM_TOPICS]


In [None]:
#@title Word cloud of topics  { form-width: "20%" }
# Draw the per-topic word clouds for the model trained on the medical tokens.
show_wordcloud(lda_model_med)

In [None]:
#@title Word importance in the topics  { form-width: "20%" }
# Plot keyword counts (taken from DATA) against keyword weights for each topic.
show_wordimportance(lda_model_med, DATA)

In [None]:
#@title PyLDAVis  { form-width: "20%" }
# Prepare the interactive pyLDAvis view from the trained model, its corpus and its vocabulary.
# NOTE(review): sort_topics=False is passed explicitly, presumably so that pyLDAvis
# keeps the model's own topic order; confirm against the pyLDAvis docs.
vis = gensimvis.prepare(lda_model_med, CORPUS, dictionary=lda_model_med.id2word, sort_topics=False)
# Leaving `vis` as the cell's last expression makes the notebook render it.
vis