In [1]:
from bertopic import BERTopic
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import spacy
import gensim
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS

In [19]:
MY_STOP_WORDS = ['study', 'task', 'test']

# Note: run this to download the SpaCy model: `python -m spacy download en_core_web_sm`
nlp = spacy.load('en_core_web_sm')


def preprocess(texts: list[str], corpus_name: str):
  """Opinionated preprocessing pipeline.

  Args:
      texts (list[str]): list of texts, each item is one text document.
      corpus_name (str): Name of the corpus

  Returns:
      list[str]: preprocessed documents
  """
  # DEBUG standard preprocessing pipeline
  # docs = \
  #   texts['abstract'].progress_apply(lambda abstract: gensim.parsing.preprocess_string(abstract)).to_list()

  print('Preprocessing...')

  # additional stop words
  for stop_word in MY_STOP_WORDS:
    lexeme = nlp.vocab[stop_word]
    lexeme.is_stop = True

  # flake8: noqa: W503
  def _clean(doc):
    cleaned = []
    for token in doc:
      if (not token.is_punct
          and token.is_alpha
          and not token.is_stop
          and not token.like_num
          and not token.is_space):
        cleaned.append(token.lemma_.lower().strip())
    return cleaned

  docs = tqdm([_clean(txt) for txt in nlp.pipe(texts)], desc='Cleaning docs')

  # bigram
  ngram_phrases = gensim.models.Phrases(docs, connector_words=ENGLISH_CONNECTOR_WORDS)

  # there are cases that a test or construct contains 4 terms; a heuristic is to count spaces in the corpus_name
  for _ in range(max(1, 2 + corpus_name.count(' '))):
    ngram_phrases = gensim.models.Phrases(ngram_phrases[docs], connector_words=ENGLISH_CONNECTOR_WORDS)

  ngram = gensim.models.phrases.Phraser(ngram_phrases)
  docs = [' '.join(doc) for doc in ngram[docs]]
  # DEBUG filter ngram stop words: docs = [[w for w in doc if w not in my_stop_words] for doc in docs]

  return docs


In [23]:
csv_files = Path('data/pubmed/tests').glob('Odd One Out.csv')


corpora = []

for csv_file in tqdm(csv_files, desc='Reading CSV files'):
    df = pd.read_csv(csv_file)
    df['corpus_name'] = csv_file.stem
    corpora.append(df)

df = pd.concat(corpora, axis=0)

df['abstract'].fillna(df['title'], inplace=True)

docs = preprocess(df['abstract'].to_list(), 'Odd One Out')

Reading CSV files: 1it [00:00, 98.17it/s]
Preprocessing...
Cleaning docs: 100%|██████████| 107/107 [00:00<00:00, 7989.29it/s]


In [25]:
topic_model = BERTopic(verbose=True)
topics, _ = topic_model.fit_transform(docs)
topic_model.get_topic_info()

Batches: 100%|██████████| 4/4 [00:14<00:00,  3.57s/it]
2021-06-03 19:17:43,272 - BERTopic - Transformed documents to Embeddings
2021-06-03 19:17:45,780 - BERTopic - Reduced dimensionality with UMAP
2021-06-03 19:17:45,788 - BERTopic - Clustered UMAP embeddings with HDBSCAN


Unnamed: 0,Topic,Count,Name
0,-1,71,-1_child_stimulus_experiment_color
1,0,20,0_color_reward_treatment_allele
2,1,16,1_protein_cell_gene_helicase


In [27]:
topic_model.get_topics()
# topic_model.get_topic(0)

{-1: [('child', 0.03153178064934693),
  ('stimulus', 0.028456058651006318),
  ('experiment', 0.024494730017555595),
  ('color', 0.02255099067164911),
  ('test', 0.02231742068266177),
  ('perception', 0.02086488715683126),
  ('representation', 0.019964141467129288),
  ('distractor', 0.019606871528897132),
  ('show', 0.018906157510529965),
  ('processing', 0.0183176041532666)],
 0: [('color', 0.035486059391031714),
  ('reward', 0.02843603237278446),
  ('treatment', 0.02756451441487837),
  ('allele', 0.02604570095238959),
  ('food', 0.02604570095238959),
  ('attentionalbias', 0.024808062973390534),
  ('colour', 0.024454631211526794),
  ('foodcue', 0.023602147318167184),
  ('stimulus', 0.023043766123039936),
  ('compare', 0.022728946003413363)],
 1: [('protein', 0.07095482612846987),
  ('cell', 0.051791519395012066),
  ('gene', 0.045993375578756775),
  ('helicase', 0.03630565602176893),
  ('cytokine', 0.03630565602176893),
  ('domain', 0.035612814635554914),
  ('mutation', 0.03245474295167

In [26]:
topics_over_time = topic_model.topics_over_time(docs, topics, df['year'], datetime_format="%b")
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

29it [00:14,  1.99it/s]


In [12]:
# word cloud
# from wordcloud import WordCloud

# text = df['preprocessed_abstract'].str.join(' ').str.cat()

# cloud = WordCloud(width=500,height=500,background_color ='white').generate(text)

# plt.figure(figsize=(10,10))
# plt.imshow(cloud)
# plt.axis('off')
# plt.show()