## c-TD-IDF

Steps:

- cluster (DBSCAN)
- dimension reduction
- tf-IDF


$c\_td\_idf = {t_i \over w_i} \times log{m \over {\Sigma_j^n t_j}}$


In [2]:
from bertopic import BERTopic
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import spacy
import gensim
from gensim.models.phrases import ENGLISH_CONNECTOR_WORDS

In [3]:
MY_STOP_WORDS = ['study', 'task', 'test']

# Note: run this to download the SpaCy model: `python -m spacy download en_core_web_sm`
nlp = spacy.load('en_core_web_sm')


def preprocess(texts: list[str], corpus_name: str):
  """Opinionated preprocessing pipeline.

  Args:
      texts (list[str]): list of texts, each item is one text document.
      corpus_name (str): Name of the corpus

  Returns:
      list[str]: preprocessed documents
  """
  # DEBUG standard preprocessing pipeline
  # docs = \
  #   texts['abstract'].progress_apply(lambda abstract: gensim.parsing.preprocess_string(abstract)).to_list()

  print('Preprocessing...')

  # additional stop words
  for stop_word in MY_STOP_WORDS:
    lexeme = nlp.vocab[stop_word]
    lexeme.is_stop = True

  # flake8: noqa: W503
  def _clean(doc):
    cleaned = []
    for token in doc:
      if (not token.is_punct
          and token.is_alpha
          and not token.is_stop
          and not token.like_num
          and not token.is_space):
        cleaned.append(token.lemma_.lower().strip())
    return cleaned

  docs = tqdm([_clean(txt) for txt in nlp.pipe(texts)], desc='Cleaning docs')

  # bigram
  ngram_phrases = gensim.models.Phrases(docs, connector_words=ENGLISH_CONNECTOR_WORDS)

  # there are cases that a test or construct contains 4 terms; a heuristic is to count spaces in the corpus_name
  for _ in range(max(1, 2 + corpus_name.count(' '))):
    ngram_phrases = gensim.models.Phrases(ngram_phrases[docs], connector_words=ENGLISH_CONNECTOR_WORDS)

  ngram = gensim.models.phrases.Phraser(ngram_phrases)
  docs = [' '.join(doc) for doc in ngram[docs]]
  # DEBUG filter ngram stop words: docs = [[w for w in doc if w not in my_stop_words] for doc in docs]

  return docs


In [4]:
csv_files = Path('data/pubmed/tests').glob('N-back.csv')


corpora = []

for csv_file in tqdm(csv_files, desc='Reading CSV files'):
    df = pd.read_csv(csv_file)
    df['corpus_name'] = csv_file.stem
    corpora.append(df)

df = pd.concat(corpora, axis=0)

df['abstract'].fillna(df['title'], inplace=True)

docs = preprocess(df['abstract'].to_list(), 'Odd One Out')

Reading CSV files: 1it [00:00, 16.78it/s]
Preprocessing...
Cleaning docs: 100%|██████████| 2091/2091 [00:00<00:00, 6529.09it/s]


In [5]:
topic_model = BERTopic(verbose=True)
topics, _ = topic_model.fit_transform(docs)
topic_model.get_topic_info()

Batches: 100%|██████████| 66/66 [04:37<00:00,  4.20s/it]
2021-06-07 15:18:22,605 - BERTopic - Transformed documents to Embeddings
2021-06-07 15:18:35,779 - BERTopic - Reduced dimensionality with UMAP
2021-06-07 15:18:35,843 - BERTopic - Clustered UMAP embeddings with HDBSCAN


Unnamed: 0,Topic,Count,Name
0,-1,840,-1_performance_workmemory_group_activation
1,0,188,0_schizophrenia_patientschizophrenia_patient_s...
2,1,124,1_child_adhd_adolescent_school
3,2,79,2_activation_increase_activate_functional
4,3,58,3_training_fluidintelligence_transfereffect_pe...
5,4,57,4_depression_mdd_bipolardisorder_bipolar
6,5,57,5_anodaltdcs_currentstimulation_transcranialdi...
7,6,56,6_auditory_speech_sound_noise
8,7,48,7_sleep_sleepdeprivation_sleepiness_sleepduration
9,8,46,8_da_receptor_tolcapone_dopamine


In [6]:
topic_model.get_topics()
# topic_model.get_topic(0)

{-1: [('performance', 0.010888964538598979),
  ('workmemory', 0.009720892020278222),
  ('group', 0.00953702049354962),
  ('activation', 0.00857420752877851),
  ('patient', 0.008544621142061939),
  ('show', 0.007953325208550764),
  ('cognitive', 0.007470983083734355),
  ('memory', 0.007309992248294156),
  ('fmri', 0.0069166095176036516),
  ('brain', 0.0069154893947964745)],
 0: [('schizophrenia', 0.05164991204086579),
  ('patientschizophrenia', 0.02192184755623473),
  ('patient', 0.018884513373034578),
  ('schizophreniapatient', 0.01563152149473723),
  ('cortex', 0.011935310614606487),
  ('psychosis', 0.010549994118156948),
  ('dorsolateralprefrontal', 0.010501943471310686),
  ('treatment', 0.00970186793034461),
  ('schizophrenicpatient', 0.008615256390980824),
  ('prefrontalcortex', 0.00812892157621763)],
 1: [('child', 0.05466766583895227),
  ('adhd', 0.028794282776514257),
  ('adolescent', 0.018339723081029307),
  ('school', 0.014883090590819511),
  ('childadhd', 0.013939385452704492

In [7]:
topics_over_time = topic_model.topics_over_time(docs, topics, df['year'], datetime_format="%b")
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

30it [02:39,  5.33s/it]


In [12]:
# word cloud
# from wordcloud import WordCloud

# text = df['preprocessed_abstract'].str.join(' ').str.cat()

# cloud = WordCloud(width=500,height=500,background_color ='white').generate(text)

# plt.figure(figsize=(10,10))
# plt.imshow(cloud)
# plt.axis('off')
# plt.show()