# Load CogAtlas data and assign CogAtlas categories to each paper

In [1]:
import pandas as pd
from tqdm import tqdm

from neurovlm.brain_input import search_cogatlas_from_brain
from neurovlm.data import data_dir
from neurovlm.retrieval_resources import (
    _load_dataframe,
    _load_latent_text,
    _load_latent_cogatlas,
    _load_latent_cogatlas_disorder,
    _load_latent_cogatlas_task,
    _load_cogatlas_dataset,
    _load_cogatlas_disorder_dataset,
    _load_cogatlas_task_dataset,
)
from neurovlm.text_input import search_cogatlas_from_text


In [2]:
# Papers: the metadata table plus the latent text resource, which unpacks into
# two parallel objects that the annotation loop indexes by position.
papers_df = _load_dataframe()
latent_papers, latent_pmid = _load_latent_text()

# Cognitive Atlas concepts: table and latent resource.
cogatlas_df = _load_cogatlas_dataset()
latent_cogatlas, latent_terms = _load_latent_cogatlas()

# Cognitive Atlas disorders: table and latent resource.
cogatlas_disorder_df = _load_cogatlas_disorder_dataset()
latent_cogatlas_disorder, cogatlas_disorder = _load_latent_cogatlas_disorder()

# Cognitive Atlas tasks: table (loaded with filtered=True) and latent resource.
cogatlas_task_df = _load_cogatlas_task_dataset(filtered=True)
latent_cogatlas_task, cogatlas_task = _load_latent_cogatlas_task()

In [3]:
# Per-paper accumulators; entry k of every list belongs to paper k of
# latent_papers, so the lists stay aligned row by row.
pmids = []
cogatlas_concepts_list = []
cogatlas_disorders_list = []
cogatlas_tasks_list = []

# Each search category paired with the list that collects its matches.
# The tuple order is the order in which the searches run for every paper.
category_targets = (
    ("cogatlas", cogatlas_concepts_list),
    ("cogatlas_disorder", cogatlas_disorders_list),
    ("cogatlas_task", cogatlas_tasks_list),
)

for idx in tqdm(range(len(latent_papers))):
    embedding = latent_papers[idx]
    paper_pmid = latent_pmid[idx]

    # Query every category with top_k=2 for this paper. Only the middle
    # element of the returned triple is kept; the other two are discarded.
    found = []
    for category, _ in category_targets:
        _, matches, _ = search_cogatlas_from_text(
            query=embedding,
            top_k=2,
            category=category,
        )
        found.append(matches)

    # Append only after all searches for this paper have returned, so the
    # four lists always have equal length.
    pmids.append(paper_pmid)
    for (_, target), matches in zip(category_targets, found):
        target.append(matches)

# One row per paper: its PMID plus the kept matches for each category.
df_results = pd.DataFrame(
    {
        'pmid': pmids,
        'cogatlas_concepts': cogatlas_concepts_list,
        'cogatlas_disorder': cogatlas_disorders_list,
        'cogatlas_task': cogatlas_tasks_list,
    }
)

df_results

100%|██████████| 30826/30826 [03:18<00:00, 155.11it/s]


Unnamed: 0,pmid,cogatlas_concepts,cogatlas_disorder,cogatlas_task
0,1589767,"[speech processing, auditory working memory]","[speech disorder, articulation disorder]","[auditory scene perception, passive listening]"
1,8530552,"[attentional effort, vocal response execution]","[stroke, tetraplegia]","[isometric force, motor sequencing task]"
2,8624678,"[language processing, language]","[surface dyslexia, aphasia]","[semantic anomaly judgement task, orthographic..."
3,8670634,"[goal maintenance, active maintenance]","[tetraplegia, rigidity]","[articulatory suppression task, bimanual coord..."
4,8994101,"[pitch perception, spontaneous recovery]","[philoprogeria, periodic limb movement disorder]","[multistability, nine-hole peg test]"
...,...,...,...,...
30821,38828086,"[search, loss]","[somatization disorder, kleine-levin syndrome]","[brief symptom inventory, broader phenotype au..."
30822,38832323,"[behavioral inhibition, attentional effort]","[learning disability, writing disorder]","[oculomotor delayed response, braille reading ..."
30823,38832325,"[autobiographical recall, autobiographical mem...","[borderline personality disorder, anterograde ...","[broader phenotype autism symptom scale, immed..."
30824,38832358,"[auditory word comprehension, auditory word re...","[aphasia, verbal auditory agnosia]","[chewing/swallowing, abstract/concrete judgmen..."


In [5]:
# Persist the per-paper CogAtlas assignments as a parquet file inside the
# project's data directory. `data_dir` comes from the notebook's import cell,
# so all dependencies are declared in one place at the top.
df_results.to_parquet(data_dir / "paper_cogatlas_categories.parquet")

In [10]:
# Also export a plain-text copy. `to_csv` always writes CSV text, so the file
# is named *.csv; a *.parquet name would make parquet readers fail on it.
# The path is relative, so the file lands in the current working directory.
# NOTE(review): sequence-valued cells are written as their string form and
# will need parsing if this CSV is read back.
df_results.to_csv("papers_cogatlas_categories.csv")