In [46]:
import pandas as pd
from neurovlm.data import get_data_dir
from ollama import chat
from pydantic import BaseModel, Field
from typing import Optional

## Ollama Setup

** 1. Install Ollama **
- ** using brew: **
```sh
brew install ollama
```

** 2. Start Ollama Service **
```
ollama serve
```

** 3. Download the ollama app **

** 4. Pull a Model(Our default model is qwen2.5:7b) **
```
ollama pull qwen2.5:7b-instruct  or ollama pull qwen2.5:3b-instruct
```


# Load data

In [47]:
# Load data and Specter
data_dir = get_data_dir()

wiki_df = pd.read_parquet(f"{data_dir}/neurowiki_with_ids.parquet")
papers_df = pd.read_parquet(f"{data_dir}/publications_more.parquet")


In [48]:
papers_df

Unnamed: 0,pmid,pmcid,doi,name,description
0,24911975,,10.1371/journal.pone.0099222,Acute aerobic exercise increases cortical acti...,There is increasing evidence that acute aerobi...
1,22884992,,10.1016/j.dcn.2012.07.001,Developmental differences in the neural correl...,Despite vast knowledge on the behavioral proce...
2,15722210,,10.1016/j.cogbrainres.2004.09.011,The neural substrate of arithmetic operations ...,Recent functional neuroimaging studies have be...
3,21930137,,10.1016/j.neuropsychologia.2011.09.006,Neural processing associated with comprehensio...,"In daily communication, we often use indirect ..."
4,21930160,,10.1097/gme.0b013e3181cc49e9,Postmenopausal hormone use impact on emotion p...,Despite considerable evidence for potential ef...
...,...,...,...,...,...
1074,30298028,6160565.0,10.3389/fpsyt.2018.00449,Gray Matter Structural Alterations in Social A...,\nThe current insight into the neurobiologica...
1075,36617994,10028637.0,10.1002/hbm.26199,Responsiveness variability during anaesthesia ...,\nAnaesthesia combined with functional neuroi...
1076,32722197,7465457.0,10.3390/brainsci10080477,Pain Processing in Older Adults and Its Associ...,\nAging is known to affect nociceptive proces...
1077,31178707,6537025.0,10.3389/fnhum.2019.00154,Language Brain Representation in Bilinguals Wi...,\nLanguage representation in the bilingual br...


In [49]:
class PaperDescriptionCategories(BaseModel):
    networks_implicated: list[str] = Field(
        default_factory=list,
        description="Brain networks mentioned (single or multiple)")

    key_regions: list[str] = Field(
        default_factory=list,
        description="Brain regions or structures (single or multiple)")

    core_cognitive_functions: list[str] = Field(
        default_factory=list,
        description="Cognitive processes studied (single or multiple)")

    typical_tasks: list[str] = Field(
        default_factory=list,
        description="Experimental tasks or paradigms (single or multiple)")

    implicated_conditions: list[str] = Field(
        default_factory=list,
        description="Clinical conditions mentioned (single or multiple)")

    populations: list[str] = Field(
        default_factory=list,
        description="Study participant demographics (single or multiple)")


    def to_dataframe_row(self,
                         paper_id: Optional[str] = None,
                         title: Optional[str] = None,
                         join_lists: bool = True) -> dict:
        """
        Convert the extraction to a dictionary suitable for DataFrame row.

        Args:
            paper_id: Optional identifier for the paper
            title: Optional paper title
            join_lists: If True, join lists as comma-separated strings.
                       If False, keep as lists.

        Returns:
            Dictionary with all fields ready for DataFrame
        """
        row = {}

        # Add optional metadata
        if paper_id is not None:
            row['paper_id'] = paper_id
        if title is not None:
            row['title'] = title

        # Add extraction fields
        if join_lists:
            row['networks_implicated'] = ', '.join(self.networks_implicated) if self.networks_implicated else ''
            row['key_regions'] = ', '.join(self.key_regions) if self.key_regions else ''
            row['core_cognitive_functions'] = ', '.join(self.core_cognitive_functions) if self.core_cognitive_functions else ''
            row['typical_tasks'] = ', '.join(self.typical_tasks) if self.typical_tasks else ''
            row['implicated_conditions'] = ', '.join(self.implicated_conditions) if self.implicated_conditions else ''
            row['populations'] = ', '.join(self.populations) if self.populations else ''
        else:
            row['networks_implicated'] = self.networks_implicated
            row['key_regions'] = self.key_regions
            row['core_cognitive_functions'] = self.core_cognitive_functions
            row['typical_tasks'] = self.typical_tasks
            row['implicated_conditions'] = self.implicated_conditions
            row['populations'] = self.populations

        return row

In [72]:
SYSTEM_PROMPT = """
# Role
You are a specialized information extraction system for neuroimaging research papers. Your task is to extract specific neuroscientific information from paper titles and abstracts.

# Fields to Extract
Extract the following six fields: networks_implicated, key_regions, core_cognitive_functions, typical_tasks, implicated_conditions, populations

# Input Format
You will receive a paper's title and abstract. Extract information ONLY from the provided text—do not infer or add information not explicitly mentioned.

# Output Format
Return a valid JSON object with these exact keys. All fields must be lists (arrays), even if there's only one item or no items. Use lowercase for all values. If a field has no information in the text, return an empty list [].

# Field Definitions and Examples

**networks_implicated**: Large-scale brain networks mentioned in the paper
- Examples: "default mode network", "central executive network", "salience network", "dorsal attention network", "ventral attention network", "frontoparietal network", "somatomotor network", "visual network", "limbic network"
- Format: Always a list

**key_regions**: Specific brain regions, structures, or anatomical areas
- Examples: "prefrontal cortex", "hippocampus", "amygdala", "cerebellum", "brainstem", "anterior cingulate cortex", "insula", "striatum", "thalamus", "temporal lobe", "frontal lobe", "parietal lobe", "occipital lobe"
- Include both broad areas (lobes) and specific structures (nuclei, gyri)
- Format: Always a list

**core_cognitive_functions**: Mental processes or cognitive domains studied
- Examples: "attention", "working memory", "episodic memory", "semantic memory", "executive function", "language processing", "visual perception", "emotional regulation", "decision making", "cognitive control", "reward processing"
- Format: Always a list

**typical_tasks**: Experimental paradigms, cognitive tasks, or behavioral assessments used
- Examples: "n-back task", "stroop task", "go/no-go task", "emotional face matching", "verbal fluency", "resting state", "naturalistic viewing", "memory encoding", "fear conditioning", "memory task"
- Include both task names and task types
- Format: Always a list

**implicated_conditions**: Clinical conditions, disorders, or health states mentioned
- Examples: "alzheimer's disease", "major depressive disorder", "schizophrenia", "autism spectrum disorder", "parkinson's disease", "traumatic brain injury", "anxiety disorders", "substance use disorder"
- Include both diagnosed conditions and risk states
- Format: Always a list

**populations**: Demographic or clinical characteristics of study participants
- Examples: "healthy adults", "older adults", "children", "adolescents", "patients with depression", "treatment-naive patients", "chronic pain patients", "athletes"
- Include age groups, clinical status, or special populations
- Format: Always a list

# Formatting Rules
1. All text must be lowercase
2. Use standard neuroanatomical terminology
3. For abbreviations mentioned in text, use the full term if provided (e.g., "default mode network" not "dmn")
4. All fields must be lists, even if empty or containing only one item
5. Return only valid JSON—no markdown code blocks, no explanations

# Few-Shot Examples

## Example 1
**Input:**
Title: "Altered Default Mode Network Connectivity in Major Depressive Disorder"
Abstract: "We investigated resting-state functional connectivity in the default mode network (DMN) in patients with major depressive disorder (MDD) and healthy controls. Using fMRI, we found reduced connectivity between the posterior cingulate cortex and medial prefrontal cortex in MDD patients. These findings suggest disrupted DMN function may contribute to rumination and negative self-referential thinking in depression."

**Output:**
{
  "networks_implicated": ["default mode network"],
  "key_regions": ["posterior cingulate cortex", "medial prefrontal cortex"],
  "core_cognitive_functions": ["self-referential thinking"],
  "typical_tasks": ["resting state"],
  "implicated_conditions": ["major depressive disorder"],
  "populations": ["patients with major depressive disorder", "healthy controls"]
}

## Example 2
**Input:**
Title: "Cerebellar Contributions to Working Memory in Children"
Abstract: "This study examined the role of the cerebellum in working memory performance in typically developing children aged 8-12. Participants completed an n-back task during fMRI scanning. Results showed robust cerebellar activation, particularly in lobules VI and VII, during working memory maintenance and manipulation. Cerebellar activity correlated with task accuracy, suggesting the cerebellum supports executive aspects of working memory beyond its traditional motor functions."

**Output:**
{
  "networks_implicated": [],
  "key_regions": ["cerebellum"],
  "core_cognitive_functions": ["working memory", "executive function"],
  "typical_tasks": ["n-back task"],
  "implicated_conditions": [],
  "populations": ["children"]
}

## Example 3
**Input:**
Title: "Multimodal Imaging of Reward Processing Deficits in Substance Use Disorder"
Abstract: "We used combined fMRI and PET imaging to investigate reward processing abnormalities in individuals with cocaine use disorder compared to healthy adults. During a monetary reward task, patients showed blunted ventral striatum and anterior cingulate cortex activation. These alterations in the salience network and mesolimbic circuitry correlated with craving severity and treatment outcomes."

**Output:**
{
  "networks_implicated": ["salience network"],
  "key_regions": ["ventral striatum", "anterior cingulate cortex"],
  "core_cognitive_functions": ["reward processing"],
  "typical_tasks": ["monetary reward task"],
  "implicated_conditions": ["cocaine use disorder"],
  "populations": ["individuals with cocaine use disorder", "healthy adults"]
}

Now extract information from the provided paper.
"""

# Helper functions

In [65]:
def create_papers_dataframe(include_metadata: bool = True) -> pd.DataFrame:
    """
    Create an empty DataFrame with the correct columns.

    Args:
        include_metadata: If True, include paper_id and title columns

    Returns:
        Empty DataFrame with appropriate columns
    """
    columns = []

    if include_metadata:
        columns.extend(['paper_id', 'title'])

    columns.extend([
        'networks_implicated',
        'key_regions',
        'core_cognitive_functions',
        'typical_tasks',
        'implicated_conditions',
        'populations'
    ])

    return pd.DataFrame(columns=columns)


def extract_paper_info(title: str, abstract: str) -> PaperDescriptionCategories:
    """Extract information from a paper using LLM."""
    response = chat(
        model= 'qwen2.5:3b-instruct', #'qwen2.5:3b-instruct',
        messages=[
            {
                'role': 'system',
                'content': SYSTEM_PROMPT
            },
            {
                'role': 'user',
                'content': f"Title: {title}\n\nAbstract: {abstract}"
            }
        ],
        format=PaperDescriptionCategories.model_json_schema(),
        options={
            'temperature': 0.1,
        }
    )

    return PaperDescriptionCategories.model_validate_json(response.message.content)


def process_papers_to_dataframe(input_df: pd.DataFrame,
                                model: str = 'qwen2.5:7b',
                                join_lists: bool = True) -> pd.DataFrame:
    """
    Process papers from a DataFrame and create a new DataFrame with extracted information.

    Args:
        input_df: DataFrame with columns 'pmid', 'name' (title), 'description' (abstract)
        model: Ollama model to use
        join_lists: Whether to join lists as comma-separated strings

    Returns:
        DataFrame with all extracted information
    """
    # Create output dataframe
    output_df = create_papers_dataframe(include_metadata=True)

    # Iterate through input dataframe rows
    for idx, row in input_df.iterrows():
        # Extract the required fields
        pmid = row['pmid']
        title = row['name']
        abstract = row['description']

        print(f"Processing [{idx+1}/{len(input_df)}]: {title[:50]}...")

        try:
            # Extract information
            result = extract_paper_info(title, abstract)

            # Convert to row
            extracted_row = result.to_dataframe_row(
                paper_id=pmid,
                title=title,
                join_lists=join_lists
            )

            # Add to output dataframe
            output_df = pd.concat([output_df, pd.DataFrame([extracted_row])], ignore_index=True)

        except Exception as e:
            print(f"Error processing paper {pmid}: {e}")
            # Optionally add a row with NaNs or empty values
            continue

    return output_df

# Process one paper

In [61]:
# Single paper extraction and adding to DataFrame
df = create_papers_dataframe(include_metadata=True)

title = papers_df['name'].iloc[0]
abstract = papers_df['description'].iloc[0]

# Extract information
result = extract_paper_info(title, abstract)

# Add to dataframe
new_row = result.to_dataframe_row(
    paper_id="paper_001",
    title=title,
    join_lists=True  # Comma-separated strings
)
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)

In [62]:
df

Unnamed: 0,paper_id,title,networks_implicated,key_regions,core_cognitive_functions,typical_tasks,implicated_conditions,populations
0,paper_001,Acute aerobic exercise increases cortical acti...,"default mode network, frontoparietal network","right middle prefrontal gyrus, right lingual g...","working memory, executive control",N-back task,,female college students


In [63]:
result

PaperDescriptionCategories(networks_implicated=['default mode network', 'frontoparietal network'], key_regions=['right middle prefrontal gyrus', 'right lingual gyrus', 'left fusiform gyrus', 'anterior cingulate cortexes', 'left inferior frontal gyrus', 'right paracentral lobule'], core_cognitive_functions=['working memory', 'executive control'], typical_tasks=['N-back task'], implicated_conditions=[], populations=['female college students'])

In [55]:
abstract

'There is increasing evidence that acute aerobic exercise is associated with improved cognitive function. However, neural correlates of its cognitive plasticity remain largely unknown. The present study examined the effect of a session of acute aerobic exercise on working memory task-evoked brain activity as well as task performance. A within-subjects design with a counterbalanced order was employed. Fifteen young female participants (M = 19.56, SD = 0.81) were scanned using functional magnetic resonance imaging while performing a working memory task, the N-back task, both following an acute exercise session with 20 minutes of moderate intensity and a control rest session. Although an acute session of exercise did not improve behavioral performance, we observed that it had a significant impact on brain activity during the 2-back condition of the N-back task. Specifically, acute exercise induced increased brain activation in the right middle prefrontal gyrus, the right lingual gyrus, an

# Process multiple papers

In [66]:
# input DataFrame
input_papers_df = papers_df.iloc[0:15]

# Process all papers
results_df = process_papers_to_dataframe(
    input_df=input_papers_df,
    model='qwen2.5:7b-instruct',
    join_lists=True
)

# View results
results_df

# Save to CSV
results_df.to_csv('extracted_neuroimaging_data.csv', index=False)

Processing [1/15]: Acute aerobic exercise increases cortical activity...
Processing [2/15]: Developmental differences in the neural correlates...
Processing [3/15]: The neural substrate of arithmetic operations and ...
Processing [4/15]: Neural processing associated with comprehension of...
Processing [5/15]: Postmenopausal hormone use impact on emotion proce...
Processing [6/15]: Neural correlates of disbalanced motor control in ...
Processing [7/15]: Propofol disrupts functional interactions between ...
Processing [8/15]: Functionally distinct regions for spatial processi...
Processing [9/15]: Imitation components in the human brain: an fMRI s...
Processing [10/15]: Spatial language processing in the blind: evidence...
Processing [11/15]: Specifically progressive deficits of brain functio...
Processing [12/15]: The effect of negative and positive emotionality o...
Processing [13/15]: The synchronization of spontaneous BOLD activity p...
Processing [14/15]: Right hemispheric participa

In [79]:
results_df.iloc[5]

paper_id                                                             21930304
title                       Neural correlates of disbalanced motor control...
networks_implicated                    salience network, default mode network
key_regions                 left caudal cingulate zone (CCZ), right extern...
core_cognitive_functions                     motor control, reward processing
typical_tasks                                                                
implicated_conditions                               major depressive disorder
populations                                    control subjects, MDD patients
Name: 5, dtype: object

In [80]:
input_papers_df.iloc[5]['description']

'BACKGROUND: Motor retardation is a common symptom of major depressive disorder (MDD). Despite the existence of various assessment methods, little is known on the pathobiology of motor retardation. We aimed to elucidate aspects of motor control investigating the association of objective motor activity and resting state cerebral blood flow (CBF). METHODS: Nineteen control subjects and 20 MDD patients were investigated using arterial spin labeling (ASL) at 3T in the morning to quantify resting state CBF. Afterwards wrist actigraphy was recorded for 24h. CBF, group and activity level (AL) were entered into a whole brain general linear model. RESULTS: MDD patients had reduced AL. Both groups had linear associations of AL and CBF in bilateral rostral prefrontal cortex. Groups differed in four clusters associated with motor control. In controls a positive association was found in the left caudal cingulate zone (CCZ) and an inverse association in the right external globus pallidus (GPe). MDD 