In [1]:
import pandas as pd
import json
import os
import logging
from collections import Counter
from datetime import datetime

Load the weak annotation pool file into a DataFrame.

In [2]:
# Data directory, given relative to the directory the notebook is run from.
BASE_DIR = "../../data"
# JSON Lines file (one JSON record per line) holding the pool of weak annotations.
WEAK_ANNOTATION_POOL_FILE = os.path.join(BASE_DIR, "weak_annotation_pool.jsonl")


# Module-level logger used by the loader below. Without any logging
# configuration, WARNING and ERROR records still reach stderr through the
# standard library's last-resort handler; INFO records are dropped.
logger = logging.getLogger(__name__)


def load_weak_annotations_to_dataframe(file_path: str) -> pd.DataFrame:
    """
    Load weak annotations from a .jsonl file into a Pandas DataFrame.

    Each non-blank line of the file is parsed as one JSON record. If the
    records carry a nested 'weak_annotations' dict, it is flattened into
    top-level columns and the original column is dropped.

    Args:
        file_path: Path to the JSON Lines file.

    Returns:
        One row per successfully parsed line. An empty DataFrame is returned
        when the file does not exist or contains no parseable record.

    Notes:
        - Lines that are not valid JSON are skipped with a warning; blank
          lines are skipped silently.
        - Flattened columns are NOT prefixed with 'weak_annotations.'; nested
          dicts inside the annotation get dotted names such as
          'code_availability.status'.
        - Records whose 'weak_annotations' value is missing or not a dict get
          NaN in every flattened column instead of aborting the load.
    """
    if not os.path.exists(file_path):
        logger.error("File not found: %s", file_path)
        return pd.DataFrame()  # Return an empty DataFrame

    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) are not parse errors.
                continue
            try:
                data.append(json.loads(stripped))
            except json.JSONDecodeError:
                logger.warning(
                    "Could not parse line in %s at line %d: %s",
                    file_path, line_num, stripped,
                )

    if not data:
        logger.info("No data found in %s. Returning empty DataFrame.", file_path)
        return pd.DataFrame()

    # Load the main data
    df = pd.DataFrame(data)

    # Normalize the 'weak_annotations' column if it exists
    if 'weak_annotations' in df.columns:
        # json_normalize needs a dict per row. Rows where the annotation is
        # missing (NaN after DataFrame construction) or null are replaced by
        # an empty dict so they produce an all-NaN row rather than an error.
        annotations = [
            ann if isinstance(ann, dict) else {}
            for ann in df['weak_annotations']
        ]
        weak_anns_df = pd.json_normalize(annotations)

        # Both frames carry the same default 0..n-1 index, so a column-wise
        # concat lines the rows up. The nested column is dropped to avoid
        # redundancy.
        df = pd.concat([df.drop('weak_annotations', axis=1), weak_anns_df], axis=1)
    return df

# Load the annotation pool once; the cells below all read from df_annotations.
df_annotations = load_weak_annotations_to_dataframe(WEAK_ANNOTATION_POOL_FILE)

In [3]:
    # Print column names, non-null counts and dtypes of the loaded annotations.
    # NOTE(review): these lines are indented although they are not inside any
    # block; plain Python rejects that with an IndentationError, so the cell
    # presumably only runs because the notebook front end strips the leading
    # indent — consider dedenting.
    print("\n--- DataFrame Info ---")
    df_annotations.info()


--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3385 entries, 0 to 3384
Data columns (total 23 columns):
 #   Column                                       Non-Null Count  Dtype 
---  ------                                       --------------  ----- 
 0   doi                                          3385 non-null   object
 1   pmid                                         3381 non-null   object
 2   title                                        3385 non-null   object
 3   paper_type                                   3385 non-null   object
 4   relevance_score_llm                          3385 non-null   object
 5   biological_context                           3385 non-null   object
 6   data_modalities_used                         3385 non-null   object
 7   annotation_status                            3385 non-null   object
 8   llm_query_date                               3385 non-null   object
 9   llm_model                                    3385 non-null   

In [5]:
# Preview the first rows. head() is kept as the cell's last expression so the
# notebook displays its result below the printed banner.
print("\n--- First 5 rows ---")
df_annotations.head()


--- First 5 rows ---


Unnamed: 0,doi,pmid,title,paper_type,relevance_score_llm,biological_context,data_modalities_used,annotation_status,llm_query_date,llm_model,...,methodology_type,covered_analysis_steps,pipeline_package_name,main_goals,data_links_mentioned,code_availability.status,code_availability.link_in_abstract_or_title,is_spatial_omics,is_disease_specific_review,is_pipeline_paper
0,10.1371/journal.pcbi.1008887,33872301,MAUI (MBI Analysis User Interface)-An image pr...,method,highly_relevant,,"[MIBI, IMC, multiplexed imaging, spatial prote...",weak_llm_annotated,2025-06-26T01:48:31.159972,models/gemini-2.5-flash-preview-04-17,...,pipeline,"[Preprocessing, Denoising]",MAUI,"To present MAUI, a graphical user interface-ba...",[],Not mentioned,,,,
1,10.1016/j.cell.2020.07.005,32763154,Coordinated Cellular Neighborhoods Orchestrate...,application,relevant,Colorectal cancer,"[CODEX, multiplexed imaging, spatial proteomics]",weak_llm_annotated,2025-06-26T01:48:56.823109,models/gemini-2.5-flash-preview-04-17,...,,"[Cell Segmentation, Clustering, Spatial Domain...",,To re-engineer CODEX for FFPE tissue microarra...,[],Not mentioned,,,,
2,10.1038/s41467-024-48870-5,38879602,Deep cell phenotyping and spatial analysis of ...,method,highly_relevant,,"[multiplexed imaging, IMC, CODEX]",weak_llm_annotated,2025-06-26T01:49:13.426536,models/gemini-2.5-flash-preview-04-17,...,pipeline,"[Cell Segmentation, Cell-type annotation, Spat...",TRACERx-PHLEX,Develop a reproducible and user-friendly compu...,[],Not mentioned,,,,
3,10.1038/s41592-020-01018-x,33318659,Cellpose: a generalist algorithm for cellular ...,method,highly_relevant,Varied cell types and organisms from microscop...,[microscopy images],weak_llm_annotated,2025-06-26T01:49:30.946373,models/gemini-2.5-flash-preview-04-17,...,algorithm,[Cell Segmentation],Cellpose,"Introduce Cellpose, a generalist deep learning...",[],Not mentioned,,,,
4,10.1038/s42003-024-06480-3,38971915,A point cloud segmentation framework for image...,method,highly_relevant,,"[image-based spatial RNA profiling, spatial tr...",weak_llm_annotated,2025-06-26T01:49:49.090237,models/gemini-2.5-flash-preview-04-17,...,framework,[Cell Segmentation],ComSeg,"Introduces ComSeg, a point cloud segmentation ...",[],GitHub link provided,https://github.com/fish-quant/ComSeg,,,


In [7]:
# Section banner followed by the number of papers per LLM relevance label.
print("\n--- Basic Statistics (e.g., relevance and paper type counts) ---")
print("\nRelevance Score Counts:")
relevance_score_counts = df_annotations['relevance_score_llm'].value_counts()
print(relevance_score_counts)


--- Basic Statistics (e.g., relevance and paper type counts) ---

Relevance Score Counts:
relevance_score_llm
relevant           1695
highly_relevant     899
low_relevance       520
irrelevant          271
Name: count, dtype: int64


In [8]:
# Number of papers per paper type.
print("\nPaper Type Counts:")
paper_type_counts = df_annotations['paper_type'].value_counts()
print(paper_type_counts)


Paper Type Counts:
paper_type
application          1505
method               1266
review                472
other                  88
method_comparison      54
Name: count, dtype: int64


Papers are divided into several types: application, method, review (which can cover methods or applications), method comparison, and other.

Let's start with the application papers.

***Application papers***

In [11]:
# Select the rows whose paper type is 'application'. The copy makes
# application_papers_df an independent frame rather than a slice of df_annotations.
is_application = df_annotations['paper_type'].eq('application')
application_papers_df = df_annotations.loc[is_application].copy()

print("\n--- Statistics for Application Papers ---")
print(f"Total application papers found: {len(application_papers_df)}")

# Last expression of the cell, so the notebook displays the preview.
application_papers_df.head()


--- Statistics for Application Papers ---
Total application papers found: 1505


Unnamed: 0,doi,pmid,title,paper_type,relevance_score_llm,biological_context,data_modalities_used,annotation_status,llm_query_date,llm_model,...,methodology_type,covered_analysis_steps,pipeline_package_name,main_goals,data_links_mentioned,code_availability.status,code_availability.link_in_abstract_or_title,is_spatial_omics,is_disease_specific_review,is_pipeline_paper
1,10.1016/j.cell.2020.07.005,32763154,Coordinated Cellular Neighborhoods Orchestrate...,application,relevant,Colorectal cancer,"[CODEX, multiplexed imaging, spatial proteomics]",weak_llm_annotated,2025-06-26T01:48:56.823109,models/gemini-2.5-flash-preview-04-17,...,,"[Cell Segmentation, Clustering, Spatial Domain...",,To re-engineer CODEX for FFPE tissue microarra...,[],Not mentioned,,,,
49,10.1016/j.cell.2021.12.023,35063072,Transition to invasive breast cancer is associ...,application,relevant,Breast cancer progression (DCIS to IBC),"[MIBI, spatial proteomics, multiplexed imaging]",weak_llm_annotated,2025-06-26T02:06:30.329631,models/gemini-2.5-flash-preview-04-17,...,,"[Cell Segmentation, Clustering, Morphometrics]",,To understand the changes in the tumor microen...,[],Not mentioned,,,,
59,10.1126/scitranslmed.adq3852,40560997,Podocyte YAP and TAZ hyperactivation drives gl...,application,low_relevance,Human kidney diseases (collapsing glomerulopat...,"[spatial transcriptomics, immunostaining]",weak_llm_annotated,2025-06-26T02:08:32.323956,models/gemini-2.5-flash-preview-04-17,...,,[],,To identify common molecular drivers of glomer...,[],Not mentioned,,,,
60,10.1097/HEP.0000000000001432,40560681,Multimodal transcriptomics identifies metallot...,application,low_relevance,Primary sclerosing cholangitis and liver disease,"[spatial transcriptomics, snRNA-seq]",weak_llm_annotated,2025-06-26T02:08:42.172942,models/gemini-2.5-flash-preview-04-17,...,,[],,To localize potential disease pathways to spec...,[],Not mentioned,,,,
61,10.3390/proteomes13020017,40559990,Integrative Spatial Proteomics and Single-Cell...,application,low_relevance,Rheumatoid Arthritis (RA) synovium,"[spatial proteomics, scRNA-seq, mass spectrome...",weak_llm_annotated,2025-06-26T02:08:52.921783,models/gemini-2.5-flash-preview-04-17,...,,"[Integration with other omics, Deconvolution]",,To understand molecular heterogeneity in Rheum...,[],Not mentioned,,,,


In [15]:
print("\n--- Relevance Score Breakdown for Application Papers ---")

# Absolute counts per relevance label, and the same counts as percentages
# of all application papers.
app_relevance = application_papers_df['relevance_score_llm']
relevance_counts_app = app_relevance.value_counts()
relevance_percentages_app = app_relevance.value_counts(normalize=True) * 100

# Walk the (label, count) pairs and look up the matching percentage by label.
for score, count in relevance_counts_app.items():
    percentage = relevance_percentages_app[score]
    print(f"- {score}: {percentage:.2f}% ({count} papers)")



--- Relevance Score Breakdown for Application Papers ---
- relevant: 75.02% (1129 papers)
- low_relevance: 21.59% (325 papers)
- irrelevant: 3.39% (51 papers)


In [16]:
print("\n--- Data Modalities Used Counts for Application Papers ---")
# Each cell of 'data_modalities_used' holds a list of modality names, so the
# individual names are tallied with a Counter.
application_modalities_counter = Counter()

if 'data_modalities_used' not in application_papers_df.columns:
    print("'data_modalities_used' column not found in application papers DataFrame.")
else:
    # Missing values are dropped and anything that is not a list is ignored;
    # every name inside the remaining lists is counted once per occurrence.
    application_modalities_counter.update(
        modality
        for modalities_list in application_papers_df['data_modalities_used'].dropna()
        if isinstance(modalities_list, list)
        for modality in modalities_list
    )

    if not application_modalities_counter:
        print("No data modalities found or processed for application papers.")
    else:
        print("Counts of each data modality used in Application Papers:")
        # most_common() with no argument lists every modality, most frequent first
        for modality, count in application_modalities_counter.most_common():
            print(f"- {modality}: {count}")



--- Data Modalities Used Counts for Application Papers ---
Counts of each data modality used in Application Papers:
- spatial transcriptomics: 1087
- histology images: 295
- single-cell RNA sequencing: 211
- snRNA-seq: 117
- spatial proteomics: 98
- scRNA-seq: 97
- single-cell transcriptomics: 95
- sequencing: 81
- immunohistochemistry: 78
- spatial metabolomics: 51
- bulk RNA-seq: 45
- bulk RNA sequencing: 34
- immunofluorescence: 32
- proteomics: 32
- multiplexed imaging: 31
- Visium: 30
- single-cell sequencing: 30
- flow cytometry: 26
- bulk transcriptomics: 23
- single-cell RNA-sequencing: 23
- transcriptomics: 22
- single-cell analysis: 22
- single-cell RNA-seq: 18
- multiplex immunofluorescence: 18
- RNA sequencing: 17
- RNA-seq: 15
- HE-stained images: 15
- metabolomics: 15
- multiplex immunohistochemistry: 15
- mass spectrometry imaging: 15
- single-nucleus RNA sequencing: 13
- immunofluorescence staining: 12
- bulk RNA-sequencing: 12
- IHC: 11
- mass spectrometry: 10
- in vi

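ATTENTION! This list needs curation: several entries are not methods, and others are the same modality written in different ways (e.g., "scRNA-seq", "single-cell RNA-seq", "single-cell RNA sequencing" and "single-cell RNA-sequencing").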

In [17]:
print("\n--- Biological Context for Application Papers ---")

# Check if the 'biological_context' column exists
if 'biological_context' in application_papers_df.columns:
    # Build the set of "provided" contexts once and reuse it for both the
    # count and the examples:
    #   .dropna()      removes missing values (NaN / None)
    #   .astype(bool)  is False for empty strings, True for non-empty ones
    # Dropping missing values first matters because NaN is truthy: filtering
    # with astype(bool) alone would treat NaN as a provided context.
    provided_contexts = application_papers_df['biological_context'].dropna()
    provided_contexts = provided_contexts[provided_contexts.astype(bool)]

    total_papers = len(application_papers_df)
    num_with_context = len(provided_contexts)
    num_without_context = total_papers - num_with_context

    print(f"Total application papers: {total_papers}")
    print(f"Papers with 'biological_context' provided: {num_with_context}")
    print(f"Papers without 'biological_context' provided: {num_without_context}")
    if total_papers > 0:  # avoid dividing by zero on an empty frame
        print(f"Percentage with context: {num_with_context / total_papers:.2%}")
        print(f"Percentage without context: {num_without_context / total_papers:.2%}")

    # Display some example biological contexts (first few unique, non-empty ones)
    print("\nExample 'biological_context' entries (first 5 non-empty):")
    examples = provided_contexts.drop_duplicates().head(5)
    if not examples.empty:
        for i, context in enumerate(examples, 1):
            print(f"{i}. {context}")
    else:
        print("No non-empty 'biological_context' entries found in application papers.")

else:
    print("'biological_context' column not found in the DataFrame. Please check column names.")


--- Biological Context for Application Papers ---
Total application papers: 1505
Papers with 'biological_context' provided: 1505
Papers without 'biological_context' provided: 0
Percentage with context: 100.00%
Percentage without context: 0.00%

Example 'biological_context' entries (first 5 non-empty):
1. Colorectal cancer
2. Breast cancer progression (DCIS to IBC)
3. Human kidney diseases (collapsing glomerulopathy, crescentic glomerulonephritis) and mouse models of podocyte dysfunction
4. Primary sclerosing cholangitis and liver disease
5. Rheumatoid Arthritis (RA) synovium


In [18]:

# Frequency table of every distinct 'biological_context' value among the
# application papers, with one message per reason the table cannot be built.
if application_papers_df.empty:
    print("The application papers DataFrame is empty. Cannot generate biological context counts.")
elif 'biological_context' not in application_papers_df.columns:
    print("'biological_context' column not found in the application papers DataFrame. Please check column names.")
else:
    # Missing values are relabelled 'None Provided' so they show up as their
    # own category instead of being left out of the counts.
    biological_context_counts = (
        application_papers_df['biological_context']
        .fillna('None Provided')
        .value_counts()
    )

    if biological_context_counts.empty:
        print("No 'biological_context' entries found or processed for application papers (after filtering).")
    else:
        print("Counts of each unique 'biological_context' entry:")
        # to_string() prints every row instead of the truncated default repr
        print(biological_context_counts.to_string())
        print(f"\nTotal unique biological contexts (including 'None Provided'): {len(biological_context_counts)}")

Counts of each unique 'biological_context' entry:
biological_context
Alzheimer's disease                                                                                                                                                                                                                                                                 29
Hepatocellular carcinoma (HCC)                                                                                                                                                                                                                                                      29
Colorectal cancer                                                                                                                                                                                                                                                                   26
Breast cancer                                                                                 

In [9]:
print("\n--- Methodology Type Counts for 'method' papers ---")
# Restrict to papers labelled 'method' and count their methodology types.
method_papers = df_annotations[df_annotations['paper_type'] == 'method']
if not method_papers.empty:
    print(method_papers['methodology_type'].value_counts())
else:
    print("No 'method' papers found.")

# Example: Count occurrences of each data modality across ALL papers
print("\n--- Data Modalities Counts ---")
# The column holds lists, so individual modality names are tallied with a Counter.
all_modalities = Counter()
for modalities_list in df_annotations['data_modalities_used'].dropna():
    # Only count real lists. Counter.update() iterates its argument, so a bare
    # string entry would be tallied character by character. This is the same
    # guard the application-paper modality count uses.
    if isinstance(modalities_list, list):
        all_modalities.update(modalities_list)
print(all_modalities.most_common(10)) # Top 10 most common modalities


--- Methodology Type Counts for 'method' papers ---
methodology_type
algorithm            526
pipeline             237
framework            193
tool                 193
workflow              62
other_methodology     43
N/A                   10
dataset                1
method                 1
Name: count, dtype: int64

--- Data Modalities Counts ---
[('spatial transcriptomics', 1927), ('histology images', 548), ('single-cell RNA sequencing', 296), ('spatial proteomics', 242), ('scRNA-seq', 173), ('single-cell transcriptomics', 156), ('snRNA-seq', 153), ('sequencing', 147), ('multiplexed imaging', 124), ('immunohistochemistry', 95)]
