# Candida auris RNA-seq Literature Survey Reproduction

This notebook reproduces the methodology for creating a comprehensive literature survey of RNA-seq studies on *Candida auris* published since 2020.

**Date**: December 2, 2025  
**Target**: 14 papers from PubMed and Europe PMC searches  
**Scope**: Excludes 2 papers found through repository analysis

## Setup and Dependencies

In [None]:
# Install required packages
# !pip install biopython pandas requests beautifulsoup4 lxml matplotlib seaborn

In [None]:
import pandas as pd
from Bio import Entrez
import requests
from bs4 import BeautifulSoup
import time
import json
from typing import List, Dict
import re

# Set your email for NCBI Entrez (required)
Entrez.email = "your.email@example.com"  # CHANGE THIS!

## Phase 1: PubMed Search

Search PubMed for Candida auris RNA-seq papers since 2020.

In [None]:
def search_pubmed(query: str, min_year: int = 2020, retmax: int = 100) -> List[str]:
    """
    Search PubMed and return list of PMIDs.

    Args:
        query: Search query string
        min_year: Minimum publication year (inclusive lower bound of the PDAT range)
        retmax: Maximum number of PMIDs to request from ESearch

    Returns:
        List of PubMed IDs as plain strings, in relevance order. On any failure
        the error is printed and an empty list is returned.
    """
    # 3000 serves as an open-ended upper bound for the publication-date range.
    search_query = f"({query}) AND ({min_year}[PDAT] : 3000[PDAT])"

    try:
        handle = Entrez.esearch(
            db="pubmed",
            term=search_query,
            retmax=retmax,
            sort="relevance",
        )
        try:
            record = Entrez.read(handle)
        finally:
            # Release the connection even if the response cannot be parsed.
            handle.close()

        pmids = [str(pmid) for pmid in record["IdList"]]

        # ESearch reports the total hit count separately from the returned IDs;
        # surface truncation instead of silently dropping papers.
        total_hits = int(record.get("Count", len(pmids)))
        if total_hits > len(pmids):
            print(f"  Note: {total_hits} hits, only the first {len(pmids)} returned (retmax={retmax})")

        return pmids
    except Exception as e:
        # Best-effort by design: one failing query must not abort the survey.
        print(f"Error searching PubMed: {e}")
        return []

# Query strings from the original analysis, run one after another against PubMed
search_queries = [
    "Candida auris RNA-seq",
    "Candida auris transcriptome",
    "Candida auris differential expression",
    "Candidozyma auris RNA sequencing",
    "Candida auris biofilm RNA-seq",
]

# A set, so a PMID returned by several queries is kept only once
all_pmids = set()

for query in search_queries:
    print(f"Searching: {query}")
    pmids = search_pubmed(query)
    print(f"  Found: {len(pmids)} papers")
    all_pmids |= set(pmids)
    # Pause between requests to stay polite to the NCBI servers
    time.sleep(0.5)

print(f"\nTotal unique PMIDs: {len(all_pmids)}")

## Phase 2: Fetch Paper Details from PubMed

In [None]:
def _medline_field(record: str, tag: str) -> str:
    """Return the first occurrence of a MEDLINE field with wrapped lines re-joined."""
    # Tags are left-padded to four characters and followed by "- "; continuation
    # lines of the same field start with six spaces.
    match = re.search(rf'^{tag:<4}- (.*(?:\n {{6}}.*)*)', record, re.MULTILINE)
    if match is None:
        return ''
    return re.sub(r'\n {6}', ' ', match.group(1)).strip()


def _parse_medline_record(pmid: str, record: str) -> Dict:
    """Build the paper-info dictionary from one MEDLINE-formatted text record."""
    info = {
        'pmid': pmid,
        'title': _medline_field(record, 'TI'),
        'abstract': _medline_field(record, 'AB'),
        'year': '',
        'journal': _medline_field(record, 'JT'),
        'authors': '',
    }

    # DP holds the publication date; only the leading four-digit year is kept.
    year_match = re.search(r'^DP  - (\d{4})', record, re.MULTILINE)
    if year_match:
        info['year'] = year_match.group(1)

    # One AU line per author; anchoring at line start keeps "FAU - " lines out.
    info['authors'] = ', '.join(
        name.strip() for name in re.findall(r'^AU  - (.+)$', record, re.MULTILINE)
    )

    return info


def fetch_paper_details(pmid: str) -> Dict:
    """
    Fetch paper details from PubMed.

    Args:
        pmid: PubMed ID

    Returns:
        Dictionary with keys 'pmid', 'title', 'abstract', 'year', 'journal'
        and 'authors' (all strings; empty when the field is absent). If the
        request fails, the same keys are returned with empty values plus an
        'error' key holding the exception text, so downstream DataFrame
        columns never contain NaN for a failed record.
    """
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        try:
            record = handle.read()
        finally:
            # Close the connection even when reading the response fails.
            handle.close()

        # Some handle types yield bytes; the parsers below expect text.
        if isinstance(record, bytes):
            record = record.decode('utf-8', errors='replace')

        return _parse_medline_record(pmid, record)

    except Exception as e:
        # Best-effort by design: report the failure and keep the schema intact.
        print(f"Error fetching PMID {pmid}: {e}")
        failed = _parse_medline_record(pmid, '')
        failed['error'] = str(e)
        return failed

# Retrieve one record per PMID, in sorted order, pausing between requests
papers = []
for pmid in sorted(all_pmids):
    print(f"Fetching PMID: {pmid}")
    papers.append(fetch_paper_details(pmid))
    # Pause between requests to stay polite to the NCBI servers
    time.sleep(0.5)

# One row per fetched record
df_papers = pd.DataFrame(papers)
print(f"\nFetched {len(df_papers)} papers")
df_papers.head()

## Phase 3: Filter for RNA-seq Papers

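Keep only papers whose title or abstract mentions an RNA-seq-related keyword. This is a coarse first pass: whether RNA-seq was actually performed (rather than merely cited) is verified manually during full-text review.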

In [None]:
def is_rnaseq_paper(title: str, abstract: str) -> bool:
    """
    Check if paper likely involves RNA-seq based on keywords.

    Args:
        title: Paper title. Non-string values (None, NaN from a DataFrame)
            are treated as empty text.
        abstract: Paper abstract, handled the same way.

    Returns:
        True if any RNA-seq keyword occurs as a case-insensitive substring of
        the title or abstract, otherwise False.
    """
    # Missing values reach this function as None or float NaN; concatenating
    # them would raise TypeError, so only real strings are kept.
    parts = [part for part in (title, abstract) if isinstance(part, str)]
    text = ' '.join(parts).lower()

    # Normalise Unicode hyphens/dashes so "RNA‑seq" matches the ASCII keyword.
    for dash in ('\u2010', '\u2011', '\u2012', '\u2013', '\u2014'):
        text = text.replace(dash, '-')

    # RNA-seq keywords
    rnaseq_keywords = (
        'rna-seq', 'rna seq', 'rnaseq',
        'transcriptome', 'transcriptomic',
        'differential expression',
        'gene expression profiling',
        'rna sequencing',
    )

    return any(keyword in text for keyword in rnaseq_keywords)

# Flag every record with the keyword check, then keep only the flagged rows
def _mentions_rnaseq(row) -> bool:
    """Run the keyword check on one DataFrame row."""
    return is_rnaseq_paper(row.get('title', ''), row.get('abstract', ''))


df_papers['is_rnaseq'] = df_papers.apply(_mentions_rnaseq, axis=1)

rnaseq_mask = df_papers['is_rnaseq']
df_rnaseq = df_papers.loc[rnaseq_mask].copy()
print(f"RNA-seq papers: {len(df_rnaseq)}")

# Show the retained papers ordered by publication year
df_rnaseq.sort_values('year')[['pmid', 'year', 'title']]

## Phase 4: Europe PMC Search

Search Europe PMC for additional papers.

In [None]:
def search_europe_pmc(query: str, page_size: int = 100, timeout: float = 30.0) -> List[Dict]:
    """
    Search Europe PMC using their REST API.

    Only the first page of results is retrieved; a note is printed when the
    service reports more hits than were returned.

    Args:
        query: Search query
        page_size: Number of results per page
        timeout: Seconds to wait for the HTTP request before giving up

    Returns:
        List of paper dictionaries with keys 'pmid', 'title', 'abstract',
        'year', 'journal' and 'source'. On any failure the error is printed
        and an empty list is returned.
    """
    base_url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"

    params = {
        'query': query,
        'format': 'json',
        # The default "lite" result type carries no abstractText; "core" does.
        'resultType': 'core',
        'pageSize': page_size,
        'cursorMark': '*',
    }

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        results = []
        for result in data.get('resultList', {}).get('result', []):
            # "lite" results expose journalTitle directly; "core" results nest
            # the title under journalInfo.journal — accept either shape.
            journal_info = (result.get('journalInfo') or {}).get('journal') or {}
            journal = result.get('journalTitle') or journal_info.get('title', '')

            results.append({
                'pmid': result.get('pmid', 'N/A'),
                'title': result.get('title', ''),
                'abstract': result.get('abstractText', ''),
                'year': result.get('pubYear', ''),
                'journal': journal,
                'source': 'Europe PMC'
            })

        # Surface truncation instead of silently dropping later pages.
        hit_count = data.get('hitCount')
        if isinstance(hit_count, int) and hit_count > len(results):
            print(f"  Note: {hit_count} hits, only the first {len(results)} returned")

        return results

    except Exception as e:
        # Best-effort by design: one failing query must not abort the survey.
        print(f"Error searching Europe PMC: {e}")
        return []

# Europe PMC searches
# NOTE(review): unlike the first two queries, the third one is unquoted and has
# no PUB_YEAR restriction — confirm whether that is intentional.
epmc_queries = [
    '"Candida auris" AND "RNA-seq" AND PUB_YEAR:[2020 TO 2025]',
    '"Candida auris" AND transcriptome AND PUB_YEAR:[2020 TO 2025]',
    'Candidozyma auris AND RNA sequencing',
]

epmc_papers = []
for query in epmc_queries:
    print(f"Searching Europe PMC: {query}")
    results = search_europe_pmc(query)
    epmc_papers.extend(results)
    print(f"  Found: {len(results)} papers")
    time.sleep(1)  # Pause between requests to the Europe PMC service

# Create DataFrame and remove duplicates
df_epmc = pd.DataFrame(epmc_papers)
if not df_epmc.empty:
    # Records without a PMID carry the 'N/A' placeholder and cannot be
    # matched against the PubMed results, so they are dropped here.
    df_epmc = df_epmc[df_epmc['pmid'] != 'N/A'].drop_duplicates('pmid')
    print(f"\nEurope PMC unique papers: {len(df_epmc)}")
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the preview has to be printed explicitly.
    print(df_epmc.head())
else:
    print("No results from Europe PMC")

## Phase 5: Combine Results and Identify New Papers

In [None]:
# PMIDs retained by the PubMed keyword filter, as strings for comparison
pubmed_pmids = set(df_rnaseq['pmid'].astype(str))

# Every PubMed-derived row gets the same provenance label
df_rnaseq['source'] = 'PubMed'

if df_epmc.empty:
    # Nothing to merge: the combined table is just the PubMed set
    df_combined = df_rnaseq.copy()
else:
    # Compare IDs as strings on both sides
    df_epmc['pmid'] = df_epmc['pmid'].astype(str)
    already_in_pubmed = df_epmc['pmid'].isin(pubmed_pmids)
    epmc_only = df_epmc.loc[~already_in_pubmed].copy()
    print(f"Papers found only in Europe PMC: {len(epmc_only)}")

    epmc_only['source'] = 'Europe PMC'

    # Stack PubMed rows first, then the Europe PMC additions
    df_combined = pd.concat([df_rnaseq, epmc_only], ignore_index=True)

print(f"\nTotal papers for detailed analysis: {len(df_combined)}")
df_combined[['pmid', 'year', 'source', 'title']].sort_values('year')

## Phase 6: Manual Data Extraction

**NOTE**: The following information requires manual extraction from full-text articles:
1. Genome version used
2. Specific RNA-seq analysis type
3. Bioinformatics tools used

This step was performed using:
- PubMed Central full-text access
- Publisher websites (when open access)
- AI-assisted extraction from methods sections

### Known Papers from Survey (14 papers excluding repository analysis):

In [None]:
# Create the final dataset with manually extracted information
# This matches the 14 papers from PubMed + Europe PMC searches
#
# Each record is a flat dict with these keys:
#   pmid           - identifier as a string; two records (PMC11385638,
#                    PMC11459930) hold a PMC ID instead of a PubMed ID
#   source         - 'PubMed' or 'Europe PMC'
#   genome_version - free-text reference genome description; 'N/A' when not given
#   rnaseq_type    - free-text description of the RNA-seq analysis
#   tools          - comma-separated free-text list of tools
#   year           - publication year as an int
# The values are hard-coded here; they are not derived from the DataFrames
# built by the earlier search cells.

survey_data = [
    {
        'pmid': '32581078',
        'source': 'PubMed',
        'genome_version': 'N/A',
        'rnaseq_type': 'Differential expression analysis of biofilm vs. planktonic cells',
        'tools': 'N/A (not specified in abstract)',
        'year': 2020
    },
    {
        'pmid': '33937102',
        'source': 'PubMed',
        'genome_version': 'B11221 (GCF_002775015.1)',
        'rnaseq_type': 'Differential expression analysis comparing clinical isolates',
        'tools': 'FastQC, cutadapt, NextGenMap, Picard, HTseq, edgeR, clusterProfiler',
        'year': 2021
    },
    {
        'pmid': '34354695',
        'source': 'PubMed',
        'genome_version': 'N/A (specific reference genome)',
        'rnaseq_type': 'Differential expression analysis: drug-sensitive vs. resistant strains',
        'tools': 'HISAT2, Cufflinks, HTSeq-count, DEseq, Trimmomatic, pheatmap, STRING database',
        'year': 2021
    },
    {
        'pmid': '34485470',
        'source': 'PubMed',
        'genome_version': 'GCA_002759435 (Ensembl Fungi)',
        'rnaseq_type': 'Differential expression analysis: farnesol response',
        'tools': 'DESeq (StrandNGS software), Agilent BioAnalyzer, NEBNext Ultra II kit',
        'year': 2021
    },
    {
        'pmid': '34630944',
        'source': 'Europe PMC',
        'genome_version': 'B8441 (GCA_002759435.2 V2)',
        'rnaseq_type': 'Differential expression and translational profiling: caspofungin response',
        'tools': 'CLC Genomics Workbench v20, TMM normalization, EdgeR, DAVID v6.8, BLASTp, qRT-PCR',
        'year': 2021
    },
    {
        'pmid': '34788438',
        'source': 'PubMed',
        'genome_version': 'B8441 (GCA_002759435.2 V2)',
        'rnaseq_type': 'Small RNA sequencing of cellular and extracellular vesicles',
        'tools': 'CLC Genomics Workbench v20, TMM normalization, TruSeq small RNA kit, qRT-PCR',
        'year': 2021
    },
    {
        'pmid': '35652307',
        'source': 'PubMed',
        'genome_version': 'B8441 (Candida Genome Database)',
        'rnaseq_type': 'Comparative transcriptomics: AmB-resistant vs. sensitive isolates',
        'tools': 'HISAT2, HTSeq, DESeq2, Orange3, BioVenn, Fungifun2, Gene Ontology Term Finder',
        'year': 2022
    },
    {
        'pmid': '35968956',
        'source': 'Europe PMC',
        'genome_version': 'B8441 (s01-m01-r10)',
        'rnaseq_type': 'Comparative transcriptomics: echinocandin-resistant vs. susceptible isolates',
        'tools': 'FastQC, cutadapt, NextGenMap, Picard, HTseq, edgeR, clusterProfiler, VennDiagram',
        'year': 2022
    },
    {
        'pmid': '36913408',
        'source': 'Europe PMC',
        'genome_version': 'GCA_002759435.2',
        'rnaseq_type': 'Differential expression analysis: aggregative vs. nonaggregative strains (biofilm)',
        'tools': 'HiSat2 v2.0.5, Stringtie v1.3.3b, DESeq2, Illumina NovaSeq 6000',
        'year': 2023
    },
    {
        'pmid': '37350781',
        'source': 'PubMed',
        'genome_version': 'B11221',
        'rnaseq_type': 'Transcriptomic profiling: rough vs. smooth morphotypes',
        'tools': 'Bowtie2, HISAT2, HTSeq, DESeq, topGO, KOBAS, Pheatmap',
        'year': 2023
    },
    {
        'pmid': '38990436',
        'source': 'PubMed',
        'genome_version': 'N/A',
        'rnaseq_type': 'Comparative transcriptomics: host dermal cells infected with C. auris',
        'tools': 'qRT-PCR, flow cytometry, KEGG, Reactome analyses',
        'year': 2024
    },
    {
        'pmid': 'PMC11385638',
        'source': 'PubMed',
        'genome_version': 'B11221',
        'rnaseq_type': 'Differential expression analysis: reduced AmB sensitivity',
        'tools': 'DESeq2, KEGG, Gene Ontology, STRING database, qPCR, Illumina NovaSeq',
        'year': 2024
    },
    {
        'pmid': 'PMC11459930',
        'source': 'Europe PMC',
        'genome_version': 'B8441 (GCA_002759435.2)',
        'rnaseq_type': 'Whole transcriptome sequencing: pan-drug resistant strains',
        'tools': 'HISAT2 v2.2.1, StringTie v1.3.3b, Ballgown v3.15, BiNGO, HMMER v3.3.2, CLC Genomics Server v23',
        'year': 2024
    },
    {
        'pmid': '40099908',
        'source': 'Europe PMC',
        'genome_version': 'B8441 (reference allele)',
        'rnaseq_type': 'Gene expression profiling and SNP identification: flucytosine resistance',
        'tools': 'STAR (two-pass workflow), drc R package, IGV viewer, enrichGO (clusterProfiler), Sanger sequencing',
        'year': 2025
    },
]

# Load the curated records into a DataFrame and print quick sanity summaries:
# the row count, the number of papers per source, and papers per year in
# ascending year order.
df_final = pd.DataFrame(survey_data)
print(f"Final survey: {len(df_final)} papers")
print(f"\nBreakdown by source:")
print(df_final['source'].value_counts())
print(f"\nYear distribution:")
print(df_final['year'].value_counts().sort_index())

## Phase 7: Create Final Table

In [None]:
# Presentation copy of the survey: selected columns, in order, with readable headers
column_labels = {
    'pmid': 'PubMed ID',
    'source': 'Data Source',
    'genome_version': 'Genome Version',
    'rnaseq_type': 'Type of RNA-seq',
    'tools': 'Tools Used',
}
df_display = df_final[list(column_labels)].copy()
df_display.columns = list(column_labels.values())

# Heading and row count, then the table itself as the cell output
print("\n=== RNA-seq Literature Survey: Candida auris (2020-2025) ===")
print(f"Total papers: {len(df_display)}\n")

df_display

## Phase 8: Export Results

In [None]:
# Write the full survey table (all columns, no index column) to a CSV file
output_file = 'cauris_rnaseq_survey.csv'
df_final.to_csv(path_or_buf=output_file, index=False)
print(f"Results exported to: {output_file}")

# Export to Markdown table
def create_markdown_table(df):
    """
    Create a markdown table from DataFrame.

    Args:
        df: DataFrame with the columns 'pmid', 'source', 'genome_version',
            'rnaseq_type' and 'tools'.

    Returns:
        The table as a single string: header row, separator row, then one row
        per DataFrame row, each terminated by a newline. The first column is a
        link to PMC for identifiers starting with 'PMC' and to PubMed otherwise.
    """
    def _cell(value):
        # A literal pipe would end the cell early and a newline would end the
        # row, so both are neutralised before the value is placed in the table.
        return str(value).replace('|', '\\|').replace('\n', ' ')

    lines = [
        "| PubMed ID | Data Source | Genome Version | Type of RNA-seq | Tools Used |",
        "|-----------|-------------|----------------|-----------------|------------|",
    ]

    for _, row in df.iterrows():
        # Coerce to str so identifiers stored as numbers still work.
        pmid = str(row['pmid'])
        if pmid.startswith('PMC'):
            pmid_link = f"[{pmid}](https://pmc.ncbi.nlm.nih.gov/articles/{pmid}/)"
        else:
            pmid_link = f"[{pmid}](https://pubmed.ncbi.nlm.nih.gov/{pmid}/)"

        lines.append(
            f"| {pmid_link} | {_cell(row['source'])} | {_cell(row['genome_version'])} "
            f"| {_cell(row['rnaseq_type'])} | {_cell(row['tools'])} |"
        )

    # Join once instead of growing a string inside the loop.
    return "\n".join(lines) + "\n"

markdown_table = create_markdown_table(df_final)
# Write as UTF-8 explicitly so the file does not depend on the platform's
# default encoding.
with open('cauris_rnaseq_survey_table.md', 'w', encoding='utf-8') as f:
    f.write("# Candida auris RNA-seq Literature Survey (2020-2025)\n\n")
    f.write(markdown_table)

print("Markdown table exported to: cauris_rnaseq_survey_table.md")

## Summary Statistics

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid theme for the figure
sns.set_style("whitegrid")

# Tallies that feed both the plots and the printed summary
year_counts = df_final['year'].value_counts().sort_index()
source_counts = df_final['source'].value_counts()

# One figure with two side-by-side panels
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
year_ax, source_ax = axes

# Left panel: number of papers per publication year
year_ax.bar(year_counts.index, year_counts.values, color='steelblue')
year_ax.set_xlabel('Year', fontsize=12)
year_ax.set_ylabel('Number of Papers', fontsize=12)
year_ax.set_title('Papers by Publication Year', fontsize=14, fontweight='bold')
year_ax.grid(axis='y', alpha=0.3)

# Right panel: share of papers per data source
colors = ['steelblue', 'coral']
source_ax.pie(
    source_counts.values,
    labels=source_counts.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
source_ax.set_title('Papers by Data Source', fontsize=14, fontweight='bold')

# Save the figure to disk, then show it
plt.tight_layout()
plt.savefig('survey_statistics.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n=== Survey Statistics ===")
print(f"Total papers: {len(df_final)}")
print(f"\nPapers by source:")
print(source_counts)
print(f"\nPapers by year:")
print(year_counts)

# Tally the first strain name (B + digits) or GCA accession found in each
# genome description, skipping rows whose description is exactly 'N/A'.
# NOTE(review): some entries pair B8441 with GCA_002759435, so one genome may
# be counted under two labels here — confirm whether they should be merged.
stated_genomes = df_final.loc[df_final['genome_version'] != 'N/A', 'genome_version']
genome_counts = stated_genomes.str.extract(r'(B\d+|GCA_\d+)')[0].value_counts()
print(f"\nMost common genome versions:")
print(genome_counts)

## Methodology Notes

### Automated Steps:
1. PubMed search via Entrez API
2. Europe PMC search via REST API
3. Initial filtering based on keywords
4. Deduplication

### Manual Steps:
1. **Full-text review**: Accessed PMC and publisher websites for methods sections
2. **Data extraction**: 
   - Genome version from methods
   - Specific RNA-seq workflow details
   - Complete tool lists
3. **Quality assessment**: Verified RNA-seq was actually performed (not just cited)
4. **Cross-referencing**: Checked against BioProject/SRA for validation

### Limitations:
- Some papers lack publicly accessible full text
- Genome versions not always clearly stated
- Tool details may be incomplete in methods sections
- Manual extraction subject to interpretation

### Reproducibility:
- Automated searches are reproducible given the same queries and date filters
- Search results may still change over time as new publications are indexed
- Manual extraction requires domain expertise
- Cross-validation recommended for comprehensive surveys