In [None]:
import re
import os
import shutil
import pandas as pd
import matplotlib.pyplot as plt
from alphagenome.data import genome
from alphagenome.models import dna_client
from alphagenome.visualization import plot_components
import time
import requests

# --- 0. Input locations and API credentials ---
# Source files are expected in the data directory one level above the working directory.
data_dir = '../data'
excel_file, db_file = (
    os.path.join(data_dir, basename)
    for basename in ('Supplementary Table 3.xlsx', 'longevitymap.sqlite')
)

# Credentials are read from the environment; each is None when its variable is unset.
NCBI_API_KEY = os.getenv('NCBI_API_KEY')
ALPHA_GENOME_API_KEY = os.getenv('ALPHA_GENOME_API_KEY')


# --- Main Analysis Function ---
def run_analysis():
    # --- 1. Regenerate Data Files ---
    print("\nRegenerating data files from the source Excel file...")
    try:
        xls = pd.ExcelFile(excel_file)
        sheet_to_csv_map = {
            'Cluster 1 AD - upregulated': 'Supplementary Table 3.xlsx - Cluster 1 AD - upregulated.csv',
            'Cluster 1 AD - downregulated': 'Supplementary Table 3.xlsx - Cluster 1 AD - downregulated.csv',
            'Cluster 2 PD - upregulated': 'Supplementary Table 3.xlsx - Cluster 2 PD - upregulated.csv',
            'Cluster 2 PD - downregulated': 'Supplementary Table 3.xlsx - Cluster 2 PD - downregulated.csv'
        }
        for sheet_name, csv_filename in sheet_to_csv_map.items():
            df = pd.read_excel(xls, sheet_name=sheet_name)
            df.to_csv(csv_filename, index=False)
        print("Data files regenerated successfully.")
    except FileNotFoundError:
        print(f"ERROR: '{excel_file}' not found. Please make sure the file is in the data/ directory.")
        return # Stop execution

    # --- 2. Fetch Detailed Gene Info from NCBI ---
    print("\nFetching detailed gene information from NCBI...")
    FILENAMES = list(sheet_to_csv_map.values())
    OUTPUT_FILENAME = 'Comprehensive_Gene_Analysis.csv'
    # Safely get the NCBI API key from environment variables
    if not NCBI_API_KEY:
        print("WARNING: 'NCBI_API_KEY' not found in environment variables. Proceeding without API key (may be slower).")

    def get_gene_info(entrez_id):
        """Fetch summary data for one gene from the NCBI E-utilities esummary endpoint.

        Args:
            entrez_id: Entrez Gene identifier to look up in the ``gene`` database.

        Returns:
            A 6-tuple ``(gene_symbol, annotations, summary, chromosome, start, end)``:

            * ``gene_symbol`` is the record's ``name`` field, or ``'N/A'``.
            * ``annotations`` is the string ``"Location: <maplocation>"``.
            * ``summary`` is the record's ``summary`` field, or
              ``'No summary available.'`` when missing or empty.
            * ``chromosome``, ``start``, ``end`` are ``chraccver``, ``chrstart``
              and ``chrstop`` taken from the first ``genomicinfo`` entry, falling
              back to the first ``locationhist`` entry; ``'N/A'`` if neither exists.

            If the request or the parsing fails for any reason, every element of
            the tuple is the string ``'Error'`` so the caller can keep going.
        """
        # Let requests build and URL-encode the query string instead of
        # concatenating it by hand.
        query = {'db': 'gene', 'id': entrez_id, 'retmode': 'json'}
        if NCBI_API_KEY:
            query['api_key'] = NCBI_API_KEY
        try:
            # Without a timeout a stalled connection would block the whole run.
            response = requests.get(
                "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi",
                params=query,
                timeout=30,
            )
            response.raise_for_status()
            result = response.json()['result'][str(entrez_id)]

            gene_symbol = result.get('name', 'N/A')
            summary = result.get('summary') or "No summary available."
            location = result.get('maplocation', 'N/A')

            # Prefer the current genomic placement; fall back to the location history.
            placements = result.get('genomicinfo') or result.get('locationhist')
            if placements:
                placement = placements[0]
                chromosome = placement.get('chraccver', 'N/A')
                start = placement.get('chrstart', 'N/A')
                end = placement.get('chrstop', 'N/A')
            else:
                chromosome = start = end = 'N/A'
            return gene_symbol, f"Location: {location}", summary, chromosome, start, end
        except Exception as exc:
            # Deliberate best-effort: one failed lookup must not abort the batch,
            # but report why it failed so 'Error' rows can be diagnosed.
            print(f"WARNING: could not fetch gene info for Entrez ID {entrez_id}: {exc!r}")
            return ('Error',) * 6

    all_results = []
    for filename in FILENAMES:
        if os.path.exists(filename):
            df = pd.read_csv(filename)
            list_type = " ".join(filename.split(' - ')[1:3]).replace('.csv', '')
            for index, row in df.iterrows():
                entrez_id = int(row['ENTREZ'])
                symbol, annotations, association, chromosome, start, end = get_gene_info(entrez_id)
                all_results.append({
                    'List': list_type, 'Gene Symbol': symbol, 'Entrez ID': entrez_id,
                    'Genomic Annotations': annotations, 'Disease Association Studies': association,
                    'Chromosome': chromosome, 'Start': start, 'End': end
                })
                time.sleep(0.3) # Rate limit requests
    final_df = pd.DataFrame(all_results)
    final_df.to_csv(OUTPUT_FILENAME, index=False)
    print("Gene information fetched and saved.")

    # --- 3. AlphaGenome Analysis ---
    print("\nStarting AlphaGenome analysis...")
    gene_analysis_df = pd.read_csv(OUTPUT_FILENAME)
    # Safely get the API key from environment variables
    if not ALPHA_GENOME_API_KEY:
        print("ERROR: 'ALPHA_GENOME_API_KEY' not found in environment variables. Please set it.")
        return # Stop execution

    model = dna_client.create(ALPHA_GENOME_API_KEY)
    chromosome_mapping = {
        'NC_000001.11': 'chr1', 'NC_000002.12': 'chr2', 'NC_000003.12': 'chr3',
        'NC_000004.12': 'chr4', 'NC_000005.10': 'chr5', 'NC_000006.12': 'chr6',
        'NC_000007.14': 'chr7', 'NC_000008.11': 'chr8', 'NC_000009.12': 'chr9',
        'NC_000010.11': 'chr10', 'NC_000011.10': 'chr11', 'NC_000012.12': 'chr12',
        'NC_000013.11': 'chr13', 'NC_000014.9': 'chr14', 'NC_000015.10': 'chr15',
        'NC_000016.10': 'chr16', 'NC_000017.11': 'chr17', 'NC_000018.10': 'chr18',
        'NC_000019.10': 'chr19', 'NC_000020.11': 'chr20', 'NC_000021.9': 'chr21',
        'NC_000022.11': 'chr22', 'NC_000023.11': 'chrX', 'NC_000024.10': 'chrY',
