In [292]:
import pandas as pd
import os
import datetime

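# Database of athletic-performance gene variants, keyed by gene symbol.
# Per-gene fields: 'rsid' (SNP identifier), the three genotype tiers
# ('advantageous' / 'neutral' / 'less_advantageous'), 'trait', 'pathway',
# 'endurance_power' (analyze_genotype counts values > 0 toward the endurance
# score and every other value toward the power score), 'effect_size' and
# 'evidence_quality' (multiplied together as the scoring weight),
# 'population_frequencies' (genotype frequency per population) and
# 'statistical_evidence'.
# Genotype strings are stored exactly as spelled here (e.g. 'GC', not 'CG'),
# so lookups into 'population_frequencies' must use the same spelling.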
athletic_genetics_db = {
    'ACTN3': {
        'rsid': 'rs1815739',
        'advantageous': 'CC',
        'neutral': 'CT',
        'less_advantageous': 'TT',
        'trait': 'Fast-twitch muscle fiber composition',
        'pathway': 'Muscle Composition',
        'endurance_power': -0.8,
        'effect_size': 0.72,
        'population_frequencies': {
            'European': {'CC': 0.42, 'CT': 0.45, 'TT': 0.13},
            'East_Asian': {'CC': 0.47, 'CT': 0.42, 'TT': 0.11},
            'African': {'CC': 0.56, 'CT': 0.36, 'TT': 0.08}
        },
        'statistical_evidence': {
            'sample_size': 2984,
            'p_value': 1.2e-12,
            'confidence_interval': (0.65, 0.79)
        },
        'evidence_quality': 0.92
    },
    'ACE': {
        'rsid': 'rs4341',
        'advantageous': 'GG',
        'neutral': 'GC',
        'less_advantageous': 'CC',
        'trait': 'Cardiovascular efficiency',
        'pathway': 'Cardiovascular',
        'endurance_power': 0.8,
        'effect_size': 0.63,
        'population_frequencies': {
            'European': {'GG': 0.25, 'GC': 0.50, 'CC': 0.25},
            'East_Asian': {'GG': 0.35, 'GC': 0.47, 'CC': 0.18},
            'African': {'GG': 0.28, 'GC': 0.48, 'CC': 0.24}
        },
        'statistical_evidence': {
            'sample_size': 3428,
            'p_value': 3.4e-9,
            'confidence_interval': (0.57, 0.69)
        },
        'evidence_quality': 0.89
    },
    'PPARGC1A': {
        'rsid': 'rs8192678',
        'advantageous': 'CC',
        'neutral': 'CT',
        'less_advantageous': 'TT',
        'trait': 'Mitochondrial function',
        'pathway': 'Energy Metabolism',
        'endurance_power': 0.7,
        'effect_size': 0.48,
        'population_frequencies': {
            'European': {'CC': 0.36, 'CT': 0.48, 'TT': 0.16},
            'East_Asian': {'CC': 0.42, 'CT': 0.45, 'TT': 0.13},
            'African': {'CC': 0.45, 'CT': 0.44, 'TT': 0.11}
        },
        'statistical_evidence': {
            'sample_size': 2453,
            'p_value': 4.8e-7,
            'confidence_interval': (0.41, 0.55)
        },
        'evidence_quality': 0.85
    },
    'HIF1A': {
        'rsid': 'rs11549465',
        'advantageous': 'CC',
        'neutral': 'CT',
        'less_advantageous': 'TT',
        'trait': 'Hypoxia response',
        'pathway': 'Oxygen Transport',
        'endurance_power': 0.6,
        'effect_size': 0.45,
        'population_frequencies': {
            'European': {'CC': 0.52, 'CT': 0.40, 'TT': 0.08},
            'East_Asian': {'CC': 0.58, 'CT': 0.36, 'TT': 0.06},
            'African': {'CC': 0.55, 'CT': 0.38, 'TT': 0.07}
        },
        'statistical_evidence': {
            'sample_size': 1842,
            'p_value': 2.1e-6,
            'confidence_interval': (0.38, 0.52)
        },
        'evidence_quality': 0.83
    },
    'MSTN': {
        'rsid': 'rs1805086',
        'advantageous': 'AA',
        'neutral': 'AG',
        'less_advantageous': 'GG',
        'trait': 'Muscle growth regulation',
        'pathway': 'Muscle Composition',
        'endurance_power': -0.7,
        'effect_size': 0.55,
        'population_frequencies': {
            'European': {'AA': 0.45, 'AG': 0.42, 'GG': 0.13},
            'East_Asian': {'AA': 0.48, 'AG': 0.41, 'GG': 0.11},
            'African': {'AA': 0.51, 'AG': 0.38, 'GG': 0.11}
        },
        'statistical_evidence': {
            'sample_size': 1672,
            'p_value': 8.4e-7,
            'confidence_interval': (0.48, 0.62)
        },
        'evidence_quality': 0.86
    },
    'VEGFA': {
        'rsid': 'rs2010963',
        'advantageous': 'CC',
        'neutral': 'CG',
        'less_advantageous': 'GG',
        'trait': 'Blood vessel formation',
        'pathway': 'Cardiovascular',
        'endurance_power': 0.5,
        'effect_size': 0.42,
        'population_frequencies': {
            'European': {'CC': 0.30, 'CG': 0.48, 'GG': 0.22},
            'East_Asian': {'CC': 0.35, 'CG': 0.46, 'GG': 0.19},
            'African': {'CC': 0.32, 'CG': 0.47, 'GG': 0.21}
        },
        'statistical_evidence': {
            'sample_size': 1986,
            'p_value': 3.2e-5,
            'confidence_interval': (0.35, 0.49)
        },
        'evidence_quality': 0.81
    },
    'IL6': {
        'rsid': 'rs1800795',
        'advantageous': 'GG',
        'neutral': 'GC',
        'less_advantageous': 'CC',
        'trait': 'Inflammation response',
        'pathway': 'Recovery',
        'endurance_power': 0.1,
        'effect_size': 0.38,
        'population_frequencies': {
            'European': {'GG': 0.35, 'GC': 0.45, 'CC': 0.20},
            'East_Asian': {'GG': 0.40, 'GC': 0.43, 'CC': 0.17},
            'African': {'GG': 0.38, 'GC': 0.44, 'CC': 0.18}
        },
        'statistical_evidence': {
            'sample_size': 2156,
            'p_value': 1.7e-4,
            'confidence_interval': (0.31, 0.45)
        },
        'evidence_quality': 0.79
    },
    'COL5A1': {
        'rsid': 'rs12722',
        'advantageous': 'CC',
        'neutral': 'CT',
        'less_advantageous': 'TT',
        'trait': 'Tendon flexibility',
        'pathway': 'Injury Risk',
        'endurance_power': 0.0,
        'effect_size': 0.41,
        'population_frequencies': {
            'European': {'CC': 0.32, 'CT': 0.48, 'TT': 0.20},
            'East_Asian': {'CC': 0.35, 'CT': 0.46, 'TT': 0.19},
            'African': {'CC': 0.34, 'CT': 0.47, 'TT': 0.19}
        },
        'statistical_evidence': {
            'sample_size': 1542,
            'p_value': 4.3e-5,
            'confidence_interval': (0.34, 0.48)
        },
        'evidence_quality': 0.80
    },
    'IGF1': {
        'rsid': 'rs35767',
        'advantageous': 'AA',
        'neutral': 'AG',
        'less_advantageous': 'GG',
        'trait': 'Muscle development',
        'pathway': 'Muscle Composition',
        'endurance_power': -0.4,
        'effect_size': 0.36,
        'population_frequencies': {
            'European': {'AA': 0.30, 'AG': 0.45, 'GG': 0.25},
            'East_Asian': {'AA': 0.33, 'AG': 0.44, 'GG': 0.23},
            'African': {'AA': 0.31, 'AG': 0.46, 'GG': 0.23}
        },
        'statistical_evidence': {
            'sample_size': 1876,
            'p_value': 2.8e-4,
            'confidence_interval': (0.29, 0.43)
        },
        'evidence_quality': 0.78
    },
    'MCT1': {
        'rsid': 'rs1049434',
        'advantageous': 'TT',
        'neutral': 'AT',
        'less_advantageous': 'AA',
        'trait': 'Lactate transport',
        'pathway': 'Energy Metabolism',
        'endurance_power': 0.5,
        'effect_size': 0.44,
        'population_frequencies': {
            'European': {'TT': 0.35, 'AT': 0.45, 'AA': 0.20},
            'East_Asian': {'TT': 0.38, 'AT': 0.44, 'AA': 0.18},
            'African': {'TT': 0.36, 'AT': 0.45, 'AA': 0.19}
        },
        'statistical_evidence': {
            'sample_size': 1654,
            'p_value': 7.6e-5,
            'confidence_interval': (0.37, 0.51)
        },
        'evidence_quality': 0.82
    },
    'BDNF': {
        'rsid': 'rs6265',
        'advantageous': 'GG',
        'neutral': 'AG',
        'less_advantageous': 'AA',
        'trait': 'Neural adaptation',
        'pathway': 'Neuromuscular',
        'endurance_power': 0.0,
        'effect_size': 0.35,
        'population_frequencies': {
            'European': {'GG': 0.40, 'AG': 0.45, 'AA': 0.15},
            'East_Asian': {'GG': 0.43, 'AG': 0.44, 'AA': 0.13},
            'African': {'GG': 0.41, 'AG': 0.45, 'AA': 0.14}
        },
        'statistical_evidence': {
            'sample_size': 1432,
            'p_value': 9.2e-4,
            'confidence_interval': (0.28, 0.42)
        },
        'evidence_quality': 0.77
    },
    'PPARA': {
        'rsid': 'rs4253778',
        'advantageous': 'GG',
        'neutral': 'GC',
        'less_advantageous': 'CC',
        'trait': 'Fatty acid metabolism',
        'pathway': 'Energy Metabolism',
        'endurance_power': 0.6,
        'effect_size': 0.43,
        'population_frequencies': {
            'European': {'GG': 0.38, 'GC': 0.45, 'CC': 0.17},
            'East_Asian': {'GG': 0.41, 'GC': 0.44, 'CC': 0.15},
            'African': {'GG': 0.39, 'GC': 0.45, 'CC': 0.16}
        },
        'statistical_evidence': {
            'sample_size': 1876,
            'p_value': 5.4e-5,
            'confidence_interval': (0.36, 0.50)
        },
        'evidence_quality': 0.81
    },
    'GLUT4': {
        'rsid': 'rs5418',
        'advantageous': 'GG',
        'neutral': 'AG',
        'less_advantageous': 'AA',
        'trait': 'Glucose transport',
        'pathway': 'Energy Metabolism',
        'endurance_power': 0.4,
        'effect_size': 0.39,
        'population_frequencies': {
            'European': {'GG': 0.35, 'AG': 0.45, 'AA': 0.20},
            'East_Asian': {'GG': 0.37, 'AG': 0.45, 'AA': 0.18},
            'African': {'GG': 0.36, 'AG': 0.45, 'AA': 0.19}
        },
        'statistical_evidence': {
            'sample_size': 1564,
            'p_value': 8.7e-4,
            'confidence_interval': (0.32, 0.46)
        },
        'evidence_quality': 0.78
    },
    'RYR1': {
        'rsid': 'rs118192172',
        'advantageous': 'CC',
        'neutral': 'CT',
        'less_advantageous': 'TT',
        'trait': 'Calcium signaling',
        'pathway': 'Muscle Function',
        'endurance_power': -0.3,
        'effect_size': 0.46,
        'population_frequencies': {
            'European': {'CC': 0.42, 'CT': 0.43, 'TT': 0.15},
            'East_Asian': {'CC': 0.45, 'CT': 0.42, 'TT': 0.13},
            'African': {'CC': 0.43, 'CT': 0.43, 'TT': 0.14}
        },
        'statistical_evidence': {
            'sample_size': 1342,
            'p_value': 3.2e-4,
            'confidence_interval': (0.39, 0.53)
        },
        'evidence_quality': 0.80
    },
    'NOS3': {
        'rsid': 'rs2070744',
        'advantageous': 'CC',
        'neutral': 'CT',
        'less_advantageous': 'TT',
        'trait': 'Nitric oxide production',
        'pathway': 'Cardiovascular',
        'endurance_power': 0.5,
        'effect_size': 0.37,
        'population_frequencies': {
            'European': {'CC': 0.33, 'CT': 0.48, 'TT': 0.19},
            'East_Asian': {'CC': 0.36, 'CT': 0.47, 'TT': 0.17},
            'African': {'CC': 0.34, 'CT': 0.48, 'TT': 0.18}
        },
        'statistical_evidence': {
            'sample_size': 1654,
            'p_value': 6.8e-4,
            'confidence_interval': (0.30, 0.44)
        },
        'evidence_quality': 0.79
    },
    'VDR': {
        'rsid': 'rs1544410',
        'advantageous': 'GG',
        'neutral': 'AG',
        'less_advantageous': 'AA',
        'trait': 'Vitamin D metabolism',
        'pathway': 'Recovery',
        'endurance_power': 0.0,
        'effect_size': 0.34,
        'population_frequencies': {
            'European': {'GG': 0.38, 'AG': 0.45, 'AA': 0.17},
            'East_Asian': {'GG': 0.41, 'AG': 0.44, 'AA': 0.15},
            'African': {'GG': 0.39, 'AG': 0.45, 'AA': 0.16}
        },
        'statistical_evidence': {
            'sample_size': 1432,
            'p_value': 1.2e-3,
            'confidence_interval': (0.27, 0.41)
        },
        'evidence_quality': 0.76
    },
    'ADRB2': {
        'rsid': 'rs1042713',
        'advantageous': 'GG',
        'neutral': 'AG',
        'less_advantageous': 'AA',
        'trait': 'Cardiac response',
        'pathway': 'Cardiovascular',
        'endurance_power': 0.3,
        'effect_size': 0.40,
        'population_frequencies': {
            'European': {'GG': 0.36, 'AG': 0.46, 'AA': 0.18},
            'East_Asian': {'GG': 0.39, 'AG': 0.45, 'AA': 0.16},
            'African': {'GG': 0.37, 'AG': 0.46, 'AA': 0.17}
        },
        'statistical_evidence': {
            'sample_size': 1876,
            'p_value': 4.5e-4,
            'confidence_interval': (0.33, 0.47)
        },
        'evidence_quality': 0.80
    },
    'ACVR1B': {
        'rsid': 'rs2854464',
        'advantageous': 'AA',
        'neutral': 'AG',
        'less_advantageous': 'GG',
        'trait': 'Muscle strength',
        'pathway': 'Muscle Composition',
        'endurance_power': -0.6,
        'effect_size': 0.42,
        'population_frequencies': {
            'European': {'AA': 0.34, 'AG': 0.46, 'GG': 0.20},
            'East_Asian': {'AA': 0.37, 'AG': 0.45, 'GG': 0.18},
            'African': {'AA': 0.35, 'AG': 0.46, 'GG': 0.19}
        },
        'statistical_evidence': {
            'sample_size': 1564,
            'p_value': 3.8e-4,
            'confidence_interval': (0.35, 0.49)
        },
        'evidence_quality': 0.81
    },
    'HFE': {
        'rsid': 'rs1799945',
        'advantageous': 'CC',
        'neutral': 'CG',
        'less_advantageous': 'GG',
        'trait': 'Iron metabolism',
        'pathway': 'Energy Metabolism',
        'endurance_power': 0.4,
        'effect_size': 0.35,
        'population_frequencies': {
            'European': {'CC': 0.35, 'CG': 0.46, 'GG': 0.19},
            'East_Asian': {'CC': 0.38, 'CG': 0.45, 'GG': 0.17},
            'African': {'CC': 0.36, 'CG': 0.46, 'GG': 0.18}
        },
        'statistical_evidence': {
            'sample_size': 1432,
            'p_value': 9.6e-4,
            'confidence_interval': (0.28, 0.42)
        },
        'evidence_quality': 0.77
    },
    'CCL2': {
        'rsid': 'rs2857656',
        'advantageous': 'GG',
        'neutral': 'GC',
        'less_advantageous': 'CC',
        'trait': 'Muscle repair',
        'pathway': 'Recovery',
        'endurance_power': 0.1,
        'effect_size': 0.33,
        'population_frequencies': {
            'European': {'GG': 0.32, 'GC': 0.48, 'CC': 0.20},
            'East_Asian': {'GG': 0.35, 'GC': 0.47, 'CC': 0.18},
            'African': {'GG': 0.33, 'GC': 0.48, 'CC': 0.19}
        },
        'statistical_evidence': {
            'sample_size': 1342,
            'p_value': 1.4e-3,
            'confidence_interval': (0.26, 0.40)
        },
        'evidence_quality': 0.75
    }
}
def analyze_genotype(user_genotypes, population='European', focus='balanced'):
    """
    Analyze user genotypes against athletic performance database.

    For every gene in athletic_genetics_db whose rsid is present in
    user_genotypes, the genotype gets a base score (2.0 if it equals the
    gene's 'advantageous' genotype, 1.0 if it equals 'neutral', otherwise 0)
    multiplied by the gene's effect_size and evidence_quality. Genes with
    endurance_power > 0 add to the endurance score, all others to the power
    score.

    Parameters:
    user_genotypes: dict of {rsid: genotype} pairs
    population: str, one of ['European', 'East_Asian', 'African']
    focus: str, one of ['balanced', 'endurance', 'power']; any other value
        is handled like 'balanced'

    Returns:
    dict containing analysis results: 'overall_score', 'endurance_score',
    'power_score' (all three stay 0 when no rsid matched the database),
    'gene_breakdown', 'population_percentile' and 'key_findings'

    Raises:
    KeyError: if population is not present in a matched gene's
        'population_frequencies'
    """
    results = {
        'overall_score': 0,
        'endurance_score': 0,
        'power_score': 0,
        'gene_breakdown': {},
        'population_percentile': {},
        'key_findings': []
    }

    total_possible = 0

    for gene, data in athletic_genetics_db.items():
        if data['rsid'] not in user_genotypes:
            continue

        genotype = user_genotypes[data['rsid']]

        # Calculate base score (anything that is neither advantageous nor
        # neutral scores 0)
        if genotype == data['advantageous']:
            gene_score = 2.0
        elif genotype == data['neutral']:
            gene_score = 1.0
        else:
            gene_score = 0

        # Weight by effect size and evidence quality
        weighted_score = gene_score * data['effect_size'] * data['evidence_quality']
        total_possible += 2.0 * data['effect_size'] * data['evidence_quality']

        # Add to appropriate category
        if data['endurance_power'] > 0:
            results['endurance_score'] += weighted_score
        else:
            results['power_score'] += weighted_score

        # Population comparison. A genotype that is not in the frequency
        # table (different allele order, no-call, ...) is reported as 0.0
        # instead of aborting the whole analysis with a KeyError.
        pop_freq = data['population_frequencies'][population].get(genotype, 0.0)
        results['population_percentile'][gene] = pop_freq

        # Store breakdown
        results['gene_breakdown'][gene] = {
            'genotype': genotype,
            'score': weighted_score,
            'optimal': data['advantageous'],
            'trait': data['trait'],
            'pathway': data['pathway']
        }

        # Add key findings for significant variations
        if genotype == data['advantageous']:
            results['key_findings'].append(f"Advantageous {gene} variant for {data['trait']}")
        elif genotype == data['less_advantageous']:
            results['key_findings'].append(f"Potential limitation in {gene} for {data['trait']}")

    # Calculate overall score based on focus
    if focus == 'endurance':
        results['overall_score'] = results['endurance_score']
    elif focus == 'power':
        results['overall_score'] = results['power_score']
    else:
        results['overall_score'] = (results['endurance_score'] + results['power_score']) / 2

    # Normalize scores to 0-100 scale. With no matched genes there is nothing
    # to normalize against, so the scores are left at 0 rather than dividing
    # by zero.
    # NOTE(review): every score is divided by the total over ALL matched
    # genes, so the balanced overall score cannot exceed 50 and the category
    # scores cannot reach 100 when both categories are present - confirm
    # this is the intended scale.
    if total_possible > 0:
        results['overall_score'] = (results['overall_score'] / total_possible) * 100
        results['endurance_score'] = (results['endurance_score'] / total_possible) * 100
        results['power_score'] = (results['power_score'] / total_possible) * 100

    return results

# Example usage of analyze_genotype
user_genotypes = {
    'rs1815739': 'CC',  # Example genotypes
    'rs4341': 'GG',
    'rs8192678': 'CT'
}

# Perform analysis
results = analyze_genotype(user_genotypes, population='European', focus='balanced')

# Both CSV files go to the same directory; create it up front so to_csv does
# not fail when the directory does not exist yet
stats_dir = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/stats/'
os.makedirs(stats_dir, exist_ok=True)

# Generate unique filename with timestamp
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Save results to CSV (one row per headline score)
results_path = os.path.join(stats_dir, f"genotype_analysis_results_{timestamp}.csv")
results_df = pd.DataFrame({
    'Metric': ['Overall Score', 'Endurance Score', 'Power Score'],
    'Value': [results['overall_score'], results['endurance_score'], results['power_score']]
})

# Save analysis results
results_df.to_csv(results_path, index=False)
print(f"Genotype analysis results saved as {results_path}")

# Save gene breakdown to CSV (one row per matched gene, gene symbol as index)
gene_breakdown_path = os.path.join(stats_dir, f"gene_breakdown_{timestamp}.csv")
gene_breakdown_df = pd.DataFrame.from_dict(results['gene_breakdown'], orient='index')
gene_breakdown_df.to_csv(gene_breakdown_path)

print(f"Gene breakdown saved as {gene_breakdown_path}")



Genotype analysis results saved as /Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/stats/genotype_analysis_results_20241024_132522.csv
Gene breakdown saved as /Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/stats/gene_breakdown_20241024_132522.csv


In [293]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

# Color scheme
# NOTE(review): athletic_genetics_db has more distinct pathways than there are
# colours here, and plotly express cycles the sequence, so some pathways end
# up sharing a colour in the scatter panel.
colors = ['#e63946', '#a8dadc', '#90EE90', '#457b9d', '#1d3557']

# Convert database to DataFrame for visualization (one row per gene)
genes_df = pd.DataFrame([{
    'gene': gene,
    'effect_size': data['effect_size'],
    'endurance_power': data['endurance_power'],
    'pathway': data['pathway'],
    'evidence_quality': data['evidence_quality'],
    'trait': data['trait']
} for gene, data in athletic_genetics_db.items()])

# Create master figure with subplots (2x2 grid, titles in row-major order)
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Effect Size vs Endurance/Power Bias',
        'Distribution of Genetic Pathways',
        'Evidence Quality by Gene',
        'Genes Ranked by Effect Size'
    ),
    vertical_spacing=0.22,  # Increased vertical spacing
    horizontal_spacing=0.15
)

# 1. Effect Size vs Endurance/Power Scatter
# Built with plotly express to get one trace per pathway, then each trace is
# moved into the top-left panel of the master figure.
scatter = px.scatter(genes_df, 
                    x='endurance_power', 
                    y='effect_size',
                    size='evidence_quality',
                    color='pathway',
                    color_discrete_sequence=colors,
                    hover_data=['gene', 'trait'],
                    text='gene')

for trace in scatter.data:
    trace.update(
        name=trace.name,  # Keep original pathway name
        legendgroup=trace.name,  # Group by pathway
        showlegend=True,
        mode='markers+text',
        textposition='top center'
    )
    fig.add_trace(trace, row=1, col=1)

# 2. Pathway Distribution
pathway_counts = genes_df['pathway'].value_counts()
fig.add_trace(
    go.Bar(x=pathway_counts.index, 
           y=pathway_counts.values,
           marker_color=colors[0],
           name='Pathway Count',
           hovertemplate='Pathway: %{x}<br>Count: %{y}'),
    row=1, col=2
)

# 3. Evidence Quality by Gene
fig.add_trace(
    go.Bar(x=genes_df['gene'],
           y=genes_df['evidence_quality'],
           marker_color=colors[1],
           name='Evidence Quality',
           hovertemplate='Gene: %{x}<br>Evidence Quality: %{y:.2f}'),
    row=2, col=1
)

# 4. Genes Ranked by Effect Size
# Sorted ascending so the largest effect ends up at the top of the
# horizontal bar chart.
genes_sorted = genes_df.sort_values('effect_size', ascending=True)
fig.add_trace(
    go.Bar(x=genes_sorted['effect_size'],
           y=genes_sorted['gene'],
           orientation='h',
           marker_color=colors[3],
           name='Effect Size',
           hovertemplate='Gene: %{y}<br>Effect Size: %{x:.2f}'),
    row=2, col=2
)

# Update layout with improved spacing and formatting
fig.update_layout(
    height=1000,
    width=1400,
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.15,
        xanchor="center",
        x=0.5,
        bgcolor='rgba(255, 255, 255, 0.9)',  # Semi-transparent white background
        bordercolor='rgba(0, 0, 0, 0.2)',    # Light border
        borderwidth=1,
        font=dict(size=12),
        itemsizing='constant',  # Makes legend items consistent size
        itemwidth=40           # Controls legend item width
    ),
    title=dict(
        text="Athletic Genetics Database Overview",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
        font=dict(size=24)
    ),
    template="plotly_white",
    font=dict(size=12)
)

# Update axes labels and formatting
# The database stores power-oriented genes with negative endurance_power and
# endurance-oriented genes with positive values (analyze_genotype counts
# values > 0 toward endurance), so the axis runs from power to endurance.
fig.update_xaxes(title_text="Power (-1) to Endurance (+1) Bias", row=1, col=1, title_font=dict(size=14))
fig.update_yaxes(title_text="Effect Size", row=1, col=1, title_font=dict(size=14))
fig.update_xaxes(title_text="Pathway", row=1, col=2, tickangle=45, title_font=dict(size=14))
fig.update_yaxes(title_text="Number of Genes", row=1, col=2, title_font=dict(size=14))
fig.update_xaxes(title_text="Gene", row=2, col=1, tickangle=45, title_font=dict(size=14))
fig.update_yaxes(title_text="Evidence Quality Score", row=2, col=1, title_font=dict(size=14))
fig.update_xaxes(title_text="Effect Size", row=2, col=2, title_font=dict(size=14))

# Update subplot titles with better positioning
# (make_subplots stores the subplot titles as layout annotations)
for i in fig['layout']['annotations']:
    i['font'] = dict(size=16, color='black')
    i['y'] = i['y'] - 0.03  # Move subplot titles down
    # Extra adjustment for bottom right title
    if i['text'] == 'Genes Ranked by Effect Size':
        i['y'] = i['y'] + 0.05  # Move this title up to avoid overlap

# Update margins to ensure no cutoff
fig.update_layout(margin=dict(t=150, b=50, l=50, r=50))

import os
from datetime import datetime

# Define output path
OUTPUT_PATH = '/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/outputs/'
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Get current timestamp for unique filenames
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Save main overview plot
# HTML (interactive) version
fig.write_html(os.path.join(OUTPUT_PATH, f'athletic_genetics_overview_{timestamp}.html'))

# Static image versions (in different formats for flexibility)
fig.write_image(os.path.join(OUTPUT_PATH, f'athletic_genetics_overview_{timestamp}.png'))
fig.write_image(os.path.join(OUTPUT_PATH, f'athletic_genetics_overview_{timestamp}.svg'))
fig.write_image(os.path.join(OUTPUT_PATH, f'athletic_genetics_overview_{timestamp}.pdf'))

# Save individual subplots as separate files
# Create a figure for each subplot
for i, title in enumerate(['effect_size_vs_bias', 'pathway_distribution', 'evidence_quality', 'ranked_effect_size']):
    # Grid position of this panel in the 2x2 overview (row-major order)
    row = (i // 2) + 1
    col = (i % 2) + 1
    
    # Create single subplot figure
    subplot_fig = make_subplots(rows=1, cols=1, subplot_titles=[fig.layout.annotations[i].text])
    
    # Get the traces for this subplot from the main figure.
    # Plotly names the first subplot's axes 'x'/'y' (no number) and only the
    # later ones 'x2'/'y2', 'x3'/'y3', ... - matching against 'x1'/'y1' would
    # leave the first panel's export empty.
    axis_suffix = '' if i == 0 else str(i + 1)
    subplot_traces = [
        trace for trace in fig.data
        if (trace.xaxis or 'x') == f'x{axis_suffix}' and (trace.yaxis or 'y') == f'y{axis_suffix}'
    ]
    
    # Add traces to the subplot figure. Passing row/col re-anchors each
    # (copied) trace to the single subplot's own axes instead of keeping its
    # reference to an axis that only exists in the overview figure.
    for trace in subplot_traces:
        subplot_fig.add_trace(trace, row=1, col=1)
    
    # Update layout
    subplot_fig.update_layout(
        height=600,
        width=800,
        showlegend=True,
        title_x=0.5,
        template="plotly_white"
    )
    
    # Save individual subplot
    subplot_fig.write_html(os.path.join(OUTPUT_PATH, f'genetics_{title}_{timestamp}.html'))
    subplot_fig.write_image(os.path.join(OUTPUT_PATH, f'genetics_{title}_{timestamp}.png'))

# Print confirmation message
print(f"All plots have been saved to: {OUTPUT_PATH}")
print(f"Timestamp used: {timestamp}")
print("\nSaved files:")
print("1. Main overview plot (HTML, PNG, SVG, PDF)")
print("2. Individual subplots (HTML, PNG)")

# Display the figure
fig.show()

All plots have been saved to: /Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/outputs/
Timestamp used: 20241024_132540

Saved files:
1. Main overview plot (HTML, PNG, SVG, PDF)
2. Individual subplots (HTML, PNG)


In [289]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from pathlib import Path
import os
from datetime import datetime

# Paths used by this analysis; everything except the root is derived from
# GENETICS_DIR
GENETICS_DIR = "/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics"
OUTPUTS_DIR, ANALYSIS_DIR, OUTPUT_HTML = (
    os.path.join(GENETICS_DIR, leaf)
    for leaf in ("outputs", "analysis_20241023_153638", "genetic_analysis_report.html")
)

def verify_paths():
    """Verify that the SNP input file exists and ensure the output directory does.

    Reads the module-level names SNP_FILE and OUTPUT_DIR.

    Returns:
        True once the SNP file has been found and the output directory exists.

    Raises:
        FileNotFoundError: if SNP_FILE does not exist.
    """
    # NOTE(review): the path constants defined next to this function are
    # OUTPUTS_DIR / ANALYSIS_DIR / OUTPUT_HTML; neither SNP_FILE nor
    # OUTPUT_DIR (singular) is defined there - confirm both exist before
    # this is called, or whether OUTPUTS_DIR was intended.
    if not os.path.exists(SNP_FILE):
        raise FileNotFoundError(f"SNP file not found at: {SNP_FILE}")
    # exist_ok avoids the race between checking for the directory and
    # creating it
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    return True

def load_snp_data(file_path):
    """
    Load SNP data from the provided file, handling commented headers correctly.

    The column names are taken from the comment line that starts with
    '# rsid' (if several such lines exist, the last one wins). All other
    lines starting with '#' are ignored and the remaining lines are parsed
    as whitespace-separated rows.

    Parameters:
    file_path: path of the raw SNP text file

    Returns:
    pandas.DataFrame with one row per data line

    Raises:
    ValueError: if no line starting with '# rsid' is found
    """
    import io

    print(f"Loading SNP data from: {file_path}")

    header_prefix = '# '
    header_line = None
    data_lines = []

    # Single pass: pick out the commented header and collect the data rows
    with open(file_path, 'r') as f:
        for line in f:
            if line.startswith('# rsid'):
                # Drop only the leading '# ' marker and use the rest as header
                header_line = line[len(header_prefix):].strip()
            elif not line.startswith('#'):
                # This is actual data
                data_lines.append(line)

    if not header_line:
        raise ValueError("Could not find header line starting with '# rsid' in the file")

    # Parse from an in-memory buffer; no temporary file has to be written
    # and cleaned up
    buffer = io.StringIO(header_line + '\n' + ''.join(data_lines))
    df = pd.read_csv(buffer, sep=r'\s+')  # Using whitespace as separator
    print(f"Loaded {len(df)} SNPs")
    print(f"Columns found: {', '.join(df.columns)}")
    return df

# Watson-Crick base pairing used to express a genotype on the opposite strand
_COMPLEMENT_MAP = {
    'A': 'T',
    'T': 'A',
    'G': 'C',
    'C': 'G'
}


def normalize_genotype(genotype):
    """
    Normalize genotype string to handle both direct and complementary matches.

    Allele order is irrelevant, so both forms are upper-cased and sorted
    ('TC' -> 'CT').

    Returns tuple of (normalized_direct, normalized_complementary). Anything
    that is not a two-character string (None, '', NaN from a missing cell,
    longer calls) is returned unchanged in both positions.
    """
    # isinstance guard: a missing cell read by pandas arrives as float NaN,
    # which has no len()
    if not isinstance(genotype, str) or len(genotype) != 2:
        return genotype, genotype

    # Get both direct and complementary genotypes
    direct = ''.join(sorted(genotype.upper()))
    complementary = ''.join(sorted(get_complementary_genotype(genotype).upper()))

    return direct, complementary


def get_complementary_base(base):
    """Return the complementary DNA base (unknown symbols are returned unchanged)"""
    return _COMPLEMENT_MAP.get(base.upper(), base)


def get_complementary_genotype(genotype):
    """Return the complementary genotype, base by base"""
    return ''.join(get_complementary_base(base) for base in genotype)


def calculate_genotype_score(user_genotype, optimal_genotype, neutral_genotype, less_optimal_genotype):
    """
    Calculate score considering both direct and complementary strand matches

    Same-strand matches are tried for all three tiers before any
    complementary-strand match. For SNPs whose alleles are each other's
    complement (G/C or A/T) the complement of the optimal homozygote IS the
    less-optimal homozygote, so testing optimal's complement first would
    score e.g. 'CC' as optimal for a GG/GC/CC variant.

    Returns tuple of (score, explanation): 2.0 optimal, 1.0 neutral,
    0.5 less optimal, 0.1 when nothing matches.
    """
    user_direct, _ = normalize_genotype(user_genotype)

    tiers = (
        (2.0, "Optimal variant", optimal_genotype),
        (1.0, "Neutral variant", neutral_genotype),
        (0.5, "Less optimal variant", less_optimal_genotype),
    )

    # strand 0 = genotype as reported, strand 1 = complementary strand
    for strand in (0, 1):
        for score, label, reference in tiers:
            if user_direct == normalize_genotype(reference)[strand]:
                return score, f"{label} (equivalent match: {user_genotype} ≈ {reference})"

    return 0.1, f"Non-optimal variant (no equivalent matches found for {user_genotype})"

def create_user_genotype_dict(snp_df, athletic_genetics_db):
    """
    Create a dictionary of user genotypes for relevant athletic SNPs.

    Parameters:
    snp_df: DataFrame with at least 'rsid' and 'genotype' columns
    athletic_genetics_db: dict of {gene: {'rsid': ..., ...}}

    Returns:
    dict of {rsid: genotype} for every database rsid found in snp_df (first
    occurrence wins). The summary of matched and missing genes is printed,
    not returned.
    """
    print("\nLooking for matches in athletic genetics database...")

    # Convert rsid column to string once to ensure matching works, and scan
    # the SNP table a single time instead of once per gene
    rsid_column = snp_df['rsid'].astype(str)
    wanted_rsids = {data['rsid'] for data in athletic_genetics_db.values()}
    wanted_mask = rsid_column.isin(wanted_rsids).to_numpy()

    first_genotype = {}
    for rsid, genotype in zip(rsid_column[wanted_mask], snp_df['genotype'][wanted_mask]):
        # setdefault keeps the first occurrence of a duplicated rsid
        first_genotype.setdefault(rsid, genotype)

    user_genotypes = {}
    matches_found = []
    missing_rsids = []
    for gene, data in athletic_genetics_db.items():
        rsid = data['rsid']
        if rsid in first_genotype:
            user_genotypes[rsid] = first_genotype[rsid]
            matches_found.append(gene)
        else:
            missing_rsids.append(gene)

    print(f"\nFound matches for {len(matches_found)} genes:")
    print(f"Matched genes: {', '.join(matches_found)}")
    if missing_rsids:
        print(f"\nMissing data for {len(missing_rsids)} genes:")
        print(f"Missing genes: {', '.join(missing_rsids)}")

    return user_genotypes

def analyze_genotype(user_genotypes, population='European', focus='balanced'):
    """
    Analyze user genotypes against athletic performance database with improved scoring

    For every gene in the module-level ``athletic_genetics_db`` whose rsid is
    present in ``user_genotypes``, a base score from
    ``calculate_genotype_score`` is weighted by the gene's ``effect_size`` and
    ``evidence_quality`` and added to the overall total and to either the
    endurance or the power total (by the sign of ``endurance_power``).

    Args:
        user_genotypes: Mapping of rsid -> genotype string.
        population: Key into each gene's ``population_frequencies`` table.
        focus: Accepted for interface compatibility; the scoring below does
            not currently use it, so every focus yields the same result.

    Returns:
        dict with ``overall_score``, ``endurance_score`` and ``power_score``
        (each scaled to 0-100 against the best attainable weighted total of
        all matched genes), ``gene_breakdown``, ``population_percentile``
        (the population frequency of the user's genotype, as a 0-1 fraction),
        ``key_findings`` and ``debug_info`` (one match explanation per gene).

    Raises:
        KeyError: If ``population`` is missing from a matched gene's
            frequency table.
    """
    results = {
        'overall_score': 0,
        'endurance_score': 0,
        'power_score': 0,
        'gene_breakdown': {},
        'population_percentile': {},
        'key_findings': [],
        'debug_info': []
    }

    print("\nDetailed Genotype Analysis:")
    print("-" * 120)
    print(f"{'Gene':<10} {'User Genotype':<15} {'Equivalents':<40} {'Optimal':<15} {'Score':<10} {'Match Type':<30}")
    print("-" * 120)

    total_possible = 0

    for gene, data in athletic_genetics_db.items():
        if data['rsid'] not in user_genotypes:
            continue
        user_genotype = user_genotypes[data['rsid']]

        base_score, reason = calculate_genotype_score(
            user_genotype,
            data['advantageous'],
            data['neutral'],
            data['less_advantageous']
        )

        # Weight the score; 2.0 is the base score of an optimal match, so the
        # second line accumulates the best attainable weighted total.
        weighted_score = base_score * data['effect_size'] * data['evidence_quality']
        total_possible += 2.0 * data['effect_size'] * data['evidence_quality']

        # The overall score must be accumulated here; without this it stays 0
        # and the final normalisation always reports 0/100.
        results['overall_score'] += weighted_score

        # Add to appropriate category
        if data['endurance_power'] > 0:
            results['endurance_score'] += weighted_score
        else:
            results['power_score'] += weighted_score

        # Population comparison: frequency of the user's genotype (direct
        # form) in the chosen population; 0.0 when it is not listed.
        pop_freqs = data['population_frequencies'][population]
        user_direct, user_comp = normalize_genotype(user_genotype)
        percentile = pop_freqs.get(user_direct, 0.0)
        results['population_percentile'][gene] = percentile

        # Emit the table row that belongs under the header printed above and
        # keep the match explanation for later inspection.
        equivalents = f"{user_direct} / {user_comp}"
        print(
            f"{gene:<10} {str(user_genotype):<15} {equivalents:<40} "
            f"{data['advantageous']:<15} {weighted_score:<10.2f} {reason:<30}"
        )
        results['debug_info'].append(f"{gene}: {reason}")

        # Store breakdown
        results['gene_breakdown'][gene] = {
            'genotype': user_genotype,
            'score': weighted_score,
            'optimal': data['advantageous'],
            'trait': data['trait'],
            'pathway': data['pathway']
        }

        # Add key findings
        if base_score >= 1.5:
            results['key_findings'].append(
                f"Advantageous {gene} variant ({user_genotype}) for {data['trait']}"
            )
        elif base_score <= 0.5:
            results['key_findings'].append(
                f"Potential limitation in {gene} ({user_genotype}) for {data['trait']}"
            )

    # Calculate final scores.
    # NOTE(review): endurance and power are divided by the total over ALL
    # matched genes, so they sum to the overall score and neither can reach
    # 100 on its own - confirm whether per-category maxima were intended.
    if total_possible > 0:
        for key in ('overall_score', 'endurance_score', 'power_score'):
            results[key] = (results[key] / total_possible) * 100

    return results
def create_pathway_radar_chart(results):
    """Create a radar chart showing performance across different pathways.

    Gene scores from ``results['gene_breakdown']`` are grouped by their
    ``'pathway'`` and averaged; each pathway becomes one spoke of the chart.

    Args:
        results: Analysis result dict containing a ``'gene_breakdown'``
            mapping of gene -> dict with ``'pathway'`` and ``'score'``.

    Returns:
        The figure object holding a single filled ``Scatterpolar`` trace.
    """
    pathway_scores = {}
    for data in results['gene_breakdown'].values():
        pathway_scores.setdefault(data['pathway'], []).append(data['score'])

    pathway_averages = {
        pathway: np.mean(scores) for pathway, scores in pathway_scores.items()
    }

    # Leave 20% headroom above the best pathway. With no genes at all a bare
    # max() would raise ValueError, so fall back to a unit-length axis.
    peak = max(pathway_averages.values(), default=0)
    radial_max = peak * 1.2 if peak > 0 else 1.0

    fig = go.Figure()

    fig.add_trace(go.Scatterpolar(
        r=list(pathway_averages.values()),
        theta=list(pathway_averages.keys()),
        fill='toself',
        name='Your Profile',
        line_color='#1d3557'
    ))

    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, radial_max]
            )),
        showlegend=False,
        title="Athletic Pathway Performance Profile",
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    return fig

def create_gene_comparison_chart(results):
    """Create a bar chart comparing user's genotype scores against optimal.

    The user's bar is the weighted score stored in
    ``results['gene_breakdown'][gene]['score']``. The optimal bar is the best
    weighted score attainable for that gene (optimal base score 2.0 times the
    gene's ``effect_size`` and ``evidence_quality`` from the module-level
    ``athletic_genetics_db``), so both bars are on the same scale.

    Args:
        results: Analysis result dict containing a ``'gene_breakdown'`` mapping.

    Returns:
        The grouped bar figure with a "Your Score" and an "Optimal Score" trace.
    """
    genes = []
    user_scores = []
    optimal_scores = []

    for gene, data in results['gene_breakdown'].items():
        genes.append(gene)
        user_scores.append(data['score'])
        gene_info = athletic_genetics_db.get(gene)
        if gene_info is None:
            # Gene unknown to the database: keep the unweighted optimum.
            optimal_scores.append(2.0)
        else:
            optimal_scores.append(
                2.0 * gene_info['effect_size'] * gene_info['evidence_quality']
            )

    fig = go.Figure()

    fig.add_trace(go.Bar(
        name='Your Score',
        x=genes,
        y=user_scores,
        marker_color='#457b9d'
    ))

    fig.add_trace(go.Bar(
        name='Optimal Score',
        x=genes,
        y=optimal_scores,
        marker_color='#e63946',
        opacity=0.5
    ))

    fig.update_layout(
        barmode='group',
        title="Gene Score Comparison",
        xaxis_title="Gene",
        yaxis_title="Score",
        xaxis_tickangle=-45,
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99
        ),
        paper_bgcolor='white',
        plot_bgcolor='white'
    )

    return fig

def create_population_percentile_chart(results):
    """Plot the per-gene population values as a connected marker series.

    Args:
        results: Analysis result dict whose ``'population_percentile'`` entry
            maps gene name -> value; the y-axis is fixed to the 0-1 range.

    Returns:
        The figure object containing a single ``Scatter`` trace.
    """
    percentile_by_gene = results['population_percentile']

    trace = go.Scatter(
        x=list(percentile_by_gene.keys()),
        y=list(percentile_by_gene.values()),
        mode='markers+lines',
        marker={'size': 10, 'color': '#457b9d'},
        line={'color': '#1d3557'},
    )

    layout_options = {
        'title': "Population Percentile Distribution",
        'xaxis_title': "Gene",
        'yaxis_title': "Population Percentile",
        'xaxis_tickangle': -45,
        'yaxis': {'range': [0, 1]},
        'paper_bgcolor': 'white',
        'plot_bgcolor': 'white',
        'showlegend': False,
    }

    chart = go.Figure()
    chart.add_trace(trace)
    chart.update_layout(**layout_options)
    return chart

def generate_report():
    """Generate a complete genetic analysis report with visualizations.

    Reads the module-level ``SNP_FILE`` and ``OUTPUT_DIR`` settings, runs the
    genotype analysis, and writes three interactive charts plus a plain-text
    report into an ``analysis_<timestamp>`` folder under ``OUTPUT_DIR``.

    Returns:
        dict with ``report_path`` (str), ``visualization_paths`` (chart name ->
        HTML file path) and ``results`` (the ``balanced``, ``endurance`` and
        ``power`` analysis results).

    Raises:
        ValueError: If none of the database SNPs are present in the SNP file.
        Exception: Any other failure is printed and re-raised unchanged.
    """
    # Imported locally so ``datetime.now()`` resolves to the class even where
    # the surrounding module only performed ``import datetime``.
    from datetime import datetime

    try:
        # Verify paths
        verify_paths()

        # Create timestamped output directory (tolerate an existing folder,
        # e.g. two runs started within the same second).
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_subdir = os.path.join(OUTPUT_DIR, f'analysis_{timestamp}')
        os.makedirs(output_subdir, exist_ok=True)

        print(f"\nStarting genetic analysis...")
        print(f"Output will be saved to: {output_subdir}")

        # Load and process data
        snp_df = load_snp_data(SNP_FILE)

        # Debug information
        print("\nFirst few rows of loaded data:")
        print(snp_df.head())
        print("\nData types of columns:")
        print(snp_df.dtypes)

        user_genotypes = create_user_genotype_dict(snp_df, athletic_genetics_db)

        if not user_genotypes:
            raise ValueError("No matching SNPs found in the genetic database")

        # Run analysis for different focuses
        print("\nRunning analysis...")
        results_balanced = analyze_genotype(user_genotypes, focus='balanced')
        results_endurance = analyze_genotype(user_genotypes, focus='endurance')
        results_power = analyze_genotype(user_genotypes, focus='power')

        # Create visualizations
        print("Generating visualizations...")
        radar_fig = create_pathway_radar_chart(results_balanced)
        gene_comp_fig = create_gene_comparison_chart(results_balanced)
        pop_fig = create_population_percentile_chart(results_balanced)

        # Build each output path once; the same mapping is returned to the caller.
        visualization_paths = {
            'pathway_radar': os.path.join(output_subdir, 'pathway_radar.html'),
            'gene_comparison': os.path.join(output_subdir, 'gene_comparison.html'),
            'population_percentile': os.path.join(output_subdir, 'population_percentile.html')
        }

        # Save visualizations
        radar_fig.write_html(visualization_paths['pathway_radar'])
        gene_comp_fig.write_html(visualization_paths['gene_comparison'])
        pop_fig.write_html(visualization_paths['population_percentile'])

        # Generate summary report
        report_parts = [f"""
Athletic Genetics Analysis Report
===============================
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

Overall Performance Scores
------------------------
Balanced Score: {results_balanced['overall_score']:.1f}/100
Endurance Score: {results_endurance['endurance_score']:.1f}/100
Power Score: {results_power['power_score']:.1f}/100

Key Genetic Findings
------------------
{chr(10).join(['• ' + finding for finding in results_balanced['key_findings']])}

Detailed Gene Analysis
--------------------"""]

        # Add gene details, sorted by score
        sorted_genes = sorted(
            results_balanced['gene_breakdown'].items(),
            key=lambda x: x[1]['score'],
            reverse=True
        )

        for gene, data in sorted_genes:
            report_parts.append(f"""

{gene}:
    Genotype: {data['genotype']} (Optimal: {data['optimal']})
    Trait: {data['trait']}
    Pathway: {data['pathway']}
    Performance Score: {data['score']:.2f}""")

        report = ''.join(report_parts)

        # Save report. The bullet character in the findings list is not
        # ASCII, so the encoding is pinned rather than left to the platform.
        report_path = os.path.join(output_subdir, 'genetic_report.txt')
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report)

        print(f"\nAnalysis complete! Results saved to: {output_subdir}")
        print(f"Report file: {report_path}")

        return {
            'report_path': report_path,
            'visualization_paths': visualization_paths,
            'results': {
                'balanced': results_balanced,
                'endurance': results_endurance,
                'power': results_power
            }
        }

    except Exception as e:
        # Top-level boundary: report the failure, then let it propagate.
        print(f"Error during analysis: {str(e)}")
        raise

if __name__ == "__main__":
    # Run configuration read by generate_report() through module globals:
    # where the timestamped analysis folder is created, and which raw SNP
    # export is analysed.
    OUTPUT_DIR = "/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/"
    SNP_FILE = "/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/selfdecode.txt"

    results = generate_report()




Starting genetic analysis...
Output will be saved to: /Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/analysis_20241024_102851
Loading SNP data from: /Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/selfdecode.txt
Loaded 699182 SNPs
Columns found: rsid, chromosome, position, genotype

First few rows of loaded data:
          rsid chromosome  position genotype
0  rs547237130          1     72526       AA
1    rs4477212          1     82153       AA
2    rs4477212          1     82154       AA
3    rs4477212          1     82154       AA
4    rs8179455          1    264681       GG

Data types of columns:
rsid          object
chromosome    object
position       int64
genotype      object
dtype: object

Looking for matches in athletic genetics database...

Found matches for 17 genes:
Matched genes: ACE, PPARGC1A, HIF1A, MSTN, VEGFA, IL6, COL5A1, IGF1, MCT1, BDNF, PPARA, RYR1, NOS3, VDR, ADRB2, ACVR1B, HFE

Missing data for 3 genes:
M

In [290]:
import os
import pandas as pd
import re

# Locations used by the HTML report builder.
# The finished HTML page is written next to the raw genetics data.
GENETICS_DIR = "/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/"
OUTPUT_HTML = os.path.join(GENETICS_DIR, "genetic_performance_report.html")

# The text report and chart files are read from one specific, hard-coded
# timestamped analysis folder.
ANALYSIS_DIR = "/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/analysis_20241023_160439/"
GENETIC_REPORT_PATH = os.path.join(ANALYSIS_DIR, "genetic_report.txt")

def analyze_text_file_structure(content):
    """Parse the text of a genetic report into one record per gene.

    A line consisting solely of ``=`` or ``-`` characters is treated as an
    underline that turns the preceding line into a section title. Only the
    ``Detailed Gene Analysis`` section is parsed further: an unindented line
    starts a gene entry, and the indented ``Genotype:``, ``Trait:``,
    ``Pathway:`` and ``Performance Score:`` lines below it fill in the record.

    Args:
        content: Full text of the report file.

    Returns:
        A list of dicts with the keys ``Gene``, ``Genotype``, ``Optimal``,
        ``Trait``, ``Pathway`` and ``Performance_Score``. Text fields default
        to ``''`` and the score to ``0.0`` when a line is missing or cannot be
        parsed. The list is empty when the section is absent.
    """
    # Split content into sections keyed by their title line
    sections = {}
    current_section = None
    current_content = []
    # Start from an empty "previous line" so an underline on the very first
    # line cannot reference an unassigned variable.
    previous_line = ''

    for line in content.split('\n'):
        # '=' underlines mark main headers and '-' underlines sub-headers;
        # both open a new section named after the line above them.
        if line and set(line) in ({'='}, {'-'}):
            current_section = previous_line
            current_content = []
            sections[current_section] = current_content
        else:
            if current_section:
                current_content.append(line)
            previous_line = line

    # Group the lines of the Detailed Gene Analysis section by gene
    gene_entries = []
    current_gene = None
    for line in sections.get('Detailed Gene Analysis', []):
        stripped = line.strip()
        if not stripped:  # Skip empty lines
            continue
        if not line.startswith(' '):  # Gene name line
            current_gene = {'Gene': stripped.rstrip(':'), 'raw_data': []}
            gene_entries.append(current_gene)
        elif current_gene:  # Gene details
            current_gene['raw_data'].append(stripped)

    # Process each gene entry to extract structured data
    processed_genes = []
    for gene in gene_entries:
        gene_data = {
            'Gene': gene['Gene'],
            'Genotype': '',
            'Optimal': '',
            'Trait': '',
            'Pathway': '',
            'Performance_Score': 0.0
        }

        for line in gene['raw_data']:
            label, sep, value = line.partition(':')
            if not sep:
                continue
            value = value.strip()

            if label == 'Genotype':
                # "<actual> (Optimal: <optimal>)"; keep the genotype even if
                # the optimal marker is absent.
                actual, marker, optimal = value.partition('(Optimal:')
                gene_data['Genotype'] = actual.strip()
                if marker:
                    gene_data['Optimal'] = optimal.replace(')', '').strip()
            elif label == 'Trait':
                gene_data['Trait'] = value
            elif label == 'Pathway':
                gene_data['Pathway'] = value
            elif label == 'Performance Score':
                try:
                    gene_data['Performance_Score'] = float(value)
                except ValueError:
                    print(f"Error converting score to float: {value}")

        processed_genes.append(gene_data)

    return processed_genes

def create_gene_dataframe(genes_data):
    """Convert processed gene data into a pandas DataFrame.

    Args:
        genes_data: List of per-gene dicts, each carrying a
            ``Performance_Score`` entry.

    Returns:
        DataFrame sorted by ``Performance_Score`` in descending order. For
        empty input an empty frame with the expected columns is returned, so
        callers can test ``.empty`` instead of hitting a ``KeyError`` from
        sorting on a missing column.
    """
    if not genes_data:
        return pd.DataFrame(
            columns=['Gene', 'Genotype', 'Optimal', 'Trait', 'Pathway', 'Performance_Score']
        )
    df = pd.DataFrame(genes_data)
    # Sort by Performance_Score in descending order
    return df.sort_values('Performance_Score', ascending=False)

def read_genetic_report(report_file):
    """Read and parse the genetic report file.

    Args:
        report_file: Path of the plain-text report to read.

    Returns:
        Tuple ``(df, overall_scores)``: ``df`` is the per-gene DataFrame
        produced by ``create_gene_dataframe``; ``overall_scores`` maps the
        labels ``Balanced Score``, ``Endurance Score`` and ``Power Score`` to
        the text after their first colon. If the file cannot be read, an
        error is printed and ``(empty DataFrame, {})`` is returned.
    """
    try:
        # Decode as UTF-8 and replace undecodable bytes: only ASCII score and
        # gene lines are parsed, so a stray byte must not abort the read.
        with open(report_file, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
    except OSError as e:
        print(f"Error reading file: {e}")
        return pd.DataFrame(), {}

    # Extract overall scores
    score_labels = ('Balanced Score', 'Endurance Score', 'Power Score')
    overall_scores = {}
    for line in content.split('\n'):
        if not any(label in line for label in score_labels):
            continue
        # Split on the first colon only; lines with no colon are skipped
        # instead of raising on tuple unpacking.
        key, sep, value = line.partition(':')
        if sep:
            overall_scores[key.strip()] = value.strip()

    # Use the verified parser to extract gene data
    processed_genes = analyze_text_file_structure(content)
    df = create_gene_dataframe(processed_genes)

    return df, overall_scores

def generate_html_report():
    """Generate the HTML report.

    Parses the text report at ``GENETIC_REPORT_PATH``, renders one table row
    per gene (colour class chosen from the performance score: >= 0.6 high,
    >= 0.4 medium, otherwise low) and writes the page to ``OUTPUT_HTML``.
    This variant's template contains only the genotype styling and the gene
    table; the bracketed comments in it mark parts that were left out, and
    the parsed overall scores are not rendered.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    from html import escape

    # Load and process the genetic report
    genetic_df, overall_scores = read_genetic_report(GENETIC_REPORT_PATH)

    if genetic_df.empty:
        print("Error: No genetic data was loaded")
        return

    # Generate the gene analysis table HTML (collected in a list and joined
    # once instead of growing a string row by row).
    table_parts = ['''
    <div class="table-responsive">
        <table class="table">
            <thead>
                <tr>
                    <th scope="col">Gene</th>
                    <th scope="col">Genotype</th>
                    <th scope="col">Optimal</th>
                    <th scope="col">Trait</th>
                    <th scope="col">Pathway</th>
                    <th scope="col">Performance</th>
                </tr>
            </thead>
            <tbody>
    ''']

    for _, row in genetic_df.iterrows():
        performance_score = row['Performance_Score']

        # Determine genotype cell class based on performance score
        if performance_score >= 0.6:
            genotype_class = 'optimal-genotype'
            performance_class = 'high-performance'
        elif performance_score >= 0.4:
            genotype_class = 'moderate-genotype'
            performance_class = 'medium-performance'
        else:
            genotype_class = 'suboptimal-genotype'
            performance_class = 'low-performance'

        # Values come from a text file; escape them so characters such as
        # '<' or '&' cannot break the markup.
        gene, genotype, optimal, trait, pathway = (
            escape(str(row[column]))
            for column in ('Gene', 'Genotype', 'Optimal', 'Trait', 'Pathway')
        )

        table_parts.append(f'''
            <tr>
                <td class="gene-name">{gene}</td>
                <td class="{genotype_class}">{genotype}</td>
                <td>{optimal}</td>
                <td>{trait}</td>
                <td>{pathway}</td>
                <td class="{performance_class}">{performance_score:.2f}</td>
            </tr>
        ''')

    table_parts.append('''
            </tbody>
        </table>
    </div>
    ''')
    gene_table_html = ''.join(table_parts)

    # HTML template (add new CSS classes for genotype styling)
    html_content = f'''
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Athletic Genetics Performance Report</title>
        <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
        <style>
            /* [Previous CSS styles remain unchanged] */
            
            .optimal-genotype {{
                background-color: rgba(47, 133, 90, 0.1);  /* Light green background */
                color: #2f855a;
                font-weight: 600;
            }}
            
            .moderate-genotype {{
                background-color: rgba(183, 121, 31, 0.1);  /* Light yellow background */
                color: #b7791f;
                font-weight: 600;
            }}
            
            .suboptimal-genotype {{
                background-color: rgba(197, 48, 48, 0.1);  /* Light red background */
                color: #c53030;
                font-weight: 600;
            }}
            
            /* Add hover effect that preserves the color */
            .table tbody tr:hover .optimal-genotype {{
                background-color: rgba(47, 133, 90, 0.2);
            }}
            
            .table tbody tr:hover .moderate-genotype {{
                background-color: rgba(183, 121, 31, 0.2);
            }}
            
            .table tbody tr:hover .suboptimal-genotype {{
                background-color: rgba(197, 48, 48, 0.2);
            }}

            /* [Rest of the CSS styles remain unchanged] */
        </style>
    </head>
    <body>
        <!-- [Rest of the HTML template remains unchanged] -->
        {gene_table_html}
        <!-- [Rest of the HTML template remains unchanged] -->
    </body>
    </html>
    '''

    # Write the HTML file as UTF-8, matching the charset declared in <head>
    try:
        with open(OUTPUT_HTML, 'w', encoding='utf-8') as f:
            f.write(html_content)
        print(f"HTML report generated and saved to: {OUTPUT_HTML}")
    except OSError as e:
        print(f"Error writing HTML report: {e}")

def load_visualizations(analysis_dir=None):
    """Load and return visualization content.

    Every ``.html`` file in the directory is read and wrapped in a
    ``<div class="visualization-container">``; files are processed in sorted
    name order so the output is deterministic.

    Args:
        analysis_dir: Directory to scan. Defaults to the module-level
            ``ANALYSIS_DIR`` when omitted.

    Returns:
        The concatenated wrapper markup, or
        ``"<p>No visualizations available.</p>"`` when nothing was loaded.
        If reading fails part-way, the error is printed and whatever was
        loaded before the failure is returned.
    """
    directory = ANALYSIS_DIR if analysis_dir is None else analysis_dir
    sections = []
    try:
        visualization_files = sorted(
            name for name in os.listdir(directory) if name.endswith('.html')
        )
        for viz_file in visualization_files:
            file_path = os.path.join(directory, viz_file)
            # NOTE(review): the chart files are presumably complete HTML
            # documents; they are embedded verbatim inside the wrapper div.
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            sections.append(f'<div class="visualization-container">{content}</div>\n')
            print(f"Loaded visualization from {viz_file}")
    except (OSError, UnicodeError) as e:
        print(f"Error loading visualizations: {e}")

    return ''.join(sections) if sections else "<p>No visualizations available.</p>"

# Script entry point: build the HTML report when this code is run directly
# (not when it is imported as a module).
if __name__ == "__main__":
    generate_html_report()


HTML report generated and saved to: /Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/genetic_performance_report.html


In [291]:
import os
import pandas as pd
import re

# Locations used by the HTML report builder.
# The finished HTML page is written next to the raw genetics data.
GENETICS_DIR = "/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/"
OUTPUT_HTML = os.path.join(GENETICS_DIR, "genetic_performance_report.html")

# The text report and chart files are read from one specific, hard-coded
# timestamped analysis folder.
ANALYSIS_DIR = "/Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/analysis_20241023_160439/"
GENETIC_REPORT_PATH = os.path.join(ANALYSIS_DIR, "genetic_report.txt")

def analyze_text_file_structure(content):
    """Parse the text of a genetic report into one record per gene.

    A line consisting solely of ``=`` or ``-`` characters is treated as an
    underline that turns the preceding line into a section title. Only the
    ``Detailed Gene Analysis`` section is parsed further: an unindented line
    starts a gene entry, and the indented ``Genotype:``, ``Trait:``,
    ``Pathway:`` and ``Performance Score:`` lines below it fill in the record.

    Args:
        content: Full text of the report file.

    Returns:
        A list of dicts with the keys ``Gene``, ``Genotype``, ``Optimal``,
        ``Trait``, ``Pathway`` and ``Performance_Score``. Text fields default
        to ``''`` and the score to ``0.0`` when a line is missing or cannot be
        parsed. The list is empty when the section is absent.
    """
    # Split content into sections keyed by their title line
    sections = {}
    current_section = None
    current_content = []
    # Start from an empty "previous line" so an underline on the very first
    # line cannot reference an unassigned variable.
    previous_line = ''

    for line in content.split('\n'):
        # '=' underlines mark main headers and '-' underlines sub-headers;
        # both open a new section named after the line above them.
        if line and set(line) in ({'='}, {'-'}):
            current_section = previous_line
            current_content = []
            sections[current_section] = current_content
        else:
            if current_section:
                current_content.append(line)
            previous_line = line

    # Group the lines of the Detailed Gene Analysis section by gene
    gene_entries = []
    current_gene = None
    for line in sections.get('Detailed Gene Analysis', []):
        stripped = line.strip()
        if not stripped:  # Skip empty lines
            continue
        if not line.startswith(' '):  # Gene name line
            current_gene = {'Gene': stripped.rstrip(':'), 'raw_data': []}
            gene_entries.append(current_gene)
        elif current_gene:  # Gene details
            current_gene['raw_data'].append(stripped)

    # Process each gene entry to extract structured data
    processed_genes = []
    for gene in gene_entries:
        gene_data = {
            'Gene': gene['Gene'],
            'Genotype': '',
            'Optimal': '',
            'Trait': '',
            'Pathway': '',
            'Performance_Score': 0.0
        }

        for line in gene['raw_data']:
            label, sep, value = line.partition(':')
            if not sep:
                continue
            value = value.strip()

            if label == 'Genotype':
                # "<actual> (Optimal: <optimal>)"; keep the genotype even if
                # the optimal marker is absent.
                actual, marker, optimal = value.partition('(Optimal:')
                gene_data['Genotype'] = actual.strip()
                if marker:
                    gene_data['Optimal'] = optimal.replace(')', '').strip()
            elif label == 'Trait':
                gene_data['Trait'] = value
            elif label == 'Pathway':
                gene_data['Pathway'] = value
            elif label == 'Performance Score':
                try:
                    gene_data['Performance_Score'] = float(value)
                except ValueError:
                    print(f"Error converting score to float: {value}")

        processed_genes.append(gene_data)

    return processed_genes

def create_gene_dataframe(genes_data):
    """Convert processed gene data into a pandas DataFrame.

    Args:
        genes_data: List of per-gene dicts, each carrying a
            ``Performance_Score`` entry.

    Returns:
        DataFrame sorted by ``Performance_Score`` in descending order. For
        empty input an empty frame with the expected columns is returned, so
        callers can test ``.empty`` instead of hitting a ``KeyError`` from
        sorting on a missing column.
    """
    if not genes_data:
        return pd.DataFrame(
            columns=['Gene', 'Genotype', 'Optimal', 'Trait', 'Pathway', 'Performance_Score']
        )
    df = pd.DataFrame(genes_data)
    # Sort by Performance_Score in descending order
    return df.sort_values('Performance_Score', ascending=False)

def read_genetic_report(report_file):
    """Read and parse the genetic report file.

    Args:
        report_file: Path of the plain-text report to read.

    Returns:
        Tuple ``(df, overall_scores)``: ``df`` is the per-gene DataFrame
        produced by ``create_gene_dataframe``; ``overall_scores`` maps the
        labels ``Balanced Score``, ``Endurance Score`` and ``Power Score`` to
        the text after their first colon. If the file cannot be read, an
        error is printed and ``(empty DataFrame, {})`` is returned.
    """
    try:
        # Decode as UTF-8 and replace undecodable bytes: only ASCII score and
        # gene lines are parsed, so a stray byte must not abort the read.
        with open(report_file, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read()
    except OSError as e:
        print(f"Error reading file: {e}")
        return pd.DataFrame(), {}

    # Extract overall scores
    score_labels = ('Balanced Score', 'Endurance Score', 'Power Score')
    overall_scores = {}
    for line in content.split('\n'):
        if not any(label in line for label in score_labels):
            continue
        # Split on the first colon only; lines with no colon are skipped
        # instead of raising on tuple unpacking.
        key, sep, value = line.partition(':')
        if sep:
            overall_scores[key.strip()] = value.strip()

    # Use the verified parser to extract gene data
    processed_genes = analyze_text_file_structure(content)
    df = create_gene_dataframe(processed_genes)

    return df, overall_scores

def generate_html_report():
    """Generate the HTML report.

    Parses the text report at ``GENETIC_REPORT_PATH``, renders the three
    overall score cards, one table row per gene (colour class chosen from the
    performance score: >= 0.6 high, >= 0.4 medium, otherwise low) and the
    markup returned by ``load_visualizations()``, then writes the page to
    ``OUTPUT_HTML``.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    from html import escape

    # Load and process the genetic report
    genetic_df, overall_scores = read_genetic_report(GENETIC_REPORT_PATH)

    if genetic_df.empty:
        print("Error: No genetic data was loaded")
        return

    # Generate the gene analysis table HTML (collected in a list and joined
    # once instead of growing a string row by row).
    table_parts = ['''
    <div class="table-responsive">
        <table class="table">
            <thead>
                <tr>
                    <th scope="col">Gene</th>
                    <th scope="col">Genotype</th>
                    <th scope="col">Optimal</th>
                    <th scope="col">Trait</th>
                    <th scope="col">Pathway</th>
                    <th scope="col">Performance</th>
                </tr>
            </thead>
            <tbody>
    ''']

    for _, row in genetic_df.iterrows():
        performance_score = row['Performance_Score']

        if performance_score >= 0.6:
            performance_class = 'high-performance'
        elif performance_score >= 0.4:
            performance_class = 'medium-performance'
        else:
            performance_class = 'low-performance'

        # Values come from a text file; escape them so characters such as
        # '<' or '&' cannot break the markup.
        gene, genotype, optimal, trait, pathway = (
            escape(str(row[column]))
            for column in ('Gene', 'Genotype', 'Optimal', 'Trait', 'Pathway')
        )

        table_parts.append(f'''
            <tr>
                <td class="gene-name">{gene}</td>
                <td>{genotype}</td>
                <td>{optimal}</td>
                <td>{trait}</td>
                <td>{pathway}</td>
                <td class="{performance_class}">{performance_score:.2f}</td>
            </tr>
        ''')

    table_parts.append('''
            </tbody>
        </table>
    </div>
    ''')
    gene_table_html = ''.join(table_parts)

    # Resolve every dynamic fragment before filling the template, so the
    # template itself contains only plain placeholders.
    power_score = escape(str(overall_scores.get('Power Score', 'N/A')))
    endurance_score = escape(str(overall_scores.get('Endurance Score', 'N/A')))
    balanced_score = escape(str(overall_scores.get('Balanced Score', 'N/A')))
    visualizations_html = load_visualizations()

    # HTML template
    html_content = f'''
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Athletic Genetics Performance Report</title>
        <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
        <style>
            :root {{
                --navy-blue: #1a365d;
                --light-navy: #2c5282;
                --highlight-blue: #4299e1;
                --white: #ffffff;
                --off-white: #f7fafc;
                --text-gray: #2d3748;
            }}
            
            body {{
                background-color: var(--off-white);
                color: var(--text-gray);
                font-family: 'Inter', system-ui, -apple-system, sans-serif;
            }}
            
            .navbar {{
                background-color: var(--navy-blue) !important;
                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            }}
            
            .navbar-brand {{
                color: var(--white) !important;
                font-weight: 600;
                letter-spacing: 0.5px;
            }}
            
            .main-container {{
                max-width: 1200px;
                margin: 2rem auto;
                padding: 0 1.5rem;
            }}
            
            .section-header {{
                color: var(--navy-blue);
                font-weight: 700;
                margin: 2.5rem 0 1.5rem;
                position: relative;
                padding-bottom: 0.5rem;
            }}
            
            .section-header::after {{
                content: '';
                position: absolute;
                bottom: 0;
                left: 0;
                width: 60px;
                height: 4px;
                background-color: var(--highlight-blue);
                border-radius: 2px;
            }}
            
            .score-card {{
                background: linear-gradient(135deg, var(--navy-blue), var(--light-navy));
                color: var(--white);
                padding: 1.75rem;
                border-radius: 12px;
                margin-bottom: 1.5rem;
                box-shadow: 0 4px 6px rgba(0,0,0,0.1);
                transition: transform 0.2s ease;
            }}
            
            .score-card:hover {{
                transform: translateY(-2px);
            }}
            
            .score-label {{
                font-size: 1.1rem;
                margin-bottom: 0.75rem;
                opacity: 0.9;
                font-weight: 500;
            }}
            
            .score-value {{
                font-size: 2.25rem;
                font-weight: 700;
                letter-spacing: -0.5px;
            }}
            
            .table {{
                background-color: var(--white);
                border-radius: 12px;
                overflow: hidden;
                box-shadow: 0 4px 6px rgba(0,0,0,0.05);
                margin-top: 1.5rem;
            }}
            
            .table thead {{
                background-color: var(--navy-blue);
                color: var(--white);
            }}
            
            .table th {{
                font-weight: 600;
                padding: 1rem;
                font-size: 0.95rem;
                letter-spacing: 0.3px;
            }}
            
            .table td {{
                padding: 1rem;
                vertical-align: middle;
                border-bottom: 1px solid rgba(0,0,0,0.05);
            }}
            
            .gene-name {{
                font-weight: 600;
                color: var(--navy-blue);
            }}
            
            .high-performance {{
                color: #2f855a;
                font-weight: 600;
            }}
            
            .medium-performance {{
                color: #b7791f;
                font-weight: 600;
            }}
            
            .low-performance {{
                color: #c53030;
                font-weight: 600;
            }}
            
            .table tbody tr:hover {{
                background-color: var(--off-white);
                transition: background-color 0.2s ease;
            }}
            
            .visualization-container {{
                background-color: var(--white);
                border-radius: 12px;
                padding: 1.75rem;
                margin-bottom: 2rem;
                box-shadow: 0 4px 6px rgba(0,0,0,0.05);
            }}

            @media print {{
                .visualization-container {{
                    break-inside: avoid;
                }}
                
                .score-card {{
                    break-inside: avoid;
                }}
                
                .table {{
                    break-inside: avoid;
                }}
            }}
        </style>
    </head>
    <body>
        <nav class="navbar navbar-dark">
            <div class="container">
                <span class="navbar-brand">Athletic Genetics Report</span>
            </div>
        </nav>
        
        <div class="main-container">
            <h1 class="section-header">Overall Performance Scores</h1>
            <div class="row">
                <div class="col-md-4">
                    <div class="score-card">
                        <div class="score-label">Power Score</div>
                        <div class="score-value">{power_score}</div>
                    </div>
                </div>
                <div class="col-md-4">
                    <div class="score-card">
                        <div class="score-label">Endurance Score</div>
                        <div class="score-value">{endurance_score}</div>
                    </div>
                </div>
                <div class="col-md-4">
                    <div class="score-card">
                        <div class="score-label">Balanced Score</div>
                        <div class="score-value">{balanced_score}</div>
                    </div>
                </div>
            </div>

            <h2 class="section-header">Genetic Analysis</h2>
            {gene_table_html}
            
            <h2 class="section-header">Visualizations</h2>
            <div id="visualizations">
                {visualizations_html}
            </div>
        </div>

        <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
    </body>
    </html>
    '''

    # Write the HTML file as UTF-8, matching the charset declared in <head>
    try:
        with open(OUTPUT_HTML, 'w', encoding='utf-8') as f:
            f.write(html_content)
        print(f"HTML report generated and saved to: {OUTPUT_HTML}")
    except OSError as e:
        print(f"Error writing HTML report: {e}")

def load_visualizations(analysis_dir=None):
    """Load and return visualization content.

    Every ``.html`` file in the directory is read and wrapped in a
    ``<div class="visualization-container">``; files are processed in sorted
    name order so the output is deterministic.

    Args:
        analysis_dir: Directory to scan. Defaults to the module-level
            ``ANALYSIS_DIR`` when omitted.

    Returns:
        The concatenated wrapper markup, or
        ``"<p>No visualizations available.</p>"`` when nothing was loaded.
        If reading fails part-way, the error is printed and whatever was
        loaded before the failure is returned.
    """
    directory = ANALYSIS_DIR if analysis_dir is None else analysis_dir
    sections = []
    try:
        visualization_files = sorted(
            name for name in os.listdir(directory) if name.endswith('.html')
        )
        for viz_file in visualization_files:
            file_path = os.path.join(directory, viz_file)
            # NOTE(review): the chart files are presumably complete HTML
            # documents; they are embedded verbatim inside the wrapper div.
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            sections.append(f'<div class="visualization-container">{content}</div>\n')
            print(f"Loaded visualization from {viz_file}")
    except (OSError, UnicodeError) as e:
        print(f"Error loading visualizations: {e}")

    return ''.join(sections) if sections else "<p>No visualizations available.</p>"

# Script entry point: build the HTML report when this code is run directly
# (not when it is imported as a module).
if __name__ == "__main__":
    generate_html_report()


Loaded visualization from population_percentile.html
Loaded visualization from pathway_radar.html
Loaded visualization from gene_comparison.html
HTML report generated and saved to: /Users/ngirmay/Documents/GitHub/ironman_retrospective/IronMan_2023/genetics/genetic_performance_report.html
