# In [None]:
import altair as alt
import pandas as pd

# Validation-rate trend chart for the union call set.
#
# Builds one row per (read threshold, category, site type), computes the
# percentage of sites validated out of `union_totals`, and draws every series
# in a single Altair line chart: colour encodes the read threshold, dash
# pattern encodes the site type (called vs. control).

# These numbers were derived from transpose_counts.sh output.
# Every list holds one count per entry of `categories`, in the same order.
union_pos_1plus = [4778, 1710, 1071, 929, 4694]
union_homog_neg_1plus = [1434, 147, 51, 46, 79]
union_pos_2plus = [3921, 1634, 1056, 925, 4694]
union_homog_neg_2plus = [1007, 104, 29, 32, 32]
union_pos_5plus = [2208, 1119, 869, 886, 4691]
union_homog_neg_5plus = [798, 84, 20, 23, 15]
union_pos_8plus = [1383, 555, 553, 774, 4667]
union_homog_neg_8plus = [694, 78, 19, 22, 13]
union_pos_10plus = [1144, 324, 379, 702, 4644]
union_homog_neg_10plus = [657, 72, 18, 19, 12]
union_totals = [20374, 2132, 1109, 939, 4699]  # Took the lower of the two coverages

categories = ['Σ C(5,1)', 'Σ C(5,2)', 'Σ C(5,3)', 'Σ C(5,4)', 'Σ C(5,5)']

# Site-type labels, defined once. The same strings are written into the rows
# and used as the strokeDash scale domain; previously the rows said
# 'RUFUS-Called' while the domain said 'RUFUS/Mutect2-Called', so the called
# series never matched a domain entry.
CALLED_SITE_TYPE = 'RUFUS/Mutect2-Called'
CONTROL_SITE_TYPE = 'Control'

# Read-depth threshold label -> (called-site counts, control-site counts).
# Insertion order also defines the order of the colour scale below.
datasets = {
    '1+ Reads': (union_pos_1plus, union_homog_neg_1plus),
    '2+ Reads': (union_pos_2plus, union_homog_neg_2plus),
    '5+ Reads': (union_pos_5plus, union_homog_neg_5plus),
    '8+ Reads': (union_pos_8plus, union_homog_neg_8plus),
    '10+ Reads': (union_pos_10plus, union_homog_neg_10plus),
}

# Create comprehensive dataset for line chart
trend_data = []
for threshold, (pos_validated, neg_validated) in datasets.items():
    # zip() would silently truncate, so reject mis-sized count lists up front.
    if not (len(pos_validated) == len(neg_validated)
            == len(union_totals) == len(categories)):
        raise ValueError(
            f'{threshold}: expected {len(categories)} counts per list '
            f'(one per category), got {len(pos_validated)} called, '
            f'{len(neg_validated)} control and {len(union_totals)} totals'
        )

    for category, total, pos_count, neg_count in zip(
            categories, union_totals, pos_validated, neg_validated):
        # Called row first, then control row, for every category.
        for site_type, validated in ((CALLED_SITE_TYPE, pos_count),
                                     (CONTROL_SITE_TYPE, neg_count)):
            trend_data.append({
                'category': category,
                'threshold': threshold,
                # Percentage of the category total; 0 when the total is empty
                # to avoid dividing by zero.
                'validation_rate': (validated / total * 100) if total > 0 else 0,
                'site_type': site_type,
                'line_id': f'{threshold} {site_type}',
                'validated_count': validated,
                'total_count': total,
            })

trend_df = pd.DataFrame(trend_data)

# Create the single line chart with all 10 lines
line_chart = alt.Chart(trend_df).mark_line(
    strokeWidth=4,
    point=alt.OverlayMarkDef(size=400, filled=True)
).encode(
    x=alt.X('category:O',
            title='Category',
            # Plot the categories in reverse of their definition order.
            sort=list(reversed(categories)),
            axis=alt.Axis(labelAngle=45, labelFontSize=28, titleFontSize=36)),
    y=alt.Y('validation_rate:Q',
            title='Validation Percentage (%)',
            scale=alt.Scale(domain=[0, 100]),
            axis=alt.Axis(labelFontSize=28, titleFontSize=36)),
    color=alt.Color('threshold:N',
                    scale=alt.Scale(
                        # Derived from `datasets` so it cannot drift from the data.
                        domain=list(datasets),
                        range=['#D03227', '#EE8432', '#66AC55', '#4779B2', '#8C4E9D'],  # One color per threshold
                    ),
                    legend=None),
    strokeDash=alt.StrokeDash('site_type:N',
                              scale=alt.Scale(
                                  domain=[CALLED_SITE_TYPE, CONTROL_SITE_TYPE],
                                  range=[[0], [20, 20]]  # Solid for union-Called, dashed for Control
                              ),
                              legend=None),
    tooltip=['threshold:O', 'category:O', 'site_type:N', 'validation_rate:Q', 'validated_count:Q', 'total_count:Q']
).properties(
    width=1200,
    height=600,
    title=alt.TitleParams(
        text="Validation Rates Across All Thresholds",
        fontSize=36,
        anchor="middle",
        fontWeight='bold',
    )
)

# Display the chart
line_chart.show()

# Optional: Save the chart
# line_chart.save('validation_trends.html')
# line_chart.save('validation_trends.png', scale_factor=2.0)