In [None]:
from scipy import odr
import matplotlib as mpl
import matplotlib.colors as mcolors
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import gridspec
import matplotlib.colors as mcolors
import plotly.express as px, plotly.graph_objects as go
import os
import seaborn as sns
from matplotlib.colors import to_rgba

# Turn off pandas' chained-assignment warning for the whole notebook.
pd.set_option("mode.chained_assignment", None)

# Matplotlib defaults: fonttype 42 for PDF output, base font size 13.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'font.size': 13,
})

# Three anchor colours for the notebook's diverging palette:
# terracotta -> cream -> dark slate-blue.
colors = [
    "#e07a5f",
    "#f4f1de",
    "#3d405b",
]

# Interpolate linearly between the anchors to build the colormap.
custom_cmap = mcolors.LinearSegmentedColormap.from_list(name="custom_cmap", colors=colors)
# Bare expression: the notebook displays the colormap as the cell output.
custom_cmap

In [None]:
# Timing-result CSV per cohort; the keys are reused as cohort labels below.
files = dict(
    INFORM="INFORM_germline_filtered_Breast_WGD_HRD_TimingResults_v9_timing_nboot200.csv",
    SCANB="SCANB_unfiltered_Breast_WGD_HRD_TimingResults_v9_timing_nboot200.csv",
    PCAWG="PCAWG_Breast_WGD_HRD_TimingResults_v9_timing_nboot200.csv",
)

# Marker colour for each cohort.
colors = dict(INFORM='#3c1518', SCANB='#71816d', PCAWG='#e26d5c')

# Read each cohort, order its samples by HRDTime, and place the cohorts side
# by side on one shared integer x axis.
timing_cols = ['ID', 'HRDTime', 'HRDTime_ci', 'WGDTime', 'WGDTime_ci']
sorted_dfs, cohort_boundaries = [], []
x_pos = 0
for k, csv_path in files.items():
    df = (
        pd.read_csv(csv_path)[timing_cols]
        .dropna()  # keep only samples with every timing column present
        .sort_values('HRDTime')
        .reset_index(drop=True)
    )
    df['Dataset'] = k
    x_end = x_pos + len(df)
    df['x'] = range(x_pos, x_end)
    x_pos = x_end
    # x position at which the next cohort starts.
    cohort_boundaries.append(x_pos)
    sorted_dfs.append(df)

df_all = pd.concat(sorted_dfs, ignore_index=True)

# Plot: one marker with an error bar per sample for each timing estimate.
plt.figure(figsize=(18, 6))

# sort=False keeps the cohorts in the order they appear along the x axis
# (the default alphabetical sort would reorder the legend entries).
for k, g in df_all.groupby('Dataset', sort=False):
    # WGDTime (lower opacity), error bars taken from WGDTime_ci
    plt.errorbar(g['x'], g['WGDTime'], yerr=g['WGDTime_ci'],
                 fmt='o', color=colors[k], ecolor='lightgrey', alpha=0.3, capsize=2, label=f"{k} WGDTime")

    # HRDTime (full opacity), error bars taken from HRDTime_ci
    plt.errorbar(g['x'], g['HRDTime'], yerr=g['HRDTime_ci'],
                 fmt='o', color=colors[k], ecolor='lightgrey', capsize=2, label=f"{k} HRDTime")

# Vertical dashed lines between cohorts; the last boundary is the right-hand
# end of the axis, so it is skipped.
for boundary in cohort_boundaries[:-1]:
    plt.axvline(x=boundary - 0.5, color='grey', linestyle='--', linewidth=1)

# Formatting
plt.xticks(df_all['x'], df_all['ID'], rotation=90, fontsize=6)
plt.ylim(-0.02, 1.06)
plt.ylabel("Timing")
plt.xlabel("Sample ID")
plt.title("HRDTime and WGDTime Estimates per Sample by Cohort")
plt.tight_layout(rect=[0, 0, 0.8, 1])  # leave space on the right
plt.legend(loc='center left', bbox_to_anchor=(1.02, 0.5), fontsize=8, title="Cohort")

# savefig does not create missing directories, so make sure the target exists.
out_path = "AAA_Apr25_plots/HRD_WGD_Timing_per_sample_by_cohort.pdf"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.savefig(out_path, format='pdf', bbox_inches='tight')
plt.show()


In [None]:
# Hex colour per cohort, keyed by cohort name.
colors = dict(INFORM='#3c1518', SCANB='#71816d', PCAWG='#e26d5c')

def parse_list(s):
    """Parse a stringified numeric list (e.g. ``"[3, nan, 7]"``) into a float array.

    Parameters
    ----------
    s : str
        Text of a Python list literal. The bare name ``nan`` is accepted and
        mapped to ``np.nan``.

    Returns
    -------
    numpy.ndarray
        The parsed values with dtype ``float``. If the text cannot be
        evaluated or converted, the one-element array ``[nan]`` is returned.
    """
    # SECURITY NOTE: eval() is applied to text read from CSV files. Builtins
    # are blanked out so a cell cannot reach __import__/open/exit, but eval is
    # still not a sandbox -- only run this on files you trust.
    try:
        return np.array(eval(s, {"__builtins__": {}, "nan": np.nan}), dtype=float)
    except Exception:
        # Best-effort fallback for malformed entries. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        return np.array([np.nan])

# Rebuild the per-cohort tables, this time keeping N_mut_all and adding its
# per-sample total.
mut_cols = ['ID', 'HRDTime', 'HRDTime_ci', 'WGDTime', 'WGDTime_ci', 'N_mut_all']
sorted_dfs, cohort_boundaries = [], []
x_pos = 0

for k, csv_path in files.items():
    df = pd.read_csv(csv_path)[mut_cols].dropna()

    # N_mut_all holds list text: parse each entry, then take its NaN-ignoring sum.
    parsed_counts = df['N_mut_all'].apply(parse_list)
    df['N_mut_all_sum'] = parsed_counts.apply(np.nansum)

    # Order samples by HRDTime within the cohort.
    df = df.sort_values('HRDTime').reset_index(drop=True)
    df['Dataset'] = k

    # Give this cohort the next contiguous run of x positions.
    x_end = x_pos + len(df)
    df['x'] = range(x_pos, x_end)
    x_pos = x_end
    cohort_boundaries.append(x_pos)
    sorted_dfs.append(df)

df_all = pd.concat(sorted_dfs, ignore_index=True)

# Colour scale for the strip: log10(N_mut_all_sum + 1). The same +1 offset is
# applied to both ends of the range and to every plotted value, so the largest
# sample maps exactly to the top of the colormap and a zero count cannot
# produce log10(0).
log_mut = np.log10(df_all['N_mut_all_sum'] + 1)
norm = mcolors.Normalize(vmin=log_mut.min(), vmax=log_mut.max())
cmap = custom_cmap

# Set up figure with a single row for heatmap strip
fig = plt.figure(figsize=(15, 4))
gs = gridspec.GridSpec(1, 1)

# Heatmap strip: one unit-wide rectangle per sample, centred on its x position
# and filled according to the log-scaled mutation count.
ax2 = fig.add_subplot(gs[0])
for x_center, log_n in zip(df_all['x'], log_mut):
    ax2.add_patch(plt.Rectangle((x_center - 0.5, 0), 1, 1,
                                color=cmap(norm(log_n)), linewidth=0))

ax2.set_xlim(-0.5, len(df_all) - 0.5)
ax2.set_ylim(0, 1)
ax2.set_yticks([])
ax2.set_xticks(df_all['x'])
ax2.set_xticklabels(df_all['ID'], rotation=90, fontsize=6)
ax2.set_xlabel("Sample ID")

# Colorbar built from a standalone mappable, since the strip is drawn with
# patches rather than an image. The plotted quantity is log10(sum + 1).
sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax2, orientation='horizontal', pad=0.7, aspect=70)
cbar.set_label("Log10(N_mut_all)")

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
out_path = "AAA_Apr25_plots/N_Mut_all_HRD_WGD_Timing_per_sample_by_cohort.pdf"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
plt.savefig(out_path, format='pdf', bbox_inches='tight')
plt.show()
