In [96]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Per-variant-type lengths. Further down they form the base of `scale_factor`,
# the value that site counts are divided by before binning.
# NOTE(review): units and provenance are not recorded in this notebook —
# confirm how these numbers were derived.
len_syn = 1308.0216666666665
len_mis = 2616.043333333333
len_lof = 167.616

In [3]:
# Proportion kept per variant type; multiplied into the matching length when
# `scale_factor` is built for synonymous and missense variants.
# NOTE(review): presumably 32320 is the number of retained sites and the
# denominators are the totals before subsampling — confirm.
prop_kept_syn = 32320/247378
prop_kept_mis = 32320/505963
# The lof scale factor below uses len_lof on its own, so this value of 1 is
# never multiplied in.
prop_kept_lof = 1

In [None]:
### AVERAGE SFS

In [14]:
# Average SFS per sampling scheme.
#
# For every variant type this cell reads the '.SIRsfs' tables, takes the mean
# and std of COUNT_SITES over rows sharing an OBS_N, divides both by a
# variant-type-specific scale factor, sums them into log-spaced OBS_N bins and
# writes one long-format table to 'sfs_binned_summary.csv'.
nbins = 13
n = 10000

# Directory holding every '.SIRsfs' table read by this cell.
SFS_DIR = '../empirical/subsampling_SIR_v20250127/results/sfs'

# Not used further down in this cell; kept because they are notebook globals.
sfs_unif = pd.read_csv(f'{SFS_DIR}/chr1_lof_uniformgeo_nSIR10000_nSIRreps10.SIRsfs', sep=' ')
sfs_unif_grouped = sfs_unif.groupby('OBS_N')['COUNT_SITES'].agg(['mean', 'std'])
sfs_unif_avg = sfs_unif_grouped['mean']

# Bin edges: rounded, de-duplicated log-spaced values between 1 and n/2.
# Keep the edges <= 100, drop the largest of them, then close with 100 and
# +inf so that every OBS_N >= 100 lands in a single open-ended bin.
log_bins = np.logspace(np.log10(1), np.log10(n/2), num=nbins)
log_bins = np.unique(np.round(log_bins))
log_bins = log_bins[log_bins <= 100][:-1]
log_bins = np.append(log_bins, [100, np.inf])

# Geometric mean of each bin's edges; the open-ended bin is pinned to 100
# because sqrt(100 * inf) is inf.
bin_centers = np.sqrt(log_bins[:-1] * log_bins[1:])
bin_centers[-1] = 100

vartypes = ['synonymous', 'missense', 'lof']
centers_geo = ['centerE16N4', 'centerE9N9', 'centerE6N4']
w_list_geo = ['50000', '100000', '150000']
labs_geo = ['A', 'B', 'C', 'D']  # A-C: one per geo width, D: uniform geo

centers_pca = ['centerX19Y4']
w_list_pca = [0.0015, 0.0025, 0.005]
labs_pca = ['E', 'F', 'G', 'H']  # E-G: one per PCA width, H: uniform PCA

# Counts are divided by length * proportion kept (lof uses its length alone).
# A dict lookup raises KeyError for an unknown variant type instead of
# silently reusing the factor left over from the previous iteration.
scale_factors = {
    'synonymous': len_syn * prop_kept_syn,
    'missense': len_mis * prop_kept_mis,
    'lof': len_lof,
}


def _sfs_path(vt, scheme):
    """Return the path of the .SIRsfs table for variant type `vt` and sampling `scheme`."""
    return f'{SFS_DIR}/chr1_{vt}_{scheme}_nSIR10000_nSIRreps10.SIRsfs'


def _load_scaled_stats(vt, scheme, scale_factor):
    """Read one table and return (mean, std) of COUNT_SITES per OBS_N, each divided by `scale_factor`."""
    table = pd.read_csv(_sfs_path(vt, scheme), sep=' ')
    stats = table.groupby('OBS_N')['COUNT_SITES'].agg(['mean', 'std'])
    return stats['mean'] / scale_factor, stats['std'] / scale_factor


def _sum_into_bins(obs_n, weights):
    """Sum `weights` into the `log_bins` bins according to `obs_n`."""
    binned, _ = np.histogram(obs_n, bins=log_bins, weights=weights)
    return binned


def _append_panel(rows, vt, panel, avg_binned, std_binned):
    """Append one [vt, panel, bin center, binned mean, binned std] row per bin to `rows`."""
    for bin_center, avg_value, std_value in zip(bin_centers, avg_binned, std_binned):
        rows.append([vt, panel, bin_center, avg_value, std_value])


sfs_data = []
for vt in vartypes:
    print(vt)
    scale_factor = scale_factors[vt]
    print(scale_factor)

    # Uniform PCA sampling -> panel H.
    unif_avg, unif_std = _load_scaled_stats(vt, 'uniformpca', scale_factor)
    _append_panel(
        sfs_data, vt, "H",
        _sum_into_bins(unif_avg.index, unif_avg.values),
        _sum_into_bins(unif_avg.index, unif_std.values),
    )

    # Gaussian PCA sampling -> panels E, F, G (one per width).
    for panel, w in zip(labs_pca, w_list_pca):
        center_stats = [_load_scaled_stats(vt, f'{center}pca{w}', scale_factor)
                        for center in centers_pca]
        combined_avg = pd.concat([avg for avg, _ in center_stats], axis=1)
        combined_std = pd.concat([std for _, std in center_stats], axis=1)
        # NOTE(review): flattening only lines up with the index while
        # centers_pca holds a single center; with more centers np.histogram
        # would reject the weights because their shape differs from the index.
        _append_panel(
            sfs_data, vt, panel,
            _sum_into_bins(combined_avg.index, combined_avg.values.flatten()),
            _sum_into_bins(combined_avg.index, combined_std.values.flatten()),
        )

    # Uniform geographic sampling -> panel D.
    unif_avg, unif_std = _load_scaled_stats(vt, 'uniformgeo', scale_factor)
    _append_panel(
        sfs_data, vt, "D",
        _sum_into_bins(unif_avg.index, unif_avg.values),
        _sum_into_bins(unif_avg.index, unif_std.values),
    )

    # Gaussian geographic sampling -> panels A, B, C (one per width),
    # combined across the centers in centers_geo.
    for panel, w in zip(labs_geo, w_list_geo):
        center_stats = [_load_scaled_stats(vt, f'{center}geo{w}', scale_factor)
                        for center in centers_geo]
        combined_avg = pd.concat([avg for avg, _ in center_stats], axis=1)
        combined_std = pd.concat([std for _, std in center_stats], axis=1)
        avg_across_centers = combined_avg.mean(axis=1)
        # NOTE(review): this is the std across centers of the per-center
        # stds, not the std of the per-center means — confirm it is intended.
        std_across_centers = combined_std.std(axis=1)
        _append_panel(
            sfs_data, vt, panel,
            _sum_into_bins(avg_across_centers.index, avg_across_centers.values),
            _sum_into_bins(avg_across_centers.index, std_across_centers.values),
        )

sfs_df = pd.DataFrame(sfs_data, columns=['Variant_Type', 'Panel', 'Bin_Center', 'Binned_SFS', 'Binned_Std'])
sfs_df.to_csv("sfs_binned_summary.csv", index=False)

synonymous
170.89337073897704
missense
167.10810974979066
lof
167.616


In [None]:
### RATIOS

In [97]:
# Per-replicate binned SFS.
#
# For every variant type, sampling scheme and SIR replicate this cell sums the
# scaled COUNT_SITES into log-spaced OBS_N bins and collects one row per bin
# in `sfs_data`; the ratio cells below consume that list.
nbins = 13
n = 10000

# Directory holding every '.SIRsfs' table read by this cell.
SFS_DIR = '../empirical/subsampling_SIR_v20250127/results/sfs'

# Not used further down in this cell; kept because they are notebook globals.
sfs_unif = pd.read_csv(f'{SFS_DIR}/chr1_lof_uniformgeo_nSIR10000_nSIRreps10.SIRsfs', sep=' ')
sfs_unif_grouped = sfs_unif.groupby('OBS_N')['COUNT_SITES'].agg(['mean', 'std'])
sfs_unif_avg = sfs_unif_grouped['mean']

# Bin edges: rounded, de-duplicated log-spaced values between 1 and n/2.
# Keep the edges <= 100, drop the largest of them, then close with 100 and
# +inf so that every OBS_N >= 100 lands in a single open-ended bin.
log_bins = np.logspace(np.log10(1), np.log10(n/2), num=nbins)
log_bins = np.unique(np.round(log_bins))
log_bins = log_bins[log_bins <= 100][:-1]
log_bins = np.append(log_bins, [100, np.inf])

# Geometric mean of each bin's edges; the open-ended bin is pinned to 100
# because sqrt(100 * inf) is inf.
bin_centers = np.sqrt(log_bins[:-1] * log_bins[1:])
bin_centers[-1] = 100

vartypes = ['synonymous', 'missense', 'lof']
centers_geo = ['centerE16N4', 'centerE9N9', 'centerE6N4']
w_list_geo = ['50000', '100000', '150000']
labs_geo = ['A', 'B', 'C', 'D']  # A-C: one per geo width, D: uniform geo

centers_pca = ['centerX19Y4']
w_list_pca = [0.0015, 0.0025, 0.005]
labs_pca = ['E', 'F', 'G', 'H']  # E-G: one per PCA width, H: uniform PCA

# SIR_REP values processed for every table.
sir_reps = np.arange(1, 11)

# (file-name tag, centers, widths, panel labels, uniform panel label),
# processed in this order so rows are appended PCA first, then geo.
panel_families = (
    ('pca', centers_pca, w_list_pca, labs_pca, "H"),
    ('geo', centers_geo, w_list_geo, labs_geo, "D"),
)

# Counts are divided by length * proportion kept (lof uses its length alone).
# A dict lookup raises KeyError for an unknown variant type instead of
# silently reusing the factor left over from the previous iteration.
scale_factors = {
    'synonymous': len_syn * prop_kept_syn,
    'missense': len_mis * prop_kept_mis,
    'lof': len_lof,
}


def _read_sfs(vt, scheme):
    """Read the .SIRsfs table for variant type `vt` and sampling `scheme`."""
    return pd.read_csv(f'{SFS_DIR}/chr1_{vt}_{scheme}_nSIR10000_nSIRreps10.SIRsfs', sep=' ')


def _bin_replicate(table, rep, scale_factor):
    """Sum COUNT_SITES / `scale_factor` of replicate `rep` into the `log_bins` bins.

    The weights are computed on the fly, so neither `table` nor the filtered
    rows are modified.
    """
    rep_rows = table[table['SIR_REP'] == rep]
    binned, _ = np.histogram(
        rep_rows['OBS_N'],
        bins=log_bins,
        weights=rep_rows['COUNT_SITES'] / scale_factor,
    )
    return binned


def _append_rep_rows(rows, vt, panel, rep, center, binned):
    """Append one [vt, panel, rep, center, bin center, binned value] row per bin to `rows`."""
    for bin_center, value in zip(bin_centers, binned):
        rows.append([vt, panel, rep, center, bin_center, value])


sfs_data = []
for vt in vartypes:
    print(vt)
    scale_factor = scale_factors[vt]
    print(scale_factor)

    for tag, centers, widths, labels, uniform_panel in panel_families:
        # Read every table once per variant type rather than once per
        # replicate; the contents do not depend on `rep`.
        uniform_table = _read_sfs(vt, f'uniform{tag}')
        gaussian_tables = {
            (w, center): _read_sfs(vt, f'{center}{tag}{w}')
            for w in widths
            for center in centers
        }

        for rep in sir_reps:
            _append_rep_rows(
                sfs_data, vt, uniform_panel, rep, 'uniform',
                _bin_replicate(uniform_table, rep, scale_factor),
            )
            for panel, w in zip(labels, widths):
                for center in centers:
                    _append_rep_rows(
                        sfs_data, vt, panel, rep, center,
                        _bin_replicate(gaussian_tables[(w, center)], rep, scale_factor),
                    )


synonymous
170.89337073897704
missense
167.10810974979066
lof
167.616


In [98]:
# Long-format table of the per-replicate rows collected in `sfs_data`.
rep_columns = ['Variant Type', 'Panel', 'Rep', 'center', 'bin center', 'sfs_binned']
sfs_df_rep = pd.DataFrame(sfs_data, columns=rep_columns)

In [99]:
# Ratio of panel A (first geo width) to panel D (uniform geo), matched on
# replicate, bin and variant type.
df = sfs_df_rep
df_A = df.loc[df['Panel'].eq('A')]
df_D = df.loc[df['Panel'].eq('D')]
df_ratio = df_A.merge(
    df_D,
    on=['Rep', 'bin center', 'Variant Type'],
    suffixes=('_A', '_D'),
)
df_ratio['ratio'] = df_ratio['sfs_binned_A'].div(df_ratio['sfs_binned_D'])

In [100]:
# Mean and standard deviation of the ratio within each
# (variant type, bin center) group of `df_ratio`.
ratio_by_bin = df_ratio.groupby(['Variant Type', 'bin center'])
df_ratio_grouped = ratio_by_bin.agg(
    ratio=pd.NamedAgg(column='ratio', aggfunc='mean'),
    stdev=pd.NamedAgg(column='ratio', aggfunc='std'),
).reset_index()

In [101]:
# Persist whatever `df_ratio_grouped` currently holds.
# index=False is not passed (unlike the sfs_binned_summary.csv export), so
# pandas also writes the row index as a leading, unnamed column.
df_ratio_grouped.to_csv('ratios.csv')

In [102]:
# Ratio of panel E (first PCA width) to panel H (uniform PCA), matched on
# replicate, bin and variant type. Overwrites `df_ratio`.
df = sfs_df_rep
df_E = df.loc[df['Panel'].eq('E')]
df_H = df.loc[df['Panel'].eq('H')]
df_ratio = df_E.merge(
    df_H,
    on=['Rep', 'bin center', 'Variant Type'],
    suffixes=('_E', '_H'),
)
df_ratio['ratio'] = df_ratio['sfs_binned_E'].div(df_ratio['sfs_binned_H'])

In [103]:
# Mean and standard deviation of the ratio within each
# (variant type, bin center) group of `df_ratio`, then export to CSV.
ratio_by_bin = df_ratio.groupby(['Variant Type', 'bin center'])
df_ratio_grouped = ratio_by_bin.agg(
    ratio=pd.NamedAgg(column='ratio', aggfunc='mean'),
    stdev=pd.NamedAgg(column='ratio', aggfunc='std'),
).reset_index()
df_ratio_grouped.to_csv('ratios_pca.csv')

In [104]:
# Display the most recently assigned ratio summary; when the cells run in
# order this is the E/H (PCA) table written to 'ratios_pca.csv'.
df_ratio_grouped

Unnamed: 0,Variant Type,bin center,ratio,stdev
0,lof,1.414214,0.87141,0.020613
1,lof,2.828427,0.820234,0.065244
2,lof,5.656854,0.855524,0.072503
3,lof,11.661904,0.760761,0.086532
4,lof,24.392622,0.685131,0.085319
5,lof,59.160798,0.615239,0.057704
6,lof,100.0,0.793529,0.034099
7,missense,1.414214,0.845347,0.020035
8,missense,2.828427,0.861208,0.056515
9,missense,5.656854,0.810716,0.039695
