## Seeds experiments
- This experiment tests the samples present in the single cell line GM12878 ('201666260058_R06C01', '201666260058_R06C02', '201662330194_R02C01','201662330194_R01C01' and '201662330194_R01C02').
- For each chromosome, 10% of the positions/SNPs were masked in a random and reproducible way using `random.sample` from the `random` module with fixed seeds (1 to 50).
- Thus, for each seed the selected SNPs are masked in the single-cell SNP array data, imputed back, and finally compared with the bulk SNP array data.
- Similarity coefficients are calculated before (to test the initial situation) and after imputation (to test a possible improvement).
- A total of 1100 imputations were performed in the GenomeDK server using 25 CPUs. A total of 5500 jaccard scores were calculated referring to 22 chromosomes, excluding sex chromosomes.
- The coefficients are computed with the `jaccard_score` function from the `sklearn.metrics` module.
- Using scatterplots, the jaccard scores before and after imputation are plotted.
- Subsequently, a paired t-test is used to assess, for each chromosome, whether the similarity coefficients are significantly higher after imputation than before.


Note: in the single-cell file the samples have been renamed in the same order as in the bulk file!
The variable names have been changed but the order is the same, so the initial order can be reconstructed.

In [None]:
# import libraries
# ! pip install openpyxl
# ! pip install -U kaleido
# ! pip install ipykernel
# ! 
import pandas as pd
import numpy as np
import random as rd
import subprocess
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.metrics import jaccard_score
import scipy as sp
import scipy.stats as stats
import openpyxl
import os
import seaborn as sns

# avoid warnings
# Setting this pandas option to None turns off the chained-assignment
# (SettingWithCopy) check for the whole session; the cells below assign into
# filtered DataFrame slices.
pd.options.mode.chained_assignment = None

In [None]:
# load datasets
# Every VCF is read the same way: the line at 0-based index 30 is used as the
# column header, fields are tab-separated and all values are kept as strings.
_vcf_read_opts = dict(header=30, sep='\t', dtype='object')

# bulk (gDNA) replicates 1-5
bk_1, bk_2, bk_3, bk_4, bk_5 = [
    pd.read_csv(f'data/GM12878_gDNA_{rep}.vcf', **_vcf_read_opts)
    for rep in range(1, 6)
]
# single-cell samples 129-133
sc_129, sc_130, sc_131, sc_132, sc_133 = [
    pd.read_csv(f'data/GM12878_SC_{cell}.vcf.gz', **_vcf_read_opts)
    for cell in range(129, 134)
]

# merge datasets
# The first table is kept whole; from each of the others only the last column
# is appended.
bk_merg = pd.concat(
    [bk_1] + [table.iloc[:, -1] for table in (bk_2, bk_3, bk_4, bk_5)],
    axis=1,
)
sc_merg = pd.concat(
    [sc_129] + [table.iloc[:, -1] for table in (sc_130, sc_131, sc_132, sc_133)],
    axis=1,
)

In [None]:
# Preview the first three rows of the merged bulk table.
bk_merg.head(3)

In [None]:
# Preview the first three rows of the merged single-cell table.
sc_merg.head(3)

In [None]:
# set the paths
# All three are relative paths ending in '/'; the cells below append their
# sub-folder and file names with os.path.join.
result_dir = '../server/results/seeds/'  # masked-position lists and Jaccard score tables
output_dir = '../server/outputs/seeds/'  # per-chromosome, masked, check and imputed VCF tables
plot_dir = '../server/plots/seeds/'      # figures (only referenced by the commented-out save calls)

In [None]:
# set variables
#samples = ['201666260058_R06C01', '201666260058_R06C02', '201662330194_R02C01', '201662330194_R01C01', '201662330194_R01C02']
# Chromosomes 1-22, as integers and as the strings found in the '#CHROM' column.
chromosomes = [*range(1, 23)]
chromosomes_str = list(map(str, chromosomes))
# Identifiers used in the input file names (bulk replicates / single cells).
bulk_ids = [str(rep) for rep in range(1, 6)]
singlecell_ids = [str(cell) for cell in range(129, 134)]

In [None]:
#all_positions = bk_merg.POS.to_list()
# Remember the headers of the merged tables before they are renamed.
column_names_original_bk = list(bk_merg.columns)
column_names_original_sc = list(sc_merg.columns)
# Common header applied to both tables: the nine fixed VCF columns followed by
# the five sample columns renamed rep_1 ... rep_5.
column_names = [
    '#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
] + [f'rep_{idx}' for idx in range(1, 6)]

In [None]:
# splitting merged dataframes for each chromosome
def _genotype_only_table(table, names):
    """Return a renamed copy of *table* whose sample columns hold only the GT field.

    Working on an explicit copy (instead of a filtered slice of the merged
    table) keeps the assignments below from touching, or depending on, the
    original DataFrame.
    """
    table = table.copy()
    table.columns = names
    table['FORMAT'] = 'GT'
    # Sample columns start at position 9; keep the text before the first ':'.
    for sample_col in names[9:]:
        table[sample_col] = table[sample_col].str.split(':').str.get(0)
    return table


for chrom in chromosomes_str:
    bk_merg_chr = _genotype_only_table(bk_merg[bk_merg['#CHROM'] == chrom], column_names)
    sc_merg_chr = _genotype_only_table(sc_merg[sc_merg['#CHROM'] == chrom], column_names)

    # Tab-separated tables, one pair per chromosome (header row, no index column).
    bk_merg_chr.to_csv(os.path.join(output_dir, f'merged_chr/GM12878_gDNA_merged_chr{chrom}.vcf'), sep='\t', index=False)
    sc_merg_chr.to_csv(os.path.join(output_dir, f'merged_chr/GM12878_SC_merged_chr{chrom}.vcf.gz'), sep='\t', index=False)


In [None]:
# Sanity check: read back the chromosome 2 bulk table and show its first three rows.
pd.read_csv(os.path.join(output_dir, 'merged_chr/GM12878_gDNA_merged_chr2.vcf'),header=0,sep='\t', dtype='object').head(3)

In [None]:
# Sanity check: read back the chromosome 2 single-cell table and show its first three rows.
pd.read_csv(os.path.join(output_dir, 'merged_chr/GM12878_SC_merged_chr2.vcf.gz'),header=0,sep='\t', dtype='object').head(3)

#### Checking the genotypes

### Datasets preparation

In [None]:
# Masked datasets for the percentages experiment: for every chromosome,
# masking percentage and seed, draw a reproducible random subset of positions,
# drop it from the single-cell table (the file to impute) and keep the same
# positions of the bulk and single-cell tables as "check" files.
percentages = ['10','20','30','40']
seeds = range(1, 11)

for chrom in chromosomes:
    # The per-chromosome tables do not depend on the percentage or the seed, so
    # they are read once here instead of once per (perc, seed) combination.
    sc_merg_chr = pd.read_csv(os.path.join(output_dir, f'merged_chr/GM12878_SC_merged_chr{chrom}.vcf.gz'), header=0, sep='\t', dtype='object')
    bk_merg_chr = pd.read_csv(os.path.join(output_dir, f'merged_chr/GM12878_gDNA_merged_chr{chrom}.vcf'), header=0, sep='\t', dtype='object')
    pos = sc_merg_chr['POS'].tolist()

    for perc in percentages:
        # Number of positions to mask for this percentage.
        n_masked = round((len(pos)*(int(perc)/100)))

        for seed in seeds:
            # Re-seeding right before the draw makes the masked set depend only
            # on (chromosome, percentage, seed).
            rd.seed(seed)
            pos_seed = rd.sample(pos, n_masked)

            # NOTE(review): this file name ends in '_fifty' although the other
            # outputs of this cell use '_percs'; the 50-seed cell writes the
            # same names for perc 10 / seeds 1-10 - confirm this is intended.
            position_masked = os.path.join(result_dir, f'positions_masked/maskedpos_chr{chrom}_seed{seed}_perc{perc}_fifty.txt')
            with open(position_masked,'w') as output:
                output.write(str(pos_seed))

            sc_is_masked = sc_merg_chr['POS'].isin(pos_seed)
            sc_merg_chr_toimp = sc_merg_chr[~sc_is_masked]
            bk_merg_chr_check = bk_merg_chr[bk_merg_chr['POS'].isin(pos_seed)]
            sc_merg_chr_check = sc_merg_chr[sc_is_masked]

            sc_merg_chr_toimp.to_csv(os.path.join(output_dir, f'toimp_percs/GM12878_SC_merged_chr{chrom}_seed{seed}_perc{perc}_toimp_percs.vcf.gz'), sep='\t', index=False)
            bk_merg_chr_check.to_csv(os.path.join(output_dir, f'check_percs/GM12878_gDNA_merged_chr{chrom}_seed{seed}_perc{perc}_check_percs.vcf'), sep='\t', index=False)
            sc_merg_chr_check.to_csv(os.path.join(output_dir, f'check_percs/GM12878_SC_merged_chr{chrom}_seed{seed}_perc{perc}_check_percs.vcf.gz'), sep='\t', index=False)


In [None]:
# Masked datasets for the 50-seed experiment: 10 % of the positions of every
# chromosome are masked once per seed (1-50). The masked positions are dropped
# from the single-cell table (the file to impute) and kept, for both the bulk
# and the single-cell table, as "check" files.
percentages = ['10']
seeds = range(1, 51)

for chrom in chromosomes:
    # The per-chromosome tables do not depend on the percentage or the seed, so
    # they are read once here instead of once per (perc, seed) combination.
    sc_merg_chr = pd.read_csv(os.path.join(output_dir, f'merged_chr/GM12878_SC_merged_chr{chrom}.vcf.gz'), header=0, sep='\t', dtype='object')
    bk_merg_chr = pd.read_csv(os.path.join(output_dir, f'merged_chr/GM12878_gDNA_merged_chr{chrom}.vcf'), header=0, sep='\t', dtype='object')
    pos = sc_merg_chr['POS'].tolist()

    for perc in percentages:
        # Number of positions to mask for this percentage.
        n_masked = round((len(pos)*(int(perc)/100)))

        for seed in seeds:
            # Re-seeding right before the draw makes the masked set depend only
            # on (chromosome, percentage, seed).
            rd.seed(seed)
            pos_seed = rd.sample(pos, n_masked)

            # The masked positions are stored as the text form of the Python list.
            position_masked = os.path.join(result_dir, f'positions_masked/maskedpos_chr{chrom}_seed{seed}_perc{perc}_fifty.txt')
            with open(position_masked,'w') as output:
                output.write(str(pos_seed))

            sc_is_masked = sc_merg_chr['POS'].isin(pos_seed)
            sc_merg_chr_toimp = sc_merg_chr[~sc_is_masked]
            bk_merg_chr_check = bk_merg_chr[bk_merg_chr['POS'].isin(pos_seed)]
            sc_merg_chr_check = sc_merg_chr[sc_is_masked]

            sc_merg_chr_toimp.to_csv(os.path.join(output_dir, f'toimp_fifty/GM12878_SC_merged_chr{chrom}_seed{seed}_perc{perc}_toimp_fifty.vcf.gz'), sep='\t', index=False)
            bk_merg_chr_check.to_csv(os.path.join(output_dir, f'check_fifty/GM12878_gDNA_merged_chr{chrom}_seed{seed}_perc{perc}_check_fifty.vcf'), sep='\t', index=False)
            sc_merg_chr_check.to_csv(os.path.join(output_dir, f'check_fifty/GM12878_SC_merged_chr{chrom}_seed{seed}_perc{perc}_check_fifty.vcf.gz'), sep='\t', index=False)


In [None]:
# Sanity check: first three rows of the bulk check file for chromosome 11, seed 2, 10 %.
pd.read_csv(os.path.join(output_dir, 'check_fifty/GM12878_gDNA_merged_chr11_seed2_perc10_check_fifty.vcf'),header=0,sep='\t', dtype='object').head(3)

### Imputation and jaccard score

In [None]:
import subprocess

def impute_seed(chrom, seed, perc, *, dry_run=False):
    """Impute one masked single-cell file of the percentages experiment with Beagle.

    Args:
        chrom: Chromosome number; selects the reference panel, the genetic map,
            the marker-exclusion file and the input/output file names.
        seed: Seed that was used to mask the input file.
        perc: Masking percentage as it appears in the file names (e.g. '10').
        dry_run: When True, only build and return the command without printing
            or launching anything.

    Returns:
        The Beagle command as a list of arguments.
    """
    ref = f'/home/mreverenna/reference/chr{chrom}.1kg.phase3.v5a.vcf.gz'
    genetic_map = f'/home/mreverenna/map/plink.chr{chrom}.GRCh37.map'
    gt = f'/home/mreverenna/analysis/vcf_experiments/masking_seeds/percs_toimp/GM12878_SC_merged_chr{chrom}_seed{seed}_perc{perc}_toimp_percs.vcf.gz'
    out = f'/home/mreverenna/analysis/vcf_experiments/masking_seeds/percs_imputed/GM12878_SC_merged_chr{chrom}_seed{seed}_perc{perc}imputed_percs'
    positions = f'/home/mreverenna/analysis/vcf_experiments/masking_seeds/positions/file_diff_{chrom}.txt'

    # An argument list needs no shell, so the paths are passed verbatim
    # instead of being glued together with trailing spaces.
    command = [
        'java', '-jar', '/home/mreverenna/programs/beagle.22Jul22.46e.jar',
        f'excludemarkers={positions}',
        f'ref={ref}',
        f'gt={gt}',
        f'map={genetic_map}',
        f'out={out}',
    ]
    if dry_run:
        return command

    print(f'Imputation of GM12878_SC_merged_chr{chrom}_seed{seed}_perc{perc}_toimp is started, take a little break...')
    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, errors='replace')
    except OSError as err:
        # e.g. java is not installed: report it and let the remaining jobs run.
        print(f'Could not start Beagle for chr{chrom} seed{seed} perc{perc}: {err}')
        return command

    # Report a failed run instead of discarding it silently; the batch goes on.
    if result.returncode != 0:
        print(f'Beagle failed for chr{chrom} seed{seed} perc{perc} (exit code {result.returncode}):\n{result.stderr}')
    return command

if __name__ == '__main__':
    # Grid of the percentages experiment: chromosomes 1-22, seeds 1-10 and
    # four masking levels.
    chromosomes = [*range(1, 23)]
    percentages = [str(level) for level in (10, 20, 30, 40)]
    seeds = range(1, 11)

    # Jobs are listed chromosome by chromosome, then by seed, then by percentage.
    jobs = [(c, s, p) for c in chromosomes for s in seeds for p in percentages]
    for chrom, seed, perc in jobs:
        impute_seed(chrom, seed, perc)

In [None]:
import subprocess

def impute_seed(chrom, seed, perc, *, dry_run=False):
    """Impute one masked single-cell file of the 50-seed experiment with Beagle.

    Args:
        chrom: Chromosome number; selects the reference panel, the genetic map,
            the marker-exclusion file and the input/output file names.
        seed: Seed that was used to mask the input file.
        perc: Masking percentage as it appears in the file names (e.g. '10').
        dry_run: When True, only build and return the command without printing
            or launching anything.

    Returns:
        The Beagle command as a list of arguments.
    """
    ref = f'/home/mreverenna/reference/chr{chrom}.1kg.phase3.v5a.vcf.gz'
    genetic_map = f'/home/mreverenna/map/plink.chr{chrom}.GRCh37.map'
    gt = f'/home/mreverenna/analysis/vcf_experiments/dir_toimp/GM12878_SC_merged_chr{chrom}_seed{seed}_perc{perc}_toimp_hundred.vcf.gz'
    out = f'/home/mreverenna/analysis/vcf_experiments/masking_seeds/imputed_ex/GM12878_SC_merged_chr{chrom}_seed{seed}_perc{perc}imputed_hundred_ex'
    positions = f'/home/mreverenna/analysis/vcf_experiments/masking_seeds/positions/file_diff_{chrom}.txt'

    # An argument list needs no shell, so the paths are passed verbatim
    # instead of being glued together with trailing spaces.
    command = [
        'java', '-jar', '/home/mreverenna/programs/beagle.22Jul22.46e.jar',
        f'excludemarkers={positions}',
        f'ref={ref}',
        f'gt={gt}',
        f'map={genetic_map}',
        f'out={out}',
    ]
    if dry_run:
        return command

    print(f'Imputation of GM12878_SC_merged_chr{chrom}_seed{seed}_perc{perc}_toimp is started, take a little break...')
    try:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, errors='replace')
    except OSError as err:
        # e.g. java is not installed: report it and let the remaining jobs run.
        print(f'Could not start Beagle for chr{chrom} seed{seed} perc{perc}: {err}')
        return command

    # Report a failed run instead of discarding it silently; the batch goes on.
    if result.returncode != 0:
        print(f'Beagle failed for chr{chrom} seed{seed} perc{perc} (exit code {result.returncode}):\n{result.stderr}')
    return command

if __name__ == '__main__':
    # Grid of the 50-seed experiment: chromosomes 1-22, seeds 1-50, 10 % masked.
    chromosomes = list(range(1, 23))
    percentages = ['10']
    seeds = range(1, 51)

    # Jobs are listed chromosome by chromosome, then by seed, then by percentage.
    jobs = [(c, s, p) for c in chromosomes for s in seeds for p in percentages]
    for chrom, seed, perc in jobs:
        impute_seed(chrom, seed, perc)

In [None]:
# Sanity check: first three rows of the bulk check file for chromosome 3, seed 3, 10 %.
pd.read_csv(os.path.join(output_dir, 'check_fifty/GM12878_gDNA_merged_chr3_seed3_perc10_check_fifty.vcf'),header=0,sep='\t', dtype='object').head(3)

#### Pre imputation

In [None]:
#output_dir = '/home/mreverenna/analysis/vcf_experiments/masking_seeds/check_hundred/'
#result_dir = '/home/mreverenna/analysis/vcf_experiments/masking_seeds/results/'
# Jaccard score between the bulk and the single-cell genotypes at the masked
# positions, before imputation: one score per (sample, chromosome, seed, perc).
# Genotype strings are mapped to integer class labels ('./.' has its own class).
# NOTE(review): a genotype missing from this mapping becomes NaN after .map();
# confirm the check files only contain the genotypes listed here.
gen_dosage = {'0/0': 0, '0/1': 1, '1/0': 1, '1/1': 2, '1/2': 3, '2/2': 4, './.': 5}
samples = ['rep_1', 'rep_2', 'rep_3', 'rep_4', 'rep_5']
chromosomes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
seeds = range(1, 51)
percentages = ['10']

# Result rows collected per sample, in chromosome/seed/perc order.
rows_pre = {sample: [] for sample in samples}

for chrom in chromosomes:
    for seed in seeds:
        for perc in percentages:
            # Each pair of check files holds the columns of all samples, so it
            # is read once and scored for every sample (not once per sample).
            bk_merg_chr_check = pd.read_csv(os.path.join(output_dir, f'check_fifty/GM12878_gDNA_merged_chr{chrom}_seed{seed}_perc{perc}_check_fifty.vcf'), header=0, sep='\t', dtype='object')
            sc_merg_chr_check = pd.read_csv(os.path.join(output_dir, f'check_fifty/GM12878_SC_merged_chr{chrom}_seed{seed}_perc{perc}_check_fifty.vcf.gz'), header=0, sep='\t', dtype='object')

            for sample in samples:
                # The two vectors are compared row by row.
                vec_bulk = bk_merg_chr_check[sample].map(gen_dosage).tolist()
                vec_sing = sc_merg_chr_check[sample].map(gen_dosage).tolist()

                j_value = jaccard_score(vec_bulk, vec_sing, average='micro')

                rows_pre[sample].append({
                    'sample' : sample,
                    'chromosome' : chrom,
                    'seed': seed,
                    'perc': perc,
                    'j_score': j_value
                })

# Flat list of all scores, sample by sample.
j_list_pre = [row['j_score'] for sample in samples for row in rows_pre[sample]]

# One Excel table per sample.
for sample in samples:
    dict_pre = rows_pre[sample]
    pd.DataFrame(dict_pre).to_excel(os.path.join(result_dir, f'tables_before/jaccard_scores_{sample}_pre_fifty_micro.xlsx'), index=False)

In [None]:
# Sanity check: first three rows of the single-cell check file for chromosome 1, seed 2, 10 %.
pd.read_csv(os.path.join(output_dir, 'check_fifty/GM12878_SC_merged_chr1_seed2_perc10_check_fifty.vcf.gz'),header=0,sep='\t', dtype='object').head(3)

In [None]:
# Sanity check: first three rows of the bulk check file for chromosome 1, seed 2, 10 %.
pd.read_csv(os.path.join(output_dir, 'check_fifty/GM12878_gDNA_merged_chr1_seed2_perc10_check_fifty.vcf'),header=0,sep='\t', dtype='object').head(3)

#### After imputation

In [None]:
# Display the output directory currently in use.
output_dir

In [None]:
# Inspect one imputed file. header=8 uses the ninth line of the file as column
# names and drops the eight lines before it (presumably VCF meta lines - confirm).
pd.read_csv(os.path.join(output_dir, 'imputed_fifty/GM12878_SC_merged_chr1_seed2_perc10imputed_hundred_ex.vcf.gz'),header=8,sep='\t', dtype='object')

In [None]:
# Same inspection for a file in the 'imputed_fifty_copy' folder (chromosome 1,
# seed 24); header=8 again uses the ninth line of the file as column names.
pd.read_csv(os.path.join(output_dir, f'imputed_fifty_copy/GM12878_SC_merged_chr1_seed24_perc10imputed_hundred_ex.vcf.gz'),header=8,sep='\t', dtype='object')

In [None]:
# Inspect the '..._imputed_fifty' file for chromosome 1, seed 24; here the
# first line of the file is the column header (header=0).
pd.read_csv(os.path.join(output_dir, 'imputed_fifty_copy/GM12878_SC_merged_chr1_seed24_perc10imputed_fifty.vcf.gz'),header=0,sep='\t', dtype='object')

In [None]:
# Inspect the '..._imputed_fifty' file for chromosome 22, seed 1 (header on the first line).
pd.read_csv(os.path.join(output_dir, 'imputed_fifty_copy/GM12878_SC_merged_chr22_seed1_perc10imputed_fifty.vcf.gz'),header=0,sep='\t', dtype='object')

In [None]:
# Jaccard score between the bulk genotypes and the imputed single-cell
# genotypes at the masked positions: one score per (sample, chromosome, seed, perc).
samples = ['rep_1', 'rep_2', 'rep_3', 'rep_4', 'rep_5']
chromosomes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
seeds = range(1, 51)
percentages = ['10']
# Integer class labels for the genotypes: '/'-separated in the bulk check
# files, '|'-separated in the imputed files.
gen_dosage =     {'0/0': 0,'0/1': 1,'1/0': 1, '1/1': 2, '1/2': 3, '2/1': 3, '2/2': 4, '2/0': 5,'0/2': 5}
gen_dosage_imp = {'0|0': 0,'0|1': 1,'1|0': 1, '1|1': 2, '1|2': 3, '2|1': 3, '2|2': 4,'2|0': 5,'0|2': 5}

# Result rows collected per sample, in chromosome/seed/perc order.
rows_post = {sample: [] for sample in samples}

for chrom in chromosomes:
    for seed in seeds:
        for perc in percentages:
            # Each file holds the columns of all samples, so it is read and
            # filtered once and then scored for every sample.
            sc_merg_chr_imputed = pd.read_csv(os.path.join(output_dir, f'imputed_fifty_copy/GM12878_SC_merged_chr{chrom}_seed{seed}_perc{perc}imputed_fifty.vcf.gz'), header=0, sep='\t', dtype='object')
            bk_merg_chr_check = pd.read_csv(os.path.join(output_dir, f'check_fifty/GM12878_gDNA_merged_chr{chrom}_seed{seed}_perc{perc}_check_fifty.vcf'), header=0, sep='\t', dtype='object')

            # Keep only the genotype (text before the first ':') of each sample.
            sc_merg_chr_imputed['FORMAT'] = 'GT'
            sc_merg_chr_imputed.iloc[:,9:] = sc_merg_chr_imputed.iloc[:,9:].apply(lambda x : x.str.split(':').str.get(0))
            # Drop rows carrying one of the listed allele-3 genotypes, then
            # keep the first row of every position.
            has_allele_3 = sc_merg_chr_imputed.iloc[:, 9:].isin(['3|0', '0|3', '3|3']).any(axis=1)
            sc_merg_chr_imputed = sc_merg_chr_imputed[~has_allele_3].drop_duplicates(subset=['POS'])
            # Drop bulk rows with a missing genotype in any sample.
            bk_merg_chr_check_ex = bk_merg_chr_check[~(bk_merg_chr_check.iloc[:, 9:] == './.').any(axis=1)]

            # Restrict both tables to the positions they share.
            sc_merg_chr_imp_filt = sc_merg_chr_imputed[sc_merg_chr_imputed['POS'].isin(bk_merg_chr_check_ex['POS'])]
            bk_merg_chr_check_filt = bk_merg_chr_check_ex[bk_merg_chr_check_ex['POS'].isin(sc_merg_chr_imp_filt['POS'])]

            print('Bk original: {}'.format(bk_merg_chr_check_ex.shape[0]), 'Sc original: {}'.format(sc_merg_chr_imputed.shape[0]))
            print('Bk filtered: {}'.format(bk_merg_chr_check_filt.shape[0]), 'Sc filtered: {}'.format(sc_merg_chr_imp_filt.shape[0]))
            print(f'GM12878_SC_merg_chr{chrom}_seed{seed}_perc{perc}imp_hundred vs GM12878_gDNA_merged_chr{chrom}_seed{seed}_perc{perc}_check_hundred')

            # Pair the genotypes by position, so every bulk genotype is
            # compared with the imputed genotype of the same POS regardless of
            # the row order of the two files.
            paired = bk_merg_chr_check_filt[['POS'] + samples].merge(
                sc_merg_chr_imp_filt[['POS'] + samples],
                on='POS',
                suffixes=('_bk', '_sc'),
            )

            for sample in samples:
                # define the lists to compare
                vec_bulk = paired[f'{sample}_bk'].map(gen_dosage).tolist()
                vec_sing = paired[f'{sample}_sc'].map(gen_dosage_imp).tolist()

                j_value = jaccard_score(vec_bulk, vec_sing, average='micro')

                rows_post[sample].append({
                    'sample' : sample,
                    'chromosome' : chrom,
                    'seed': seed,
                    'perc': perc,
                    'j_score': j_value})

# Flat list of all scores, sample by sample.
j_list_post = [row['j_score'] for sample in samples for row in rows_post[sample]]

# One Excel table per sample.
for sample in samples:
    dict_post = rows_post[sample]
    pd.DataFrame(dict_post).to_excel(os.path.join(result_dir, f'tables_after/jaccard_scores_{sample}_post_fifty_micro.xlsx'), index=False)

### Graphical illustration

In [None]:
# Names of the five sample columns (rep_1 ... rep_5).
samples = [f'rep_{idx}' for idx in range(1, 6)]

In [None]:
# load excels jaccard score
# The per-sample tables are stacked with ignore_index=True so that every row
# of the combined table gets its own index label; without it each sample's
# rows repeat the same labels.
file_names_pre = [os.path.join(result_dir,f'tables_before/jaccard_scores_{sample}_pre_fifty_micro.xlsx') for sample in samples]
dfs_pre = [pd.read_excel(file) for file in file_names_pre] # read each file into a list of DataFrames
result_pre = pd.concat(dfs_pre, ignore_index=True)

file_names_post = [os.path.join(result_dir,f'tables_after/jaccard_scores_{sample}_post_fifty_micro.xlsx') for sample in samples]
dfs_post = [pd.read_excel(file) for file in file_names_post]
result_post = pd.concat(dfs_post, ignore_index=True)

# Working copy that receives the pre/post score columns.
result_merged = result_post.copy()

In [None]:
# Preview the first three rows of the pre-imputation scores.
result_pre.head(3)

In [None]:
# Preview the first three rows of the post-imputation scores.
result_post.head(3)

In [None]:
# Put the pre- and post-imputation score of each run side by side. The rows
# are matched on the run identifiers instead of on row position or index
# labels, and validate='one_to_one' raises if an identifier combination is
# duplicated in either table.
_score_keys = ['sample', 'chromosome', 'seed', 'perc']
result_merged = result_post.rename(columns={'j_score': 'j_score_post'}).merge(
    result_pre.rename(columns={'j_score': 'j_score_pre'}),
    on=_score_keys,
    how='left',
    validate='one_to_one',
)

In [None]:
# Display the combined table with the pre- and post-imputation scores.
result_merged

### Plotly

In [None]:
# Split violins per chromosome: the PRE scores are drawn on the negative side
# (red) and the POST scores on the positive side (green) of each violin.
fig = go.Figure()

for label, score_column, half, colour in (
    ('PRE', 'j_score_pre', 'negative', 'red'),
    ('POST', 'j_score_post', 'positive', 'green'),
):
    fig.add_trace(go.Violin(x=result_merged['chromosome'], y=result_merged[score_column],
                            name=label, legendgroup=label, scalegroup=label,
                            side=half, line_color=colour))

fig.update_traces(meanline_visible=True)
fig.update_layout(title='Distribution of jaccard scores before and after imputation by chromosomes',
                  xaxis_title='Chromosomes',
                  yaxis_title='Jaccard scores',
                  xaxis=dict(tickmode='array', tickvals=np.arange(1, 23)),
                  yaxis=dict(range=[0.4, 1]),
                  violinmode='overlay',
                  violingap=0)

#fig.write_image(os.path.join(plot_dir, 'violinplot_by_samples.pdf'))
fig.show()

In [None]:
# Grouped violins per sample: PRE (orange) next to POST (green), each with an
# inner box plot and a mean line.
fig = go.Figure()

for label, score_column, colour in (
    ('PRE', 'j_score_pre', 'orange'),
    ('POST', 'j_score_post', 'green'),
):
    fig.add_trace(go.Violin(x=result_merged['sample'], y=result_merged[score_column],
                            name=label, legendgroup=label, scalegroup=label,
                            line_color=colour))

fig.update_traces(meanline_visible=True, box_visible=True)
fig.update_layout(title='Distribution of jaccard scores before and after imputation by samples',
                  xaxis_title='Samples',
                  yaxis_title='Jaccard scores',
                  yaxis=dict(range=[0.4, 1]),
                  violinmode='group')

#fig.write_image(os.path.join(plot_dir, 'violinplot_by_samples.pdf'))
fig.show()

#### Plots with seaborn and matplotlib

In [None]:
# define plots functions before and after imputation
def _select_scores(data, sample, per, chrom):
    """Return the rows of *data* for one sample, masking percentage and chromosome."""
    return data[(data['sample'] == sample) & (data['perc'] == per) & (data['chromosome'] == chrom)]


def _scatter_scores(ax, subset, title):
    """Draw the seed-vs-score scatter of *subset* on *ax* with the shared panel styling."""
    sns.scatterplot(data=subset, x='seed', y='j_score', hue='seed', ax=ax)
    ax.set_ylim(0.00, 1.00)
    ax.set_xlabel('Seeds')
    ax.set_ylabel('Jaccard scores')
    ax.set_title(title)
    # Dashed reference line at a score of 0.5.
    ax.axhline(y=0.5, linestyle='--', color='grey')
    ax.tick_params(axis='x', labelrotation=0)
    # The per-seed colour legend is removed when seaborn created one.
    if ax.legend_ is not None:
        ax.legend_.remove()


def plot_bef(pos1, pos2, per, chr, sample, data=result_pre):
    """Plot the pre-imputation scores of one sample on panel ``axs[pos1, pos2]``.

    Args:
        pos1: Row of the panel in the global ``axs`` grid.
        pos2: Column of the panel in the global ``axs`` grid.
        per: Masking percentage to select (compared with the 'perc' column).
        chr: Chromosome to select.
        sample: Sample name to select (also used in the panel title).
        data: Table with 'sample', 'perc', 'chromosome', 'seed' and 'j_score'
            columns.

    The function draws on the module-level ``axs`` array, which must exist
    when it is called.
    """
    _scatter_scores(axs[pos1, pos2], _select_scores(data, sample, per, chr),
                    f'{sample} before imputation')


def plot_post(pos1, pos2, per, chr, sample, data=result_post):
    """Plot the post-imputation scores of one sample on panel ``axs[pos1, pos2]``.

    Args:
        pos1: Row of the panel in the global ``axs`` grid.
        pos2: Column of the panel in the global ``axs`` grid.
        per: Masking percentage to select (compared with the 'perc' column).
        chr: Chromosome to select.
        sample: Sample name to select (also used in the panel title).
        data: Table with 'sample', 'perc', 'chromosome', 'seed' and 'j_score'
            columns.

    The function draws on the module-level ``axs`` array, which must exist
    when it is called.
    """
    _scatter_scores(axs[pos1, pos2], _select_scores(data, sample, per, chr),
                    f'{sample} after imputation')

In [None]:
for chrom in chromosomes:
    # One 5x2 figure per chromosome: a row per sample, scores before
    # imputation in the left column and after imputation in the right column.
    fig, axs = plt.subplots(nrows=5, ncols=2, figsize=(22, 30))
    for row, replicate in enumerate(('rep_1', 'rep_2', 'rep_3', 'rep_4', 'rep_5')):
        plot_bef(pos1=row, pos2=0, per=10, chr=chrom, sample=replicate, data=result_pre)
        plot_post(pos1=row, pos2=1, per=10, chr=chrom, sample=replicate, data=result_post)

    fig.suptitle(f'Comparison between samples in cromosome {chrom} 10 percent masked', fontsize=15)
    fig.subplots_adjust(top=0.92, hspace=0.45, wspace=0.25)

    #plt.savefig(os.path.join(plot_dir, f'scatter_chr{chrom}_10perc.pdf'))
    # Each figure is closed as soon as it is complete.
    plt.close(fig)

plt.show()

### Paired t-test

In [None]:
# load the jaccard score before and after imputation
result_pre = result_pre.rename(columns={'j_score': 'j_score_before'})
result_pre = result_pre.reset_index(drop=True)

result_post = result_post.rename(columns={'j_score': 'j_score_after'})
result_post = result_post.reset_index(drop=True)

# merge the tables and feature engineering
result_merged = pd.concat([result_pre.reset_index(drop=True),result_post.iloc[:,4].reset_index(drop=True)], axis=1)
result_merged['delta'] = result_merged['j_score_after'] - result_merged['j_score_before']
result_merged['improvement'] = result_merged['improvement'] = result_merged['delta'].apply(lambda x: 1 if x > 0 else 0)

In [None]:
# Preview the first three rows of the paired table.
result_merged.head(3)

In [None]:
# chromosome -> (p-value, -log10(p-value), mean after minus mean before)
dict_pvalues = {}

for chrom in chromosomes:
    try:
        per_chrom = result_merged.loc[result_merged['chromosome'] == chrom]
        scores_after = per_chrom['j_score_after']
        scores_before = per_chrom['j_score_before']

        # One-sided paired t-test: are the scores after imputation greater
        # than the scores before?
        _, p_value = stats.ttest_rel(scores_after, scores_before, alternative='greater')

        dict_pvalues[chrom] = (
            p_value,
            -np.log10(p_value),
            scores_after.mean() - scores_before.mean(),
        )

    except (ValueError, TypeError) as e:
        # A chromosome that cannot be tested is reported and left out.
        print(f"Error processing chromosome {chrom}: {str(e)}")

In [None]:
# Table of the test results, indexed by chromosome.
data_pv = pd.DataFrame.from_dict(dict_pvalues, orient='index', columns=['pvalue', 'log_pvalue', 'average_mean'])
data_pv.index.name = 'chromosome'

# Split the stored tuples into one list per quantity.
pvalues = []
log_pvalues = []
for p_value, log_p_value, _ in dict_pvalues.values():
    pvalues.append(p_value)
    log_pvalues.append(log_p_value)

# Bar chart of -log10(p-value) per chromosome.
plt.bar(list(dict_pvalues), log_pvalues)
# y = -log10(0.01) = 2
plt.axhline(y=2, color='r')
plt.ylabel('-log10(p-value)')
plt.xlabel('Chromosome')
plt.show()
