In [1]:
from Bio import SeqIO
import os
import pandas
import glob

In [2]:
results = 'calls_v8'

In [27]:
b31_results = glob.glob('/home/mf019/longread_pangenome/synteny/asms_vs_b31_v2_parsed_v2/simple_coverage/*.tsv')
wp_results = glob.glob(f'{results}/wp/tables/*.tsv')
pf_results = glob.glob(f'{results}/pf32/tables/*.tsv')

In [28]:
pandas.read_csv(b31_results[0], sep='\t')

Unnamed: 0,assembly_id,contig_id,contig_len,total_contig_coverage,list_of_alignments(ref:contig_cov:location)
0,URI56H,contig000001,903272,99.97,['(chromosome : 99.97)']
1,URI56H,contig000002,53795,98.38,['(lp54 : 98.38)']
2,URI56H,contig000003,26506,99.96,['(cp26 : 99.96)']
3,URI56H,contig000004,23360,99.65,['(lp28-3 : 99.65)']
4,URI56H,contig000005,22261,98.59,"['(lp28-4 : 96.53)', '(lp5 : 2.06)']"
...,...,...,...,...,...
147,URI56H,contig000218,123,100.81,['(cp32-8 : 100.81)']
148,URI56H,contig000220,123,99.19,['(lp36 : 99.19)']
149,URI56H,contig000221,123,100.81,['(lp28-2 : 100.81)']
150,URI56H,contig000222,121,99.17,['(lp56 : 99.17)']


In [22]:
wp_dataframes = [pandas.read_csv(file, sep='\t') for file in wp_results]
pf_dataframes = [pandas.read_csv(file, sep='\t') for file in pf_results]
b31_homology_dfs = [pandas.read_csv(file, sep='\t') for file in b31_results]

In [59]:
def write_merged_table(results, **kwargs):
    output_dir = '/'.join(results[0].split('/')[:-3])
    results_type = results[0].split('/')[-3]
    tsvs = [pandas.read_csv(file, sep='\t') for file in results]
    combined_df = pandas.concat(tsvs, ignore_index=True)
    df_grouped = combined_df.groupby(['assembly_id', 'contig_id'])
    rows = []
    for (assembly_id, contig_id), group in df_grouped:
        group['overall_percent_identity'] = group['overall_percent_identity'].apply(lambda x: f'{x:.2f}')
        group['plasmid_id'] = group['plasmid_id'].apply(lambda x: x.split('|')[1]) if results_type == 'wp' else group['plasmid_id']
        homologies = list(zip(group['plasmid_id'], group['overall_percent_identity']))
        name_homologies = list(zip(group['plasmid_name'], group['overall_percent_identity']))
        rows.append({'assembly_id': assembly_id, 'contig_id':contig_id, f'{results_type}_homologies':homologies, f'{results_type}_names_homologies': name_homologies})
    new_df = pandas.DataFrame(rows)
    out_file = f'merged_{results_type}_hits.tsv'
    out_file_path = os.path.join(output_dir, out_file)
    out_path = kwargs.get('out', f'{out_file_path}')
    new_df.to_csv(out_path, sep='\t', index=False)
    return new_df

In [60]:
def write_merged_table_b31(results, **kwargs):
    out_file = f'merged_B31_homologies.tsv'
    out_path = kwargs.get('out', f'{os.getcwd()}')
    out_file_path = os.path.join(out_path, out_file)
    tsvs = [pandas.read_csv(file, sep='\t') for file in results]
    combined_df = pandas.concat(tsvs, ignore_index=True).rename(columns={'list_of_alignments(ref:contig_cov:location)': 'B31_homologies'})
    combined_df.to_csv(out_file_path, sep='\t', index=False)
    return combined_df

In [61]:
wp_df = write_merged_table(wp_results)
pf_df = write_merged_table(pf_results)

In [62]:
wp_df

Unnamed: 0,assembly_id,contig_id,wp_homologies,wp_names_homologies
0,ESI26H,contig000001,"[(CP124100.1, 99.95), (CP124096.1, 99.95), (CP...","[(chromosome, 99.95), (chromosome, 99.95), (ch..."
1,ESI26H,contig000002,"[(CP001565.1, 96.77), (CP001572.1, 96.63), (CP...","[(cp32-12, 96.77), (cp32-5, 96.63), (cp32-5+1,..."
2,ESI26H,contig000003,"[(CP001566.1, 97.99), (CP001262.1, 85.68), (CP...","[(lp28-9, 97.99), (lp28-6, 85.68), (lp28-6, 86..."
3,ESI26H,contig000004,"[(CP001567.1, 97.56), (CP001201.1, 97.52), (AE...","[(lp36, 97.56), (lp36, 97.52), (lp36, 97.22), ..."
4,ESI26H,contig000005,"[(AE001579.1, 95.97), (CP094603.1, 95.97), (CP...","[(cp32-7, 95.97), (cp32-7, 95.97), (cp32-10, 9..."
...,...,...,...,...
2190,UWI283P,MC149_lp28-3,"[(CP001203.1, 95.74), (CP001256.1, 95.93), (CP...","[(lp28-3, 95.74), (lp28-3, 95.93), (lp28-3, 95..."
2191,UWI283P,MC149_lp28-4,"[(CP002318.1, 97.41), (CP001272.1, 98.24), (CP...","[(lp28-4, 97.41), (lp28-4, 98.24), (lp28-4, 98..."
2192,UWI283P,MC149_lp28-5,"[(CP094592.1, 99.45), (CP002317.1, 97.78), (CP...","[(lp28-5, 99.45), (lp28-5, 97.78), (lp28-3, 92..."
2193,UWI283P,MC149_lp36,"[(CP001497.1, 88.18), (CP124078.1, 87.80), (CP...","[(lp17, 88.18), (lp17, 87.80), (lp17, 87.80), ..."


In [72]:
b31_df = write_merged_table_b31(b31_results, out='calls_v8')

In [73]:
b31_df

Unnamed: 0,assembly_id,contig_id,contig_len,total_contig_coverage,B31_homologies
0,URI56H,contig000001,903272,99.97,['(chromosome : 99.97)']
1,URI56H,contig000002,53795,98.38,['(lp54 : 98.38)']
2,URI56H,contig000003,26506,99.96,['(cp26 : 99.96)']
3,URI56H,contig000004,23360,99.65,['(lp28-3 : 99.65)']
4,URI56H,contig000005,22261,98.59,"['(lp28-4 : 96.53)', '(lp5 : 2.06)']"
...,...,...,...,...,...
1490,URI44H,contig000027,890,28.09,['(lp28-1 : 28.09)']
1491,URI44H,contig000031,674,100.15,['(lp56 : 100.15)']
1492,URI44H,contig000034,560,101.96,['(cp32-3 : 101.96)']
1493,URI44H,contig000055,206,99.03,['(lp56 : 99.03)']


In [74]:
b31_df = b31_df.drop(['total_contig_coverage'], axis=1)

In [75]:
merged = wp_df.merge(pf_df, on=['assembly_id', 'contig_id'], how='outer')

In [80]:
merged_all = merged.merge(b31_df, on=['assembly_id', 'contig_id'], how='outer')
merged_all

Unnamed: 0,assembly_id,contig_id,wp_homologies,wp_names_homologies,pf32_homologies,pf32_names_homologies,contig_len,B31_homologies
0,ESI26H,contig000001,"[(CP124100.1, 99.95), (CP124096.1, 99.95), (CP...","[(chromosome, 99.95), (chromosome, 99.95), (ch...","[(NE_5261_chromosome_ParA_2, 100.00), (B379_ch...","[(chromosome, 100.00), (chromosome, 99.45), (c...",910396.0,['(chromosome : 100.01)']
1,ESI26H,contig000002,"[(CP001565.1, 96.77), (CP001572.1, 96.63), (CP...","[(cp32-12, 96.77), (cp32-5, 96.63), (cp32-5+1,...","[(JD1_cp32-12_ParA_1, 99.62), (ZS7_cp32-12_Par...","[(cp32-12, 99.62), (cp32-12, 98.85), (cp32-12,...",61321.0,"['(cp32-8 : 8.92)', '(lp56 : 10.48)', '(cp32-4..."
2,ESI26H,contig000003,"[(CP001566.1, 97.99), (CP001262.1, 85.68), (CP...","[(lp28-9, 97.99), (lp28-6, 85.68), (lp28-6, 86...","[(Bol26_lp28-9, 100.00), (JD1_lp28-7, 96.05), ...","[(lp28-9, 100.00), (lp28-7, 96.05), (lp28-1, 8...",51152.0,"['(lp28-2 : 28.32)', '(lp56 : 3.89)', '(lp36 :..."
3,ESI26H,contig000004,"[(CP001567.1, 97.56), (CP001201.1, 97.52), (AE...","[(lp36, 97.56), (lp36, 97.52), (lp36, 97.22), ...","[(B31_lp36, 100.00), (MM1_lp36_ParA_1, 99.59)]","[(lp36, 100.00), (lp36, 99.59)]",34728.0,['(lp36 : 99.84)']
4,ESI26H,contig000005,"[(AE001579.1, 95.97), (CP094603.1, 95.97), (CP...","[(cp32-7, 95.97), (cp32-7, 95.97), (cp32-10, 9...","[(MM1_cp32-6_ParA_1, 100.00), (JD1_cp32-6, 99....","[(cp32-6, 100.00), (cp32-6, 99.60)]",30286.0,"['(cp32-6 : 37.37)', '(cp32-7 : 51.54)']"
...,...,...,...,...,...,...,...,...
2329,UWI283P,MC149_lp28-3,"[(CP001203.1, 95.74), (CP001256.1, 95.93), (CP...","[(lp28-3, 95.74), (lp28-3, 95.93), (lp28-3, 95...","[(JD1_lp28-3_ParA_1, 99.60), (Bol26_lp28-3_Par...","[(lp28-3, 99.60), (lp28-3, 98.01), (lp28-6, 72...",30741.0,"['(lp28-3 : 67.76)', '(lp25 : 20.54)']"
2330,UWI283P,MC149_lp28-4,"[(CP002318.1, 97.41), (CP001272.1, 98.24), (CP...","[(lp28-4, 97.41), (lp28-4, 98.24), (lp28-4, 98...","[(B31_lp28-4, 100.00), (WI91-23_lp28-4_ParA_1,...","[(lp28-4, 100.00), (lp28-4, 98.80), (untig, 62...",29493.0,"['(lp28-1 : 1.52)', '(lp17 : 3.52)', '(lp21 : ..."
2331,UWI283P,MC149_lp28-5,"[(CP094592.1, 99.45), (CP002317.1, 97.78), (CP...","[(lp28-5, 99.45), (lp28-5, 97.78), (lp28-3, 92...","[(JD1_lp28-5_ParA_1, 100.00), (B379_lp28-5_Par...","[(lp28-5, 100.00), (lp28-5, 100.00), (lp28-5, ...",26921.0,"['(lp5 : 10.96)', '(lp28-3 : 15.21)', '(lp21 :..."
2332,UWI283P,MC149_lp36,"[(CP001497.1, 88.18), (CP124078.1, 87.80), (CP...","[(lp17, 88.18), (lp17, 87.80), (lp17, 87.80), ...","[(MM1_lp36_ParA_1, 93.98), (B31_lp36, 93.98)]","[(lp36, 93.98), (lp36, 93.98)]",35937.0,"['(lp56 : 0.35)', '(lp36 : 20.56)', '(lp28-1 :..."


In [81]:
col_order = ['assembly_id', 'contig_id', 'contig_len', 'wp_homologies', 'wp_names_homologies', 'pf32_homologies', 'pf32_names_homologies', 'B31_homologies']

In [82]:
merged_all= merged_all[col_order]
merged_all

Unnamed: 0,assembly_id,contig_id,contig_len,wp_homologies,wp_names_homologies,pf32_homologies,pf32_names_homologies,B31_homologies
0,ESI26H,contig000001,910396.0,"[(CP124100.1, 99.95), (CP124096.1, 99.95), (CP...","[(chromosome, 99.95), (chromosome, 99.95), (ch...","[(NE_5261_chromosome_ParA_2, 100.00), (B379_ch...","[(chromosome, 100.00), (chromosome, 99.45), (c...",['(chromosome : 100.01)']
1,ESI26H,contig000002,61321.0,"[(CP001565.1, 96.77), (CP001572.1, 96.63), (CP...","[(cp32-12, 96.77), (cp32-5, 96.63), (cp32-5+1,...","[(JD1_cp32-12_ParA_1, 99.62), (ZS7_cp32-12_Par...","[(cp32-12, 99.62), (cp32-12, 98.85), (cp32-12,...","['(cp32-8 : 8.92)', '(lp56 : 10.48)', '(cp32-4..."
2,ESI26H,contig000003,51152.0,"[(CP001566.1, 97.99), (CP001262.1, 85.68), (CP...","[(lp28-9, 97.99), (lp28-6, 85.68), (lp28-6, 86...","[(Bol26_lp28-9, 100.00), (JD1_lp28-7, 96.05), ...","[(lp28-9, 100.00), (lp28-7, 96.05), (lp28-1, 8...","['(lp28-2 : 28.32)', '(lp56 : 3.89)', '(lp36 :..."
3,ESI26H,contig000004,34728.0,"[(CP001567.1, 97.56), (CP001201.1, 97.52), (AE...","[(lp36, 97.56), (lp36, 97.52), (lp36, 97.22), ...","[(B31_lp36, 100.00), (MM1_lp36_ParA_1, 99.59)]","[(lp36, 100.00), (lp36, 99.59)]",['(lp36 : 99.84)']
4,ESI26H,contig000005,30286.0,"[(AE001579.1, 95.97), (CP094603.1, 95.97), (CP...","[(cp32-7, 95.97), (cp32-7, 95.97), (cp32-10, 9...","[(MM1_cp32-6_ParA_1, 100.00), (JD1_cp32-6, 99....","[(cp32-6, 100.00), (cp32-6, 99.60)]","['(cp32-6 : 37.37)', '(cp32-7 : 51.54)']"
...,...,...,...,...,...,...,...,...
2329,UWI283P,MC149_lp28-3,30741.0,"[(CP001203.1, 95.74), (CP001256.1, 95.93), (CP...","[(lp28-3, 95.74), (lp28-3, 95.93), (lp28-3, 95...","[(JD1_lp28-3_ParA_1, 99.60), (Bol26_lp28-3_Par...","[(lp28-3, 99.60), (lp28-3, 98.01), (lp28-6, 72...","['(lp28-3 : 67.76)', '(lp25 : 20.54)']"
2330,UWI283P,MC149_lp28-4,29493.0,"[(CP002318.1, 97.41), (CP001272.1, 98.24), (CP...","[(lp28-4, 97.41), (lp28-4, 98.24), (lp28-4, 98...","[(B31_lp28-4, 100.00), (WI91-23_lp28-4_ParA_1,...","[(lp28-4, 100.00), (lp28-4, 98.80), (untig, 62...","['(lp28-1 : 1.52)', '(lp17 : 3.52)', '(lp21 : ..."
2331,UWI283P,MC149_lp28-5,26921.0,"[(CP094592.1, 99.45), (CP002317.1, 97.78), (CP...","[(lp28-5, 99.45), (lp28-5, 97.78), (lp28-3, 92...","[(JD1_lp28-5_ParA_1, 100.00), (B379_lp28-5_Par...","[(lp28-5, 100.00), (lp28-5, 100.00), (lp28-5, ...","['(lp5 : 10.96)', '(lp28-3 : 15.21)', '(lp21 :..."
2332,UWI283P,MC149_lp36,35937.0,"[(CP001497.1, 88.18), (CP124078.1, 87.80), (CP...","[(lp17, 88.18), (lp17, 87.80), (lp17, 87.80), ...","[(MM1_lp36_ParA_1, 93.98), (B31_lp36, 93.98)]","[(lp36, 93.98), (lp36, 93.98)]","['(lp56 : 0.35)', '(lp36 : 20.56)', '(lp28-1 :..."


In [83]:
merged_all.to_csv('calls_v8/merged_calls_v8.tsv', sep='\t', index=False)