In [None]:
import pandas as pd
import numpy as np
import os
import pysam
import pickle as pkl
from matplotlib import pyplot as plt
from matplotlib.patches import FancyArrow
from matplotlib import cm
from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle
from matplotlib.ticker import MultipleLocator
import seaborn as sns
from Bio import SeqIO
from scipy import stats
from progressbar import ProgressBar
from statsmodels.stats.multitest import multipletests
from roman import toRoman
import itertools
from collections import Counter
import re

# General functions and metadata

In [None]:
# Global plotting style: seaborn 'ticks' theme with black ticks and text, DejaVu Sans
# as sans-serif font and an opaque white figure background.
# The keys must use the matplotlib rcParams spelling ('xtick.color', not 'xtick_color'):
# seaborn.set_style() silently discards rc keys that are not part of its style
# definition, so the underscore spellings had no effect.
rc_sns = {'ytick.color':'k', 'xtick.color':'k', 'text.color':'k', 'font.sans-serif':'DejaVu Sans', 'figure.facecolor':(1,1,1,1)}
sns.set_style(style='ticks', rc=rc_sns)

In [None]:
# Output directories for the manuscript figures and tables (both keep a trailing '/',
# since file names are appended to them directly)
_draft_dir = '/home/mathieu/mhenault_landrylab/Publications/lrma/draft'
fig_path = f'{_draft_dir}/fig/'
tables_path = f'{_draft_dir}/tables/'

In [None]:
# Load the strain metadata table (indexed by its first column) and the table giving the
# parents of each cross.
nano_strains = pd.read_csv(
    '/mnt/HDD3/lrma/script/nano_strains.csv',
    index_col=0,
)
# cross_parents.txt is ';'-separated and has no header row; squeeze() turns the
# resulting single-column frame into a Series indexed by the first column
Cross_parents = pd.read_csv(
    '/mnt/HDD3/lrma/private_variants/cross_parents.txt',
    sep=';', header=None, index_col=0,
)
Cross_parents = Cross_parents.squeeze()

In [None]:
# Original cross identifiers -> cross IDs used for display
cross_alias = dict(VL3='CC1', VL4='CC2', VL5='CC3',
                   VL1='BB1', VL2='BB2',
                   L1='BC1', L2='BC2',
                   M1='BA1', M2='BA2',
                   H1='BSc1', H2='BSc2')
nano_strains['cross'] = nano_strains['cross'].replace(cross_alias)

# rank of each cross along plot axes
_cross_ids = ['CC1', 'CC2', 'CC3', 'BB1', 'BB2', 'BC1', 'BC2', 'BA1', 'BA2', 'BSc1', 'BSc2']
cross_order = dict(zip(_cross_ids, range(len(_cross_ids))))

# colour of each cross: the 'cross_color' value of the first strain listed for it
cross_color = {cross_id: colors.iloc[0]
               for cross_id, colors in nano_strains.groupby('cross')['cross_color']}

In [None]:
# Load the table of cross parents; the columns used in this notebook are 'cross',
# 'strain', 'group', 'mat', 'ho' and 'ade2'
cross_parents = pd.read_csv('/mnt/HDD3/lrma/script/cross_parents.csv', sep=',', index_col=0)

# colour of each cross: the 'cross_color' value of the first strain listed for it
cross_color = {cross_id: colors.iloc[0]
               for cross_id, colors in nano_strains.groupby('cross')['cross_color']}

# one row per parental strain with its background, plus 'count' = number of rows of
# cross_parents sharing that strain/background combination
_background_cols = ['strain', 'group', 'mat', 'ho', 'ade2']
_background_counts = cross_parents.value_counts(_background_cols).rename('count')
parents_background = _background_counts.reset_index().set_index('strain')

In [None]:
# Make the parents of each cross reachable through the new cross IDs as well.
# Iterate over a snapshot because the Series is extended inside the loop, and skip keys
# that have no alias (i.e. the aliased entries added by a previous run) so that the cell
# can be re-run without raising a KeyError.
for (cross, p) in list(Cross_parents.items()):
    if cross in cross_alias:
        Cross_parents[cross_alias[cross]] = p

# row index (top to bottom) of each parental strain in the cross design figure
Parents_order = dict(zip(['LL2011_004', 'LL2011_009', 'MSH-587-1', 'LL2011_012', 'LL2011_001',
                          'MSH-604', 'UWOPS-91-202', 'LL2012_028', 'LL2012_021',
                          'YPS644', 'YPS744', 'LL2013_040', 'LL2013_054'], range(13)))

# group label of each parental strain
parents_group = {'MSH-604':'SpB',
                   'UWOPS-91-202':'SpB',
                   'LL2012_021':'SpB',
                   'LL2012_028':'SpB',
                   'LL2011_004':'SpC',
                   'LL2011_009':'SpC',
                   'MSH-587-1':'SpC',
                   'LL2011_012':'SpC',
                   'LL2011_001':'SpC',
                   'YPS644':'SpA',
                   'YPS744':'SpA',
                   'LL2013_040':'Sc',
                   'LL2013_054':'Sc'}

# plotting colour of each parental strain
parents_color = {'MSH-604':'red',
                   'UWOPS-91-202':'red',
                   'LL2012_021':'darkred',
                   'LL2012_028':'darkred',
                   'LL2011_004':'dodgerblue',
                   'LL2011_009':'dodgerblue',
                   'MSH-587-1':'midnightblue',
                   'LL2011_012':'midnightblue',
                   'LL2011_001':'midnightblue',
                   'YPS644':'limegreen',
                   'YPS744':'limegreen',
                   'LL2013_040':'dimgrey',
                   'LL2013_054':'dimgrey'}

# mathtext labels for mating types and groups
# NOTE(review): group_alias is keyed by 'Scer' whereas parents_group uses 'Sc' -- confirm
# which spelling the 'group' column of cross_parents.csv uses.
mat_alias = {'a': r'$a$', 'alpha':r'$\alpha$'}
group_alias = {'SpA':r'$SpA$', 'SpB':r'$SpB$', 'SpC':r'$SpC$', 'Scer':r'$S.c.$'}

## Initial construction of subg metadata table

In [None]:
# Build the per-subgenome metadata table: each line of a cross gets one row per parental
# subgenome. Rows of nano_strains whose cross is 'P' are left out.
ns_subg = nano_strains.loc[nano_strains['cross']!='P'].copy()
for c, df in ns_subg.groupby('cross'):
    # Cross_parents is the Series of 'parent1,parent2' strings; cross_parents (lower
    # case) is a DataFrame, whose rows cannot be split
    s1, s2 = Cross_parents[c].split(',')
    ns_subg.loc[df.index, 's1'] = s1
    ns_subg.loc[df.index, 's2'] = s2

# wide -> long: 's1' and 's2' (the last two columns) become the rows of a single 'subg' column
ns_subg = pd.melt(ns_subg, id_vars=ns_subg.columns[:-2], value_vars=['s1','s2'], value_name='subg', var_name='subg_idx').sort_values(by=['cross','strain','subg_idx'])
# unique row identifier: '<strain>.<subgenome>'
ns_subg['s_subg'] = ns_subg.apply(lambda x: f'{x["strain"]}.{x["subg"]}', axis=1)
ns_subg.index = ns_subg['s_subg'].values

# add reference genome for quick mapping
# The reference is chosen per subgenome and written only to the rows of that subgenome;
# assigning to the whole column on each iteration would leave every row with the
# reference of the last group.
for sg, df in ns_subg.groupby('subg'):
    if sg in ['LL2013_040','LL2013_054']:
        quick_map = '/home/mathieu/mhenault_landrylab/Sequences/ref_genomes/S288C_pacbio/S288c.genome.fa'
    elif sg in ['YPS644','YPS744']:
        quick_map = '/home/mathieu/mhenault_landrylab/Sequences/ref_genomes/CBS432_pacbio/CBS432.genome.fa'
    else:
        quick_map = '/home/mathieu/mhenault_landrylab/Sequences/ref_genomes/YPS138_pacbio/YPS138.genome.fa'
    ns_subg.loc[df.index, 'quick_map'] = quick_map
#ns_subg.to_csv('/mnt/HDD3/lrma/script/ns_subg.csv')

In [None]:
# Reload the subgenome metadata table from disk, translate its cross identifiers to the
# new cross IDs and annotate each row with the mating type of its parental subgenome
ns_subg = pd.read_csv('/mnt/HDD3/lrma/script/ns_subg.csv', index_col=0)
ns_subg['cross'] = ns_subg['cross'].replace(cross_alias)
_subg_mat = parents_background.loc[ns_subg['subg'], 'mat']
ns_subg['mat'] = _subg_mat.values

In [None]:
# Set of strains that pass the identity filter, for each cross
_passing_rows = ns_subg.loc[ns_subg['identity_filter']==True]
lines_per_cross = {cross_id: set(rows['strain'].values)
                   for cross_id, rows in _passing_rows.groupby('cross')}

# Cross design

## Fig 2A

In [None]:
#plot crosses scheme
# Fig 2A: one column per cross (x) and one row per parental strain (y). The two parents
# of a cross are joined by a vertical line; a coloured bar below the axes carries the
# cross ID and its number of lines.

fig = plt.figure(figsize=[5,4])
gs = plt.GridSpec(ncols=1, nrows=1, left=0.25, right=0.68, top=0.9, bottom=0.23)
ax = fig.add_subplot(gs[0])

#for cross in Cross_parents:
for cross in cross_order:
    # rows of cross_parents for this cross, ordered by mating type
    sub = cross_parents.loc[cross_parents['cross']==cross].sort_values(by='mat')
    #p1, p2 = Cross_parents[cross].split(',')
    # the unpacking requires exactly two parent rows per cross
    p1, p2 = sub['strain'].values

    # same x for both parents, y given by the row of each parent
    X = np.repeat(cross_order[cross], 2)
    Y = [Parents_order[p] for p in (p1, p2)]

    ax.plot(X, Y, c='k', marker='o', mfc='k', lw=1, ms=5)

    # headless arrow used as a coloured bar; it starts at the bottom axis limit (y=12.5)
    # and is drawn outside of the axes (clip_on=False)
    fa = FancyArrow(cross_order[cross], 12.5, 0, 3.7, width=1, head_length=0, head_width=0, fc=cross_color[cross], lw=1, ec='w', 
                    clip_on=False, zorder=0)
    ax.add_patch(fa)
    # black text for BSc1, white text for every other cross
    if cross == 'BSc1':
        tc = 'k'
    else:
        tc = 'w'
    # cross ID and, in parentheses, number of lines kept for this cross
    ax.text(cross_order[cross], 14.2, cross, size=9, ha='center', va='top', color=tc, rotation=90, fontweight='bold', zorder=1)
    ax.text(cross_order[cross], 13.3, f'({len(lines_per_cross[cross])})', size=7, ha='center', va='center', color=tc, rotation=90, fontweight='bold', zorder=1)

# shaded background band per group: (y, dy) = first row and number of rows of the band
for group, (y, dy), fc in zip(['SpC','SpB','SpA','S. c.'], [(0,5), (5,4), (9,2), (11,2)], ['#00008B','#EE0000','#00CD00','0.25']):
    # small gap left between consecutive bands
    rect_spacing = 0.05

    Rect = Rectangle((-0.5, y+rect_spacing-0.5), 11, dy-2*rect_spacing, fc=fc, lw=0, zorder=0, clip_on=False, alpha=0.3)
    ax.add_patch(Rect)
    # vertical centre of the band, where the group name is written
    mid = np.mean([y, dy+y])-0.5
    ax.text(10.8, mid, group, color=fc, size=11, fontweight='semibold', fontstyle='italic',
             va='center', ha='left', clip_on=False, zorder=0)


# inverted y axis: the first parent of Parents_order is at the top
ax.set_ylim(12.5,-0.5)
ax.set_yticks(range(13))
ax.set_yticklabels([])

ax.set_xlim(-0.5, 10.5)
ax.set_xticks([])


# parent labels on the left of the axes: strain name, then mating type and group
for p in Parents_order:

    group, mat, c = parents_background.loc[p, ['group', 'mat', 'count']]

    # bold name when the 'count' of the strain in parents_background is above 1
    if c > 1:
        fw = 'bold'
    else:
        fw = 'normal'

    ax.text(-1, Parents_order[p]-0.1, f'{p}', size=8, color='k', ha='right', va='center', weight=fw)
    ax.text(-1, Parents_order[p]+0.35, f'($MAT${mat_alias[mat]}, {group_alias[group]})', size=5, color='k', ha='right', va='center')

for i in ['right','top']:
    ax.spines[i].set_visible(False)


# saving and display are disabled; the figure is closed without being shown
#plt.savefig(f'{fig_path}Fig2A.svg', dpi=300)
#plt.show()
plt.close()

# Plot stats from the MA lines assemblies

In [None]:
# Total assembly size (bp) of each parental genome: sum of the lengths of all the
# sequences in its '<subgenome>.chromosomes.fasta' file
parent_size = {}
for subg in ns_subg['subg'].unique():
    fasta_path = f'/home/mathieu/paradoxus_nanopore/MA_parents/assemblies/{subg}.chromosomes.fasta'
    chrom_lengths = [len(record.seq) for record in SeqIO.parse(fasta_path, 'fasta')]
    parent_size[subg] = np.sum(chrom_lengths)

In [None]:
#get statistics on assemblies
def _n50(contig_sizes):
    """Return the N50 of a collection of contig sizes.

    The N50 is the size of the contig at which the cumulative size, taken from the
    largest contig downwards, first reaches half of the total size. Returns 0 when
    there is no contig.
    """
    sizes = np.sort(np.asarray(contig_sizes, dtype=np.int64))[::-1]
    if sizes.size == 0:
        return 0
    cumulative = np.cumsum(sizes)
    # index of the first contig whose cumulative size is >= half of the total
    return int(sizes[np.searchsorted(cumulative, cumulative[-1]/2)])


# One row per subgenome assembly that passes the identity filter: total size, N50 and
# number of contigs of the medaka consensus.
# The contigs have to be walked in decreasing size order to get the N50; looking them up
# by label in a sorted frame walks them in file order instead.
assembl_stats = []
for (cross, subg, s), df in ns_subg.loc[ns_subg['identity_filter']].groupby(['cross','subg','s_subg']):
    #cross = cross_alias[cross]
    query = f'/mnt/HDD3/lrma/medaka/{s}/consensus.fasta'

    contig_sizes = [len(seq.seq) for seq in SeqIO.parse(query, 'fasta')]
    Size = int(np.sum(contig_sizes))
    n = len(contig_sizes)
    N50 = _n50(contig_sizes)

    assembl_stats.append([cross, cross_order[cross], subg, f'{cross}.{subg}', s, Size, N50, n])

assembl_stats = pd.DataFrame(assembl_stats, columns=['cross','co','subg','cross_subg','s_subg','size','N50','n'])
# strain name = part of the '<strain>.<subgenome>' identifier before the first dot
assembl_stats['strain'] = assembl_stats['s_subg'].apply(lambda x: x.split('.')[0])
# strain sort key: the integer that follows the first character of the strain name
assembl_stats['so'] = assembl_stats['strain'].apply(lambda x: int(x[1:]))
assembl_stats.index = assembl_stats['s_subg'].values

In [None]:
# Size of each assembly as a percentage of the size of the matching parental assembly
assembl_stats['size_ratio'] = assembl_stats['size'] / assembl_stats['subg'].map(parent_size) * 100

# N50 expressed in Mb and on a log10 scale
_n50_bp = assembl_stats['N50']
assembl_stats['N50_Mb'] = _n50_bp*1e-6
assembl_stats['N50_log'] = np.log10(_n50_bp)

In [None]:
def _median_q05_q95(values):
    """Return [median, 5% quantile, 95% quantile] of values."""
    return [np.median(values), np.quantile(values, 0.05), np.quantile(values, 0.95)]


# Summary of the number of contigs and of the N50: one row per cross, followed by one
# 'global' row computed over all the assemblies, for each metric
_summary_rows = []
for m in ['n', 'N50']:
    for cross, df in assembl_stats.groupby('cross'):
        _summary_rows.append([cross, m] + _median_q05_q95(df[m]))
    _summary_rows.append(['global', m] + _median_q05_q95(assembl_stats[m]))

assembl_summary = pd.DataFrame(_summary_rows, columns=['cross', 'metric', 'median', 'q005', 'q095'])

## Fig 2C

In [None]:
# Order of the 'cross.subgenome' categories on the x axis of Fig 2C.
# Keep the first row of assembl_stats for every (cross order, subgenome) combination
cross_subgenome_order = assembl_stats.groupby(['co','subg']).apply(lambda x: x.iloc[0]).reset_index(drop=True)
# mating type of the parent behind each subgenome
cross_subgenome_order['mat'] = parents_background.loc[cross_subgenome_order['subg'], 'mat'].values
# subgenome name taken from the '<strain>.<subgenome>' identifier
cross_subgenome_order['subg'] = cross_subgenome_order['s_subg'].apply(lambda x: x.split('.')[1])
for cross, df in cross_subgenome_order.groupby('cross'):
    # rank of each subgenome = position of the parent in the 'parent1,parent2' string
    p1, p2 = Cross_parents[cross].split(',')
    subg_idx = pd.Series({p1:0, p2:1})
    cross_subgenome_order.loc[df.index, 'subg_order'] = subg_idx.loc[df['subg']].values
# from here on the name holds a Series of 'cross.subgenome' labels, sorted by cross order
# and then by parent rank
cross_subgenome_order = cross_subgenome_order.sort_values(by=['co', 'subg_order'])['cross_subg']

In [None]:
# Fig 2C: distribution of three assembly metrics per cross and subgenome, one panel per
# metric (size ratio, log10 N50, number of contigs)
fig, axes = plt.subplots(ncols=3, figsize=[10,4], gridspec_kw=dict(top=0.93, left=0.08, bottom=0.33, right=0.98, wspace=0.3))

for m, m_alias, ax in zip(['size_ratio','N50_log', 'n'],
                          ['Assembly size ratio (%)','log$_{10}$ N50 (bp)' ,'Number of contigs'], axes):

    # white boxes with outlier markers hidden (fliersize=0); the individual assemblies
    # are drawn by the strip plot below
    sns.boxplot(x='cross_subg', y=m, data=assembl_stats, color='w',
                #scale='width',
                fliersize=0,
                order=cross_subgenome_order, ax=ax)
    # every 'cross.subgenome' category takes the colour of its cross
    palette = {cs:cross_color[cs.split('.')[0]] for cs in assembl_stats['cross_subg'].unique()}

    sns.stripplot(x='cross_subg', y=m, data=assembl_stats, palette=palette, alpha=0.5, hue='cross_subg',
                order=cross_subgenome_order, ax=ax)

    # hue is only used for colouring: drop the legend it creates
    ax.legend_.remove()
    ax.set_xticklabels(cross_subgenome_order, rotation=90, size=8)
    ax.set_xlabel('')
    ax.set_ylabel(m_alias)

sns.despine()

# saving and display are disabled; the figure is closed without being shown
#plt.savefig(f'{fig_path}Fig2C.svg', dpi=300)
#plt.show()
plt.close()

## Supp Table S2

In [None]:
# Supp Table S2: assembly statistics sorted by cross order, strain number and subgenome
_assembl_stats_sorted = assembl_stats.sort_values(by=['co','so','subg'])
_assembl_stats_sorted.to_csv(f'{tables_path}assembly_stats.csv')

In [None]:
# Sort the reads of two example libraries between the two parental subgenomes of their
# cross, from the alleles they carry at parent-discriminating SNPs.
# NOTE: sort_counts() is defined in the next cell of this notebook and has to be run
# before this one.
SORT = {}
for s in ['J17','C20']:
    cross = nano_strains.loc[s, 'cross']
    # Cross_parents is the Series of 'parent1,parent2' strings; cross_parents (lower
    # case) is a DataFrame, whose rows cannot be split
    s1, s2 = Cross_parents[cross].split(',')

    # per-read alignment attributes, keyed by read name (a later alignment of the same
    # read overwrites an earlier one)
    read_len = {}
    query_len = {}
    qual = {}
    secondary = {}
    nm = {}
    aln_pos = {}
    # per-read counts of SNP alleles matching each parent
    reads = {}

    # the context manager closes the BAM file once both passes over it are done
    with pysam.AlignmentFile(f'/mnt/HDD3/lrma/minimap_sort/{s}.sort.minimap.bam', 'rb') as bam:
        # first pass: alignment-level statistics
        for i in bam:

            name = i.query_name
            read_len[name] = i.infer_read_length()
            query_len[name] = i.infer_query_length()
            secondary[name] = i.is_secondary
            aln_pos[name] = i.reference_start
            # -1 flags secondary alignments, for which no mean quality is computed
            if i.is_secondary:
                qual[name] = -1
            else:
                qual[name] = np.mean(i.query_alignment_qualities)

            # edit distance from the NM tag, -1 when the tag is absent
            tags = dict(i.get_tags())
            nm[name] = tags.get('NM', -1)

        # NOTE(review): `cross` holds the aliased cross ID at this point (nano_strains was
        # translated with cross_alias) -- confirm that the private variants files are
        # named after the aliased IDs.
        tab = pd.read_csv(f'/mnt/HDD3/lrma/private_variants/private_variants_{cross}.tab', sep='\t', header=None,
                         dtype={0:str,1:np.int32,2:str,3:str,4:str,5:str,6:str})
        # turn the genotype columns into alleles: column 4 for homozygous '1' genotypes,
        # column 3 otherwise (presumably ALT and REF -- confirm against the file format)
        for gt_col in (5,6):
            tab[gt_col] = np.where(tab[gt_col].isin(['1|1','1/1']), tab[4], tab[3])
        # positions shifted by -1 to compare them with the 0-based pileup positions
        tab.index = tab[1].values-1

        # second pass: count, for every read, the SNP alleles matching each parent
        idx = 0
        with ProgressBar(max_value=tab.shape[0]) as bar:
            for tig, tab_sub in tab.groupby(0):
                #iterate pileups per position of the contig
                for pileup in bam.pileup(tig):
                    i = pileup.reference_pos
                    # if position is a snp that allows to discriminate parents, continue
                    if i in tab_sub.index:
                        # assumes columns 5 and 6 are in the same order as (s1, s2) -- TODO confirm
                        gt1, gt2 = tab_sub.loc[i, [5,6]]

                        for (rid, nt) in zip(pileup.get_query_names(), pileup.get_query_sequences()):
                            # init reads dict
                            if rid not in reads:
                                reads[rid] = {s1:0, s2:0}
                            nt = nt.upper()
                            if nt == gt1:
                                reads[rid][s1] += 1
                            elif nt == gt2:
                                reads[rid][s2] += 1
                        idx += 1
                        bar.update(idx)

    # one row per read, one column of counts per parent
    sort = pd.DataFrame(reads).T
    #sort['binom'] = sort.apply(lambda x: stats.binom_test(x[s1], x[s1]+x[s2], 0.5), axis=1)
    # parent the read is assigned to, or -1 when it is left unclassified
    sort['sort'] = sort.apply(lambda x : sort_counts(x, s1, s2), axis=1)

    # alignment attributes, aligned on the read names
    sort['rl'] = pd.Series(read_len).loc[sort.index]
    sort['log_rl'] = np.log10(sort['rl'])
    sort['ql'] = pd.Series(query_len)
    sort['log_ql'] = np.log10(sort['ql'])
    sort['sec'] = pd.Series(secondary)
    sort['nm'] = pd.Series(nm)
    sort['qual'] = pd.Series(qual)
    sort['pos'] = pd.Series(aln_pos)
    # log10 of the SNP counts (+1 so that zero counts are defined)
    sort['log0'] = np.log10(sort[s1]+1)
    sort['log1'] = np.log10(sort[s2]+1)
    # edit distance divided by the aligned query length
    sort['%nm'] = sort['nm']/sort['ql']

    #sort['corr_pval'] = multipletests(sort['binom'], alpha=0.05, method='fdr_bh')[0]
    SORT[s] = sort

In [None]:
def sort_counts(x, s1, s2):
    """Assign a read to a parent from its counts of discriminating SNP alleles.

    Parameters
    ----------
    x : row (pandas Series) holding the counts under the labels s1 and s2
    s1, s2 : labels of the two parents

    Returns
    -------
    s1 or s2 when the larger of the two counts is at least 2 and the count of that
    parent is at least twice the count of the other one; -1 otherwise.
    """
    n1, n2 = x[[s1, s2]]
    # too few informative SNPs to call the read
    if np.max([n1, n2]) < 2:
        return -1
    if n1 >= 2*n2:
        return s1
    if n2 >= 2*n1:
        return s2
    # neither parent is supported at least twice as much as the other
    return -1

In [None]:
# Write the names of the reads assigned to each category (each parent, and -1 for
# unclassified reads) to '<strain>.<category>.reads', one name per line.
# Every library of SORT is processed explicitly instead of relying on the `s` and `sort`
# variables left over by the sorting loop. The 'corr_pval' filter is only applied when
# that column exists: its computation is commented out in the sorting cell.
for s, sort in SORT.items():
    if 'corr_pval' in sort.columns:
        selected = sort.loc[sort['corr_pval']]
    else:
        selected = sort
    for gt, df in selected.groupby('sort'):
        with open(f'/mnt/HDD3/lrma/sort/{s}.{gt}.reads', 'w') as handle:
            handle.write('\n'.join(df.index))

In [None]:
# Example of read sorting for the libraries of SORT: one column of panels per library,
# with the number of reads per category on top and, below, the per-read SNP counts of
# both parents
fig, axes = plt.subplots(ncols=2, nrows=2, gridspec_kw={'height_ratios':(1,4), 'wspace':0.4}, figsize=[8,5])
for (s, sort), ax in zip(SORT.items(), axes.T):

    cross = nano_strains.loc[s, 'cross']
    # Cross_parents is the Series of 'parent1,parent2' strings; cross_parents (lower
    # case) is a DataFrame, whose rows cannot be split
    s1, s2 = Cross_parents[cross].split(',')

    # -1 is the category of unclassified reads
    palette={s1:'blue',s2:'red',-1:'0.3'}

    sns.scatterplot(data=sort, x=s1, y=s2, hue='sort', edgecolor=(1,1,1,0), ax=ax[1],
                   palette=palette, alpha=0.1, s=10)
    ax[1].set_xlabel(f'snps {s1}')
    ax[1].set_ylabel(f'snps {s2}')

    bar = sort.value_counts('sort')
    # one bar per category actually present (a library may lack one of the three)
    bar_pos = range(len(bar))
    ax[0].bar(bar_pos, bar, color=[palette[i] for i in bar.index])
    ax[0].set_xticks(bar_pos)
    ax[0].set_xticklabels(bar.index)
    ax[0].set_ylabel('sorted reads')
    # `cross` may already be an aliased ID (nano_strains is translated with cross_alias),
    # in which case it is used as is
    ax[0].set_title(f'{s} ({cross_alias.get(cross, cross)})')

sns.despine()
plt.savefig('/mnt/HDD3/lrma/fig/sort_example.png', dpi=300)
plt.show()
plt.close()

### correlate numbers of reads in mappings to sorted subgenomes

In [None]:
# Parse the summary numbers ('SN' lines) of the mapping stats report of each library.
# An 'SN' line is tab-separated: the second field is the metric name, the third its value.
map_sort_stats = {}
for s in nano_strains.loc[nano_strains['cross']!='P'].index:
    with open(f'/mnt/HDD3/lrma/minimap_sort/{s}.sort.minimap.stats') as handle:
        sn_fields = [line.split('\t')[1:3] for line in handle.read().splitlines() if line[:2]=='SN']
        # values containing a '.' are parsed as floats, the others as integers
        map_sort_stats[s] = {metric: (float(value) if '.' in value else int(value))
                             for metric, value in sn_fields}

In [None]:
# Lengths of all the reads sorted into each subgenome, as one array per '<strain>.<subgenome>'
read_lengths = {}
with ProgressBar(max_value=ns_subg.shape[0]) as bar:
    for idx, s in enumerate(ns_subg.index, start=1):
        fastq_path = f'/mnt/HDD3/lrma/sort/{s}.reads.fastq'
        read_lengths[s] = np.array([len(record.seq) for record in SeqIO.parse(fastq_path, 'fastq')])
        bar.update(idx)

In [None]:
#with open('/mnt/HDD3/lrma/sort/rl.pkl', 'wb') as handle:
#    pkl.dump(read_lengths, handle)

In [None]:
# Reload the read lengths from their pickle cache; this replaces any value of
# read_lengths computed in the session.
# NOTE: unpickling can execute arbitrary code -- only load a cache file written by this notebook.
_rl_cache = '/mnt/HDD3/lrma/sort/rl.pkl'
with open(_rl_cache, 'rb') as cache_handle:
    read_lengths = pkl.load(cache_handle)

In [None]:
#parse read counts from sorted subgenomes
for s in ns_subg.index:
    ns_subg.loc[s, 'bases_subg'] = read_lengths[s].sum()

In [None]:
# Total number of bases of each whole library, from its seqkit stats report
# (whitespace-separated table whose 'sum_len' values use ',' as thousands separator).
# The table is not named `stats`, which would shadow the scipy `stats` module imported
# at the top of the notebook, and the counts are parsed as 64-bit floats: 32-bit floats
# cannot represent integers above 2**24 exactly.
for s, df in ns_subg.groupby('strain'):
    seqkit_stats = pd.read_csv(f'/mnt/HDD3/lrma/seqkit_stats/{s}.stats', engine='python', sep=' +', header=0)
    sum_len = seqkit_stats['sum_len'].apply(lambda x: np.float64(str(x).replace(',','')))
    # the same library total is written on every subgenome row of the strain
    ns_subg.loc[df.index, 'bases_tot'] = sum_len.sum()

## Fig 2B

In [None]:
# Percentage of the bases of each library sorted into the subgenome of each parent, as a
# table with one row per (strain, ploidy, cross) and one column per mating type
ns_subg_pivot = (
    ns_subg.loc[ns_subg['identity_filter']==True]
    .sort_values(by='bases_subg')
    .assign(bases_ratio=lambda rows: rows['bases_subg']/rows['bases_tot']*100)
    .pivot_table(index=['strain','ploidy','cross'], columns='mat', values='bases_ratio')
)
# percentage of the library classified in either subgenome
ns_subg_pivot['classified'] = ns_subg_pivot['a'] + ns_subg_pivot['alpha']

In [None]:
# Fig 2B: percentage of each library sorted into the MATa parent subgenome (x) against
# the MATalpha parent subgenome (y); colour = cross, marker = ploidy
fig, ax = plt.subplots(figsize=[5,4], 
                       gridspec_kw={'right':0.78, 'top':0.95, 'bottom':0.15})
# marker used for each ploidy level
ploidy_symbol = {2:'o', 3:'^', 4:"s"}


for (cross, ploidy), df in ns_subg_pivot.groupby(['cross','ploidy']):
    c = cross_color[cross]
    m = ploidy_symbol[ploidy]

    # fully transparent face ('#0f0f0f00'): only the coloured edge is visible
    ax.scatter(df['a'], df['alpha'], c='#0f0f0f00', edgecolor=c, marker=m, s=64)

# guide lines for subgenome ratios of 1:1, 1:2 and 2:1
xlim = np.array([10, 70])
ax.plot(xlim, xlim, lw=1, c='k', zorder=-1)
ax.text(60, 56, '1:1')
ax.plot(xlim, xlim*2, lw=1, c='k', zorder=-1)
ax.text(36, 68, '1:2')
ax.plot(xlim, xlim/2, lw=1, c='k', zorder=-1)
ax.text(56, 25, '2:1')

# dashed diagonals of constant a + alpha (40 to 100 % in steps of 10), i.e. constant
# total percentage of classified bases
for x in np.linspace(40, 100, 7):
    ax.plot(xlim, -xlim+x, c='k', lw=0.5, ls='--', zorder=-1)
    ax.text(3*x/5+1, 2*x/5, f'{x:.0f}%', size=8)

ax.set_xlim(xlim)
# NOTE: the first set_ylim call has no lasting effect, it is overridden by the second one
ax.set_ylim(15, 70)
ax.set_ylim(xlim)

ax.set_xlabel(f'MAT{mat_alias["a"]} parent subgenome (%)')
ax.set_ylabel(f'MAT{mat_alias["alpha"]} parent subgenome (%)')

# legends
# first legend: crosses (colours); add_artist keeps it when the second legend is created
legend_elms1 = [Line2D([0], [0], color=cross_color[cross], lw=2, label=cross) for cross in cross_order]
legend1 = plt.legend(handles=legend_elms1, loc='upper left', bbox_to_anchor=[1.04, 1], handlelength=1.5)
ax.add_artist(legend1)

# second legend: ploidy levels (markers)
legend_elms2 = [Line2D([0], [0], color='w', marker=s, mec='k', mfc='#0f0f0f00', ms=8, label=f'{p}n') for p,s in ploidy_symbol.items()]
legend2 = plt.legend(handles=legend_elms2, loc='upper left', bbox_to_anchor=[1.04, 0.25])
ax.add_artist(legend2)

sns.despine()
# saving and display are disabled; the figure is closed without being shown
#plt.savefig(f'{fig_path}Fig2B.svg', dpi=300)
#plt.show()
plt.close()

## Fig S3A

In [None]:
# Subgenome metadata indexed by '<strain>.<subgenome>', completed with the assembly
# statistics (matched on that index) and the log10 of the number of sorted bases
_assembly_metrics = {metric: assembl_stats[metric] for metric in ['n','size','N50']}
ns_subg_reindex = ns_subg.set_index('s_subg').assign(**_assembly_metrics)
ns_subg_reindex['log_bases_subg'] = np.log10(ns_subg_reindex['bases_subg'])

In [None]:
# Fig S3A: number of contigs of each subgenome assembly against the (log10) number of
# bases sorted into that subgenome, coloured by cross
fig, ax = plt.subplots(figsize=[4,4])
sns.scatterplot(x='log_bases_subg', y='n', hue='cross', palette=cross_color, 
                hue_order=cross_order, data=ns_subg_reindex, ax=ax)
ax.set_xlabel('Bases sorted in subgenome (log$_{10}$)')
ax.set_ylabel('Number of contigs')
# hue is only used for colouring: drop the legend it creates
ax.legend_.remove()

# panel letter, in figure coordinates
fig.text(0.02, 0.92, 'A', size=24, fontweight='bold', font='Arial')

sns.despine()
plt.tight_layout()

# saving and display are disabled; the figure is closed without being shown
#plt.show()
#plt.savefig(f'{fig_path}Supp_FigS3A.svg', dpi=300)
plt.close()

# Origin of unclassified reads

In [None]:
# Identifiers ('<strain>.-1') of the sets of unclassified reads of four selected libraries
S_unmapped = [f'{strain}.-1' for strain in ('J44', 'L64', 'B71', 'D93')]

## Import sorted read stats for selected libraries

In [None]:
# Lengths of the unclassified reads of the selected libraries, as one array per identifier
read_lengths_unmapped = {
    s_unmapped: np.array([len(record.seq)
                          for record in SeqIO.parse(f'/mnt/HDD3/lrma/sort/{s_unmapped}.reads.fastq', 'fastq')])
    for s_unmapped in S_unmapped
}

## Fig S1

In [None]:
# Fig S1: cumulative distributions of read lengths for four selected libraries (one panel
# each), with one curve per parental subgenome and one for the unclassified reads
fig, axes = plt.subplots(ncols=2, nrows=2, figsize=[8,6], 
                         gridspec_kw={'wspace':1.1, 'hspace':0.7,
                                      'left':0.08, 'bottom':0.08, 'right':0.8, 'top':0.91})

# strains are paired with their cross ID and with a (row, column) position in the grid
for s, cross, ax_idx in zip(['L64', 'J44', 'D93', 'B71'], ['CC3', 'CC1', 'BC2', 'BA1'],
                    itertools.product(range(2), range(2))):

    ax = axes[ax_idx]

    # the subgenomes of the strain, sorted by mating type, followed by its unclassified
    # reads; the flags assume two subgenome rows per strain
    for s_subg, unclassified in zip(list(ns_subg.loc[ns_subg['strain']==s].sort_values(by='mat')['s_subg'].values) + [f'{s}.-1'],
                              [False, False, True]):

        if unclassified:
            rld = read_lengths_unmapped[f'{s}.-1']
            #label = s_subg.replace('.-1', '.unclassified')
            l = 'Unclassified'
        else:
            rld = read_lengths[s_subg]
            #label = s_subg
            subg = s_subg.split('.')[-1]
            mat = parents_background.loc[subg, 'mat']
            ploidy = ns_subg.loc[s_subg, 'ploidy_subg']
            l = f'{subg}\n(MAT{mat_alias[mat]}, {ploidy:.0f}n)'

        # legend entry: name, total number of bases (Mb) and median read length (kb)
        label =  f'{l}\ntotal: {rld.sum()*1e-6:.1f} Mb\nmedian: {np.median(rld)*1e-3:.1f} kb'

        # empirical cumulative distribution: sorted log10 lengths against 0-100 %
        ax.plot(np.sort(np.log10(rld)), np.linspace(0, 100, rld.shape[0]), label=label)

    ax.legend(fontsize=8, labelspacing=0.9, handlelength=2, loc='center left', bbox_to_anchor=[0.9, 0.5])

    # title: strain, cross and summed percentage of bases classified in the two subgenomes
    ax.set_title(f'{s}\n({cross}, {ns_subg_pivot.loc[s, ["a", "alpha"]].iloc[0].sum():.1f}% classified)')
    ax.set_xticks(range(2, 6))
    ax.set_xlim(1.8,5)

    ax.set_xlabel('Read length (log$_{10}$ bp)')
    ax.set_ylabel('% reads')

sns.despine()

# saving and display are disabled; the figure is closed without being shown
#for ext in ['svg', 'png']:
#    plt.savefig(f'{fig_path}Supp_FigS1.{ext}', dpi=300)
#plt.show()
plt.close()

## Unclassified reads position bias

In [None]:
# Reference genome used for the sort mappings: sequence records by contig name, and
# position of every contig on a concatenated genome coordinate
genome = {}
_tig_sizes = []
for record in SeqIO.parse('/home/mathieu/paradoxus_nanopore/paradoxus4/assemblies/paradoxus4_bc02_smartdenovo_pilon-nanopolish_reordered.fasta', 'fasta'):
    genome[record.id] = record
    _tig_sizes.append([record.id, len(record.seq)])

# columns: 0 = contig name, 1 = contig length
tig_offsets = pd.DataFrame(_tig_sizes)
# offset of a contig = summed length of the contigs that come before it in the file
tig_offsets['off'] = tig_offsets[1].cumsum() - tig_offsets[1]
# last position of the contig on the concatenated coordinate
tig_offsets['end'] = tig_offsets['off'] + tig_offsets[1] - 1
tig_offsets.index = tig_offsets[0].values

# Table of contigs per chromosome (column 0 = chromosome name, whose characters after the
# third one are its number); 'color' alternates between consecutive chromosome numbers:
# 0 for odd numbers, 1 for even numbers
tig_chrom = pd.read_csv('/home/mathieu/paradoxus_nanopore/paradoxus4/barcode02/smartdenovo/mauve/tig_rearrangement.txt', sep='\t', header=None)
for chrom, tigs in tig_chrom.groupby(0):
    tig_chrom.loc[tigs.index, 'color'] = 1 - int(chrom[3:]) % 2

In [None]:
# Depth of coverage along the genome, in windows, for the reads sorted into each
# subgenome and for the unclassified reads of four selected libraries

POS_unclassified = []
window = 1e4
# window edges on the concatenated genome coordinate
_window_edges = np.arange(0, 11.84e6, window)
for s in ['L64', 'J44', 'D93', 'B71']:

    strain_rows = ns_subg.loc[ns_subg['strain']==s]
    # ploidy of the line = sum of the ploidies of its subgenomes
    ploidy = strain_rows['ploidy_subg'].sum()
    read_sets = list(strain_rows.sort_values(by='mat')['s_subg'].values) + [f'{s}.-1']

    window_counts = []
    for s_subg in read_sets:
        # columns: 0 = contig name, 1 = position on the contig
        positions = pd.read_csv(f'/mnt/HDD3/lrma/sort/{s_subg}.pos.txt', sep='\t', header=None)
        #add offsets
        genome_pos = positions[1] + tig_offsets.loc[positions[0], 'off'].values
        counts = np.histogram(genome_pos, bins=_window_edges)[0]
        window_counts.append(pd.Series(counts, name=s_subg))

    # rows = read sets, columns = windows
    window_counts = pd.concat(window_counts, axis=1).T
    # scale so that the median of the summed counts per window equals the ploidy
    window_counts = window_counts/np.median(window_counts.sum(axis=0))*ploidy
    POS_unclassified.append(window_counts)

POS_unclassified = pd.concat(POS_unclassified)

## Fig S2

In [None]:
# heatmap of coverage along the genome for the selected example lines: one row per
# parental subgenome and one row for the unclassified reads of each line
fig = plt.figure(figsize=[7,4])

ax = fig.add_axes([0.22, 0.17, 0.65, 0.8])
cbar_ax = fig.add_axes([0.89, 0.35, 0.03, 0.3])

dat = POS_unclassified
# diverging colour map centred on a relative depth of 1
sns.heatmap(dat, cmap='coolwarm', vmin=0, vmax=4, center=1, ax=ax, cbar_ax=cbar_ax)

# chromosome track below the heatmap, in alternating shades of grey
chrom_color = {0:'0.7', 1:'0.85'}
for (chrom, c), df1 in tig_chrom.groupby([0,'color']):
    # rows of tig_offsets for the contigs of this chromosome (contig names carry a '_pilon' suffix)
    df1 = tig_offsets.loc[df1[1].apply(lambda x: f'{x}_pilon')]
    # start and length of the chromosome in window units (heatmap columns)
    start, length = 1/window * np.array([df1.iloc[0]['off'], df1[1].sum()])
    # headless arrow used as a horizontal bar, drawn outside of the axes (clip_on=False)
    ar = FancyArrow(start, dat.shape[0]+2, length, 0, width=0.8, head_width=0.8,
                    fc=chrom_color[c], lw=0, length_includes_head=True, clip_on=False, head_length=0)
    ax.add_patch(ar)
    # chromosome number in roman numerals, centred on the bar
    ax.text(start+0.5*length, dat.shape[0]+2, toRoman(int(chrom[3:])), 
            ha='center', va='center', size=7, color='k')

    # white separator at the end of the chromosome
    ax.axvline(start+length, lw=1, color='white')

# white separator every three rows (two subgenomes + unclassified reads per line)
for i in np.arange(0, dat.shape[0]+1, 3):
    ax.axhline(i, lw=1, color='white')

# one tick per Mb, labelled in Mb
xticks = np.arange(0, dat.shape[1], 1e6/window)
ax.set_xticks(xticks)
ax.set_xticklabels([f'{(i*window/1e6):.0f}' for i in xticks], rotation=0, size=8)
ax.set_xlabel('Position (Mb)')
ax.set_yticks(np.array(range(dat.shape[0]))+0.5)
ax.set_yticklabels([i.replace('.-1', '.unclassified') for i in dat.index], size=8)

cbar_ax.set_ylabel('Relative depth of coverage (X)')

# saving and display are disabled; the figure is closed without being shown
#for ext in ['svg', 'png']:
#    plt.savefig(f'{fig_path}Supp_FigS2.{ext}', dpi=300)

#plt.show()
plt.close()