In [524]:
import pandas as pd
import numpy as np
import os
import pysam
import pickle as pkl
from matplotlib import pyplot as plt
from matplotlib.patches import FancyArrow
from matplotlib import cm
from matplotlib.lines import Line2D
import seaborn as sns
from Bio import SeqIO
from scipy import stats
from statsmodels.api import formula as smf
from progressbar import ProgressBar
from statsmodels.stats.multitest import multipletests
from roman import toRoman
import itertools
import intervaltree
from collections import Counter
import re
import gzip
# Global plot style. seaborn's style machinery only keeps rc entries whose key is a
# recognised style parameter; underscore spellings such as 'ytick_color' are not
# matplotlib rcParams names and are dropped without any error, so the tick and text
# colours must use the dotted rcParams names to take effect.
rc_sns = {'ytick.color':'k', 'xtick.color':'k', 'text.color':'k', 'font.sans-serif':'DejaVu Sans', 'figure.facecolor':(1,1,1,1)}
sns.set_style(style='ticks', rc=rc_sns)

In [3]:
#import tables of strain identities and cross parents
nano_strains = pd.read_csv('/mnt/HDD3/lrma/script/nano_strains.csv', index_col=0)
# The `squeeze` keyword of read_csv is deprecated (and removed in pandas 2.0);
# squeezing the column axis after parsing returns a Series when the parsed table
# has a single data column, and leaves it as a DataFrame otherwise.
cross_parents = pd.read_csv('/mnt/HDD3/lrma/private_variants/cross_parents.txt', sep=';', header=None, index_col=0).squeeze('columns')



  cross_parents = pd.read_csv('/mnt/HDD3/lrma/private_variants/cross_parents.txt', sep=';', header=None, index_col=0, squeeze=True)


In [4]:
# Labels found in the 'cross' columns and the names that replace them
# (applied with Series.replace further down).
_cross_labels = ['VL3', 'VL4', 'VL5', 'VL1', 'VL2', 'L1', 'L2', 'M1', 'M2', 'H1', 'H2']
_cross_names = ['CC1', 'CC2', 'CC3', 'BB1', 'BB2', 'BC1', 'BC2', 'BA1', 'BA2', 'BSc1', 'BSc2']
cross_alias = dict(zip(_cross_labels, _cross_names))

# Position of every cross name in the display order (name -> rank, starting at 0).
cross_order = {}
for _rank, _name in enumerate(_cross_names):
    cross_order[_name] = _rank

In [359]:
# Family name -> index into the tab10 colormap; several names share an index
# and therefore receive the same colour.
_ty_tab10_index = {'Ty1p':0, 'TY1':0, 'TY2':1, 'Ty3p':2, 'TY3':2, 'Tsu4':3, 'TY4':4, 'TY5':5, 'Ty5p':5}
ty_colors = {}
for _family, _idx in _ty_tab10_index.items():
    ty_colors[_family] = cm.tab10(_idx)

In [473]:
# Load the per-subgenome table, then rename the crosses of both strain tables
# with the cross_alias mapping.
ns_subg = pd.read_csv('/mnt/HDD3/lrma/script/ns_subg.csv', index_col=0)
for _table in (ns_subg, nano_strains):
    _table['cross'] = _table['cross'].replace(cross_alias)

In [343]:
# get parental depth profiles
# Every strain whose cross is 'P' has its indexcov table read. All tables are kept in
# `parental_depth` (strain -> DataFrame), since rebinding `bed` alone would retain only
# the last one. `bed` is still left bound to the last table read.
parental_depth = {}
for s in nano_strains.loc[nano_strains['cross']=='P', 'strain'].values:
    bed = pd.read_csv(f'/mnt/HDD3/lrma/indexcov/{s}/{s}-indexcov.bed.gz', sep='\t')
    parental_depth[s] = bed

In [344]:
# Display the table currently bound to `bed`.
bed

Unnamed: 0,#chrom,start,end,LL2011_001-ref
0,chrI,0,16384,0.0032
1,chrI,16384,32768,0.0000
2,chrI,32768,49152,2.0200
3,chrI,49152,65536,1.0700
4,chrI,65536,81920,0.2460
...,...,...,...,...
702,chrXVI,819200,835584,0.9610
703,chrXVI,835584,851968,1.1200
704,chrXVI,851968,868352,0.6130
705,chrXVI,868352,884736,0.3800


In [None]:
# import chromosomes and tig offsets

In [376]:
# tig_off[subg] is a table with one row per sequence of that subgenome's assembly
# (index = sequence id) and columns:
#   0       sequence id
#   1       sequence length
#   2       offset of the sequence start when all sequences are laid end to end
#   3       offset of the sequence end (column 1 + column 2)
#   'color' alternating 0/1 flag for consecutive sequences
tig_off = {}
for r in ns_subg['subg'].unique():
    path = f'/home/mathieu/paradoxus_nanopore/MA_parents/assemblies/{r}.chromosomes.rdna.fasta'
    # Build the frame from (id, length) records so that lengths and offsets keep an
    # integer dtype; concatenating mixed id/length Series and transposing yields
    # object-dtype columns.
    records = [(seq.id, len(seq.seq)) for seq in SeqIO.parse(path, 'fasta')]
    contigs = pd.DataFrame(records, columns=[0, 1])
    contigs.index = contigs[0].values
    # start offset = summed length of all preceding sequences
    contigs[2] = contigs[1].cumsum() - contigs[1]
    contigs[3] = contigs[1] + contigs[2]
    # alternate 0/1 for any number of sequences (no fixed upper bound)
    contigs['color'] = np.arange(contigs.shape[0]) % 2
    tig_off[r] = contigs

In [6]:
# tract definition done in a separate script for MA lines subgenomes
# Read the tracts table of every ns_subg entry passing the identity filter and
# stack them into a single frame with a fresh integer index.
_tract_ids = ns_subg.loc[ns_subg['identity_filter']==True].index
Tracts = pd.concat(
    [pd.read_csv(f'/mnt/HDD3/lrma/depth_tracts/{s}.tracts.csv', index_col=0) for s in _tract_ids]
).reset_index(drop=True)

In [7]:
# tracts_trees[s_subg][chrom] is an IntervalTree of the tracts of that subgenome on
# that chromosome; each interval carries (row label in Tracts, call) as its data.
_passing_s_subg = ns_subg.loc[ns_subg['identity_filter']==True, 's_subg']
tracts_trees = dict((s_subg, {}) for s_subg in _passing_s_subg)

for (s, chrom), df in Tracts.groupby(['s_subg', 'chrom']):
    tree = intervaltree.IntervalTree()
    for i in df.index:
        start, end, call = df.loc[i, ['start','end','call']].values
        tree.addi(start, end, (i, call))
    tracts_trees[s][chrom] = tree

In [8]:
#import REA
# og_data is re-indexed by the values of its own 'og' column (the column is kept).
og_data = pd.read_csv('/mnt/HDD3/lrma/results/og_data.csv', index_col=0)
og_data = og_data.set_index(og_data['og'].values)
REA = pd.read_csv('/mnt/HDD3/lrma/results/REA.csv', index_col=0)

  REA = pd.read_csv('/mnt/HDD3/lrma/results/REA.csv', index_col=0)


In [9]:
# Copy of REA indexed by (s_subg, og) and sorted on both levels; used below for
# (s_subg, og) lookups.
_rea_keys = ['s_subg','og']
rea_reindex = REA.set_index(_rea_keys).sort_index(level=_rea_keys)

In [10]:
# cross -> set of strain names, restricted to rows passing the identity filter
lines_per_cross = {
    cross: set(members['strain'].values)
    for cross, members in ns_subg.loc[ns_subg['identity_filter']==True].groupby('cross')
}

In [134]:
# Subset of REA columns shown when inspecting annotations.
display_rea = [
    's_subg', 'cross', 'subg', 'family',
    'solo', 'fl', 'tr', 'fl_tr', 'lift',
    'Query', 'Start', 'End', 'Strand',
    'og', 'mid', 'left_bound', 'right_bound', 'has_fl',
]

In [240]:
# One row per (cross, orthogroup) for orthogroups flagged has_fl in og_data:
#   n_lines      number of distinct s_subg carrying an annotation of the orthogroup
#   cmplx        True unless every s_subg has exactly one annotation
#   left_bound   smallest Start/End coordinate among the annotations
#   right_bound  largest Start/End coordinate among the annotations
_og_cross_rows = []
for (cross, og), df in REA.groupby(['cross','og']):
    if not og_data.loc[og, 'has_fl']:
        continue

    per_line = df['s_subg'].value_counts().values
    coords = df[['Start', 'End']].values
    _og_cross_rows.append([
        cross,
        og,
        df['s_subg'].nunique(),
        # if more than one entry per line, flag as complex
        set(per_line) != {1},
        coords.min(),
        coords.max(),
    ])

og_cross = pd.DataFrame(_og_cross_rows, columns=['cross', 'og', 'n_lines', 'cmplx', 'left_bound', 'right_bound'])

In [243]:
# get counts of orthogroups
# (number of REA rows with fl == True per orthogroup)
_fl_rows = REA['fl']==True
og_count = REA.loc[_fl_rows].value_counts('og')

In [248]:
# Build og_cn: one row per (line subgenome, orthogroup) combining the copy-number
# calls found at the two orthogroup bounds, annotation flags looked up in
# rea_reindex, and the presence of lifted flanks in the reverse-liftover bed files.
og_cn = []

for (cross, og, cmplx), df in og_cross.groupby(['cross', 'og', 'cmplx']):
    # subgenome, chromosome and bounds of the orthogroup are taken from og_data
    subg, chrom, left_bound, right_bound = og_data.loc[og, ['subg', 'chrom', 'left_bound', 'right_bound']]
    for s in lines_per_cross[cross]:
        # key used by tracts_trees and rea_reindex: '<strain>.<subg>'
        s = f'{s}.{subg}'
        
        # calls stay NaN unless exactly one tract covers the bound
        left_cn = np.nan
        right_cn = np.nan
        
        left_tract = tracts_trees[s][chrom][left_bound]
        if len(left_tract) == 1:
            # interval = (begin, end, data) with data = (Tracts row label, call)
            left_cn = list(left_tract)[0][2][1]
            
        right_tract = tracts_trees[s][chrom][right_bound]
        if len(right_tract) == 1:
            right_cn = list(right_tract)[0][2][1]
            
        # annotation flags: does this line have any REA row for the orthogroup, and
        # does at least one of them have fl_tr / fl set
        if (s, og) in rea_reindex.index:
            sub_rea= rea_reindex.loc[(s, og)]
            has_annot = True
            has_fltr = sub_rea['fl_tr'].sum() > 0
            has_fl = sub_rea['fl'].sum() > 0
        else:
            has_annot = False
            has_fltr = False
            has_fl = False
        
        og_cn.append([s, og, left_cn, right_cn, has_annot, has_fltr, has_fl, cmplx])
        
og_cn = pd.DataFrame(og_cn, columns=['s_subg', 'og', 'left_cn', 'right_cn', 'has_annot', 'has_fltr', 'has_fl', 'cmplx'])
# True only when both bounds carry the same call; NaN never compares equal, so a
# bound without a call makes the row inconsistent
og_cn['cons_cn'] = (og_cn['left_cn'] == og_cn['right_cn'])

# row-wise mean of the two calls (NaN values are skipped by DataFrame.mean by default)
og_cn['cn'] = og_cn[['left_cn','right_cn']].mean(axis=1)

#add assembly annotation from reverse liftover
for s, df in og_cn.groupby('s_subg'):
    # bed file indexed by its 4th column (column label 3), which is matched below
    # against flank names of the form '<chrom>_<start>_<end>'
    bed = pd.read_csv(f'/mnt/HDD3/lrma/reverse_liftover/{s}.flank_lift.bed', sep='\t', header=None).set_index(3)
    
    for i in df.index:
        og = df.loc[i, 'og']
        chrom, lb, rb = og_data.loc[og, ['chrom', 'left_bound', 'right_bound']]
               
        # left flank spans [lb-500, lb], right flank spans [rb, rb+500]
        has_assembly_left = f'{chrom}_{lb-500:.0f}_{lb:.0f}' in bed.index
        has_assembly_right = f'{chrom}_{rb:.0f}_{rb+500:.0f}' in bed.index
        
        # the two columns are created on first assignment and filled row by row
        og_cn.loc[i, 'has_assembly_left'] = has_assembly_left
        og_cn.loc[i, 'has_assembly_right'] = has_assembly_right

# True when both flanks are present or both are absent
og_cn['cons_as'] = (og_cn['has_assembly_left'] == og_cn['has_assembly_right'])
# og_count is looked up per row by orthogroup; private = orthogroup counted exactly once
og_cn['og_count'] = og_count.loc[og_cn['og']].values
og_cn['private'] = og_cn['og_count'] == 1

In [249]:
# export count summary to figure out classification logic
# (number of og_cn rows with consistent flanks per combination of the listed columns)
_summary_levels = ['has_assembly_left', 'has_annot', 'has_fltr', 'has_fl', 'cmplx', 'private', 'cons_cn', 'cn']
(
    og_cn.loc[og_cn['cons_as']==True]
    .value_counts(_summary_levels, sort=False)
    .rename('count')
    .reset_index()
    .to_csv('/mnt/HDD1/Dropbox/temp_sync/counts_og.csv')
)

In [None]:
# scrap

In [282]:
# Rows of og1255 where all listed flags are True, the orthogroup is not private
# and the copy number is 2.
_flags_true = ['cons_as', 'has_assembly_left', 'has_annot', 'has_fl', 'cmplx', 'cons_cn']
_keep = (og_cn[_flags_true]==True).all(axis=1)
_keep = _keep & (og_cn['private']==False) & (og_cn['cn']==2) & (og_cn['og']=='og1255')
og_cn.loc[_keep]#.value_counts('og')

Unnamed: 0,s_subg,og,left_cn,right_cn,has_annot,has_fltr,has_fl,cmplx,cons_cn,cn,has_assembly_left,has_assembly_right,cons_as,og_count,private
2146,D68.LL2011_009,og1255,2.0,2.0,True,True,True,True,True,2.0,True,True,True,30,False
2148,D75.LL2011_009,og1255,2.0,2.0,True,True,True,True,True,2.0,True,True,True,30,False
3602,K27.LL2011_009,og1255,2.0,2.0,True,True,True,True,True,2.0,True,True,True,30,False
3606,K22.LL2011_009,og1255,2.0,2.0,True,True,True,True,True,2.0,True,True,True,30,False
3607,K15.LL2011_009,og1255,2.0,2.0,True,True,True,True,True,2.0,True,True,True,30,False
4283,L48.LL2011_009,og1255,2.0,2.0,True,True,True,True,True,2.0,True,True,True,30,False
4285,L17.LL2011_009,og1255,2.0,2.0,True,True,True,True,True,2.0,True,True,True,30,False
4289,L34.LL2011_009,og1255,2.0,2.0,True,True,True,True,True,2.0,True,True,True,30,False


In [280]:
# Classification inputs of every line for og1255, ordered by s_subg.
_shown = ['s_subg','has_assembly_left', 'has_annot', 'has_fltr', 'has_fl', 'cmplx', 'private', 'cons_cn', 'cn']
_is_og1255 = og_cn['og']=='og1255'
og_cn.loc[_is_og1255, _shown].sort_values(by='s_subg')

Unnamed: 0,s_subg,has_assembly_left,has_annot,has_fltr,has_fl,cmplx,private,cons_cn,cn
2141,D12.LL2011_009,True,True,False,False,True,False,True,3.0
2145,D17.LL2011_009,True,True,True,True,True,False,True,1.0
2138,D25.LL2011_009,True,True,True,True,True,False,True,1.0
2144,D29.LL2011_009,True,True,True,True,True,False,True,1.0
2147,D42.LL2011_009,True,True,True,True,True,False,True,3.0
2139,D60.LL2011_009,True,True,True,True,True,False,True,1.0
2146,D68.LL2011_009,True,True,True,True,True,False,True,2.0
2142,D73.LL2011_009,True,True,True,True,True,False,True,1.0
2148,D75.LL2011_009,True,True,True,True,True,False,True,2.0
2140,D84.LL2011_009,True,True,True,True,True,False,True,1.0


In [283]:
# REA rows of og1255 (display columns only), ordered by s_subg then Start.
_rea_og1255 = REA['og']=='og1255'
REA.loc[_rea_og1255, display_rea].sort_values(by=['s_subg','Start'])

Unnamed: 0,s_subg,cross,subg,family,solo,fl,tr,fl_tr,lift,Query,Start,End,Strand,og,mid,left_bound,right_bound,has_fl
27735,D12.LL2011_009,BC2,LL2011_009,Ty1p,True,False,False,False,True,chrXII,550611.0,550295.0,-,og1255,550294.0,550088.0,556177.0,True
17622,D17.LL2011_009,BC2,LL2011_009,Ty1p,True,False,False,False,True,chrXII,550176.0,550089.0,-,og1255,550294.0,550088.0,556177.0,True
17623,D17.LL2011_009,BC2,LL2011_009,Ty1p,False,True,False,True,True,chrXII,556177.0,550295.0,-,og1255,550294.0,550088.0,556177.0,True
32083,D25.LL2011_009,BC2,LL2011_009,Ty1p,True,False,False,False,True,chrXII,550176.0,550089.0,-,og1255,550294.0,550088.0,556177.0,True
32084,D25.LL2011_009,BC2,LL2011_009,Ty1p,False,True,False,True,True,chrXII,556176.0,550295.0,-,og1255,550294.0,550088.0,556177.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34932,L48.LL2011_009,CC3,LL2011_009,Ty1p,False,True,False,True,True,chrXII,556176.0,550295.0,-,og1255,550294.0,550088.0,556177.0,True
25424,L54.LL2011_009,CC3,LL2011_009,Ty1p,True,False,False,False,True,chrXII,550176.0,550089.0,-,og1255,550294.0,550088.0,556177.0,True
25425,L54.LL2011_009,CC3,LL2011_009,Ty1p,False,True,False,True,True,chrXII,556176.0,550295.0,-,og1255,550294.0,550088.0,556177.0,True
27447,L64.LL2011_009,CC3,LL2011_009,Ty1p,True,False,False,False,True,chrXII,550175.0,550088.0,-,og1255,550294.0,550088.0,556177.0,True


In [505]:
# Fraction of og_cn rows in each (cons_as, cons_cn) combination. The denominator is
# the number of rows of og_cn itself rather than a hard-coded total, so the
# fractions remain correct when the table changes.
og_cn.value_counts(['cons_as','cons_cn'])/og_cn.shape[0]

cons_as  cons_cn
True     True       0.948137
False    True       0.031541
True     False      0.017570
False    False      0.002752
dtype: float64

In [232]:
# Rows matching every (column, value) requirement below.
_required = {'cons_as':True, 'has_assembly_left':False, 'has_annot':False,
             'has_fltr':False, 'cmplx':True, 'private':True}
_match = pd.Series(True, index=og_cn.index)
for _col, _val in _required.items():
    _match = _match & (og_cn[_col]==_val)
og_cn.loc[_match]

Unnamed: 0,s_subg,og,left_cn,right_cn,has_annot,has_fltr,cmplx,full,cons_cn,cn,has_assembly_left,has_assembly_right,cons_as,og_count,private


In [298]:
#import logic
# The logic table is indexed by the same columns, in the same order, that are used
# as grouping keys when classifying og_cn rows.
logic_levels = [
    'has_assembly_left', 'has_annot', 'has_fltr', 'has_fl',
    'cmplx', 'private', 'cons_cn', 'cn',
]
logic = pd.read_csv('/mnt/HDD3/lrma/results/logic.csv', index_col=0).set_index(logic_levels)

In [305]:
#classify orthogroups
# Every row starts as 'inconsistent'; rows with cons_as == True then receive the
# description that the logic table holds for their combination of logic_levels.
og_cn['class'] = 'inconsistent'
_consistent = og_cn.loc[og_cn['cons_as']==True]
for key, members in _consistent.groupby(logic_levels):
    og_cn.loc[members.index, 'class'] = logic.loc[key, 'description']

In [437]:
# Copy strain, cross and subgenome from ns_subg onto og_cn, matching each og_cn row
# through its s_subg label.
for _field in ('strain', 'cross', 'subg'):
    og_cn[_field] = ns_subg.loc[og_cn['s_subg'], _field].values

In [475]:
#summarize values
# Count rows per (strain, class), then add an explicit zero for every strain/class
# pair that never occurs so that each strain has a value for each class.
og_class_summary = og_cn.value_counts(['strain','class']).rename('count')
_all_pairs = itertools.product(og_cn['strain'].unique(), og_cn['class'].unique())
_missing_pairs = [pair for pair in _all_pairs if pair not in og_class_summary.index]
for pair in _missing_pairs:
    og_class_summary.loc[pair] = 0
og_class_summary = og_class_summary.reset_index()
# cross of each strain, looked up in nano_strains by strain label
og_class_summary['cross'] = nano_strains.loc[og_class_summary['strain'], 'cross'].values

In [350]:
# Classes shown in the figures, in plotting order, with their colours.
classes_color = {
    'gain_aneup_LOH': 'limegreen',
    'loss_aneup_LOH': 'red',
    'denovo': 'blue',
}
classes_order = list(classes_color)

In [531]:
# Two stacked panels restricted to the classes in classes_order:
#   top    - per-strain counts for each cross and class (strip plot)
#   bottom - mean count per cross and class (bar plot)
_plotted = og_class_summary.loc[og_class_summary['class'].isin(classes_order)]
# categorical settings shared by both panels
_cat_kws = dict(order=cross_order, hue_order=classes_order, palette=classes_color)

fig, axes = plt.subplots(nrows=2, figsize=[6,6])

sns.stripplot(x='cross', y='count', hue='class', dodge=True, alpha=0.5,
              data=_plotted, ax=axes[0], **_cat_kws)
axes[0].set_xlabel('')

dat = _plotted.groupby(['cross','class'])['count'].mean().rename('rate').reset_index()
sns.barplot(x='cross', y='rate', hue='class', data=dat, ax=axes[1], **_cat_kws)
axes[1].set_ylabel('rate (event line$^{-1}$)')
axes[1].set_xlabel('')

sns.despine()
plt.tight_layout()
plt.savefig('/home/mathieu/mhenault_landrylab/Publications/lrma/pre-draft/fig/counts_per_class_strains.png', dpi=300)
#plt.show()
plt.close()

In [482]:
# Rows classified as 'denovo', ordered by cross rank then strain.
og_denovo = og_cn.loc[og_cn['class']=='denovo'].copy()
og_denovo['cross_order'] = og_denovo['cross'].apply(cross_order.__getitem__)
og_denovo = og_denovo.sort_values(by=['cross_order', 'strain'])

# For each (s_subg, og), copy family and coordinates from the first REA row of that
# pair that has fl == True.
_copied_fields = ['family', 'Query', 'Start', 'End']
for (s_subg, og), df in og_denovo.groupby(['s_subg', 'og']):

    hits = rea_reindex.loc[(s_subg, og)]
    first_fl = hits.loc[hits['fl']==True].iloc[0]

    for field, value in zip(_copied_fields, first_fl.loc[_copied_fields]):
        og_denovo.loc[df.index, field] = value

In [530]:
# Genome map of de novo events: one horizontal track per s_subg, sequences drawn as
# alternating grey bars laid end to end, and one marker per event coloured by family.
fig, ax = plt.subplots(figsize=[10,7])
chrom_color = {0:'0.8', 1:'0.9'}

# track order: by cross rank, then s_subg
S_order = (
    og_denovo
    .value_counts(['s_subg','cross_order','cross','strain','subg'])
    .rename('count')
    .reset_index()
    .sort_values(by=['cross_order','s_subg'])
    .set_index('s_subg')
)
n_tracks = S_order.shape[0]
S_order['order'] = range(n_tracks)

for s_subg, df in og_denovo.groupby('s_subg'):

    y = S_order.loc[s_subg, 'order']

    # event markers, placed at the midpoint of Start/End shifted by the sequence offset
    for i in df.index:

        strain, subg, cross, fam, Chrom, Start, End = og_denovo.loc[i, ['strain', 'subg', 'cross', 'family', 'Query', 'Start', 'End']]
        x = np.mean([Start,End])+tig_off[subg].loc[Chrom, 2]
        ax.scatter(x, y, color=ty_colors[fam], zorder=1,
                   edgecolor='w', lw=0.5, s=64)

    # sequence bars for the subgenome of the last event handled above
    for (chrom, length, start, color), df1 in tig_off[subg].groupby([0,1,2,'color']):

        bar = FancyArrow(start, y, length, 0, width=0.7, head_width=1, zorder=0,
                         fc=chrom_color[color], lw=0, length_includes_head=True, clip_on=False, head_length=0)
        ax.add_patch(bar)
        # sequence names are written once, under the first track
        if y == 0:
            ax.text(start+0.5*length, y-0.5, chrom, ha='center', va='top', size=8, rotation=90, color='k')

track_labels = [
    f'{strain} subg. {subg} ({cross})'
    for strain, subg, cross in zip(S_order['strain'], S_order['subg'], S_order['cross'])
]
ax.set_yticks(range(n_tracks))
ax.set_yticklabels(track_labels)
ax.set_ylim([-2, n_tracks-0.5])

ax.set_xticks(np.arange(0, 12e6, 1e6))
ax.set_xticklabels(range(12))
ax.set_xlim(-2e5, 12e6)
ax.set_xlabel('Mb')

# one legend entry per family present in og_denovo
legend_elms = []
for fam in og_denovo['family'].unique():
    legend_elms.append(Line2D([0], [0], color='white', marker='o', mfc=ty_colors[fam], mew=0, ms=8, label=fam))
ax.legend(handles=legend_elms, ncol=2, loc=8, bbox_to_anchor=[0.5, 1])

sns.despine()
plt.tight_layout()
plt.savefig('/home/mathieu/mhenault_landrylab/Publications/lrma/pre-draft/fig/denovo_genome_map.png', dpi=300)
#plt.show()
plt.close()

In [None]:
# Per-chromosome depth overview: 2D histogram of normalised depth, per-bin medians
# (red points) and tract calls (green segments), one axes per chromosome.
# NOTE(review): `depth`, `median` and `depth_median` are not defined in any visible
# cell, and this cell has no execution count - presumably it relies on variables
# from another session; confirm where they come from before running.
fig, axes = plt.subplots(nrows=16, figsize=[18,30])

# chromosome name -> axes row, following the sorted names found in column 0 of depth
chrom_ax_dict = dict(zip(sorted(set(depth[0])), range(16)))

for chrom, df1 in depth.groupby(0):
    #if chrom == 'chrXII':
    
    ax = axes[chrom_ax_dict[chrom]]
    # column 2 is divided by `median` before plotting
    # NOTE(review): this assigns into a groupby sub-frame and may trigger a
    # SettingWithCopyWarning - confirm.
    df1[2] = df1[2]/median
    sns.histplot(x=1, y=2, data=df1, bins=[np.arange(0,1.5e6,5e3), np.linspace(0, 3, 30)], cmap='binary', ax=ax, zorder=0)

for chrom, df1 in depth_median.groupby(0):
    #if chrom == 'chrXII':
    ax = axes[chrom_ax_dict[chrom]]
    ax.scatter(df1['Bin'], df1['median'], alpha=0.5, s=6, color='red', zorder=1)
    #ax = axes[chrom_ax_dict[chrom]]
    #ax.axhline(1/hist_bin, lw=1, color='white')
    #ax.plot(df1['bin'].apply(lambda x: x.mid), df1['median']*median, c='red', lw=2, alpha=0.5)
    
# NOTE(review): Tracts is grouped by chromosome only, so tracts of every s_subg in
# the table are drawn on the same axes - confirm this is intended.
for chrom, df1 in Tracts.groupby('chrom'):
    #if chrom == 'chrXII':
    ax = axes[chrom_ax_dict[chrom]]
    for t in df1.index:
        start, end, call = df1.loc[t, ['start', 'end', 'call']]
        # horizontal segment spanning the tract at the height of its call
        ax.plot([start, end], [call, call], c='limegreen', lw=2, alpha=1, zorder=2)

    ax.set_title(chrom)
    ax.set_ylim(0, 5)
    
plt.show()
plt.close()