In [1]:
import pandas as pd
import numpy as np
import os
import pysam
import pickle as pkl
from matplotlib import pyplot as plt
from matplotlib.patches import FancyArrow
from matplotlib import cm
from matplotlib.lines import Line2D
import seaborn as sns
from Bio import SeqIO
from scipy import stats
from statsmodels.api import formula as smf
from progressbar import ProgressBar
from statsmodels.stats.multitest import multipletests
from roman import toRoman
import itertools
import intervaltree
from collections import Counter
import re
import gzip

In [2]:
# Global seaborn/matplotlib styling for every figure in this notebook:
# 'ticks' axes style with the DejaVu Sans font.
sns.set(style='ticks', font='DejaVu Sans')

In [3]:
#import tables of strain identities and cross parents
nano_strains = pd.read_csv('/mnt/HDD3/lrma/script/nano_strains.csv', index_col=0)
# `read_csv(..., squeeze=True)` is deprecated (pandas 1.4) and removed in pandas 2.0.
# Squeezing the parsed frame along the column axis gives the same result: a Series when
# the file holds a single data column next to the index column, a DataFrame otherwise.
cross_parents = pd.read_csv(
    '/mnt/HDD3/lrma/private_variants/cross_parents.txt',
    sep=';',
    header=None,
    index_col=0,
).squeeze('columns')



  cross_parents = pd.read_csv('/mnt/HDD3/lrma/private_variants/cross_parents.txt', sep=';', header=None, index_col=0, squeeze=True)


In [4]:
# Mapping from the cross labels used in the input tables to the labels used in this notebook.
cross_alias = dict([
    ('VL3', 'CC1'),
    ('VL4', 'CC2'),
    ('VL5', 'CC3'),
    ('VL1', 'BB1'),
    ('VL2', 'BB2'),
    ('L1', 'BC1'),
    ('L2', 'BC2'),
    ('M1', 'BA1'),
    ('M2', 'BA2'),
    ('H1', 'BSc1'),
    ('H2', 'BSc2'),
])
# Rank of each aliased cross label (0-based), usable as a sort key.
cross_order = {
    label: rank
    for rank, label in enumerate(
        ['CC1', 'CC2', 'CC3', 'BB1', 'BB2', 'BC1', 'BC2', 'BA1', 'BA2', 'BSc1', 'BSc2']
    )
}

In [5]:
# Load the per-sample table and translate the 'cross' column through cross_alias in one
# chained expression; values without an alias are left as they are.
ns_subg = (
    pd.read_csv('/mnt/HDD3/lrma/script/ns_subg.csv', index_col=0)
    .assign(cross=lambda table: table['cross'].replace(cross_alias))
)

In [None]:
# Count the rows of ns_subg for each combination of the two listed columns.
# NOTE(review): 'ref_chroder' is not referenced anywhere else in this notebook and this cell
# has no execution count — confirm the column name is spelled as in ns_subg.csv.
ns_subg.value_counts(['subg', 'ref_chroder'])

In [None]:
# step done in a separate script
# NOTE(review): this cell is a pasted excerpt and has no execution count. It is indented as
# if it were the body of an enclosing block that is not shown, and it uses `s` and a list
# `Tracts` that are not defined in the excerpt itself — presumably a loop over samples in
# the original script; confirm there before reusing.

    # Headerless tab-separated depth table; columns are addressed by position (0, 1, 2).
    # Presumably 0 = chromosome, 1 = position, 2 = depth — TODO confirm against the file.
    depth = pd.read_csv(f'/mnt/HDD3/lrma/depth/{s}.fil.depth.gz', sep='\t', header=None)
    # Median of column 2 over the whole table; used below as the normalisation baseline.
    median = depth[2].median()
    # Computed but not used anywhere in this excerpt.
    std = depth[2].std()

    # Window size for binning column 1: left-closed intervals of `wdw` from 1 up to 12e6.
    wdw = 30000
    depth['bin'] = pd.cut(depth[1], bins=pd.interval_range(start=1, end=12e6, freq=wdw, closed='left'))
    # Per (column 0, bin): median of column 2 divided by the table-wide median, rounded to
    # the nearest integer value. Bins without data yield NaN.
    depth_median = depth.groupby([0,'bin'])[2].apply(lambda x: np.round(np.median(x)/median)).rename('median').reset_index()
    # Midpoint of each bin, used as the x coordinate when plotting.
    depth_median['Bin'] = depth_median['bin'].apply(lambda x: x.mid)

    # Run-length encode the rounded calls: each tract is [list of depth_median row labels, call].
    tracts = []
    # NaN bins are dropped before scanning, so a run can continue across a missing bin.
    for chrom, df in depth_median.loc[~depth_median['median'].isna()].groupby(0):

        # -1 is a sentinel that forces the first bin to open a tract.
        latest = -1
        idx = 0

        new_tract = [[], latest]

        while idx <= df.shape[0]-1:
            i = df.index[idx]
            call = depth_median.loc[i, 'median']
            if call != latest:
                #if there is an active new tract, dump it first
                if len(new_tract[0]) > 0:
                    tracts.append(new_tract)
                #define new tract with the first index in
                new_tract = [[i], call]
                #update last 
                latest = call
                idx +=1
            else:
                # same call as the previous bin: extend the open tract
                new_tract[0].append(i)
                latest = call
                idx +=1
        #final dump
        tracts.append(new_tract)

    # Convert each run into coordinates: left edge of its first bin to right edge of its last.
    for (t, call) in tracts:
        start = depth_median.loc[t[0], 'bin'].left
        end = depth_median.loc[t[-1], 'bin'].right
        chrom = depth_median.loc[t[0], 0]
        Tracts.append([s, chrom, start, end, call])


In [6]:
# Read the precomputed tract table of every sample that passes identity_filter and stack
# them into a single frame with a fresh 0..n-1 index.
Tracts = pd.concat(
    [
        pd.read_csv(f'/mnt/HDD3/lrma/depth_tracts/{sample}.tracts.csv', index_col=0)
        for sample in ns_subg[ns_subg['identity_filter']==True].index
    ]
).reset_index(drop=True)

In [7]:
# Two-level lookup: tracts_trees[s_subg][chrom] is an IntervalTree whose intervals span
# [start, end) of each tract and carry (row label in Tracts, call) as payload.
passing_s_subg = ns_subg.loc[ns_subg['identity_filter']==True, 's_subg']
tracts_trees = {name: {} for name in passing_s_subg}
for (s_name, chrom_name), chrom_tracts in Tracts.groupby(['s_subg', 'chrom']):
    tree = intervaltree.IntervalTree()
    for row_label in chrom_tracts.index:
        begin, stop, tract_call = chrom_tracts.loc[row_label, ['start','end','call']].values
        # addi(begin, end, data) is what the slice-assignment form does
        tree.addi(begin, stop, (row_label, tract_call))
    tracts_trees[s_name][chrom_name] = tree

In [8]:
#import REA
REA = pd.read_csv('/mnt/HDD3/lrma/results/REA.csv', index_col=0)
og_data = pd.read_csv('/mnt/HDD3/lrma/results/og_data.csv', index_col=0)
og_data.index = og_data['og'].values

  REA = pd.read_csv('/mnt/HDD3/lrma/results/REA.csv', index_col=0)


In [9]:
# Copy of REA keyed by (s_subg, og) and sorted on both levels, for membership tests and
# row lookups per sample/og pair.
rea_reindex = REA.set_index(['s_subg','og'])
rea_reindex = rea_reindex.sort_index(level=['s_subg','og'])

In [10]:
# For each cross, the set of strain names among the samples that pass identity_filter.
lines_per_cross = {
    cross_label: set(members['strain'].values)
    for cross_label, members in ns_subg.loc[ns_subg['identity_filter']==True].groupby('cross')
}

In [11]:
# Subset of REA columns (and their order) shown when inspecting individual rows.
display_rea = [
    's_subg', 'cross', 'subg',
    'solo', 'fl', 'tr', 'fl_tr', 'lift',
    'Query', 'Start', 'End', 'Strand',
    'og', 'mid', 'left_bound', 'right_bound', 'has_fl',
]

In [12]:
# Summarise every (cross, og) group of fl_tr rows that contains at least one 'fl' entry.
og_cross_records = []
fl_tr_rows = REA.loc[REA['fl_tr']==True]
for (cross, og), group in fl_tr_rows.groupby(['cross','og']):
    if not (group['fl'].sum() > 0):
        continue
    n_lines = len(set(group['s_subg']))
    total = group.shape[0]
    coords = group[['Start', 'End']].values.flatten()
    og_cross_records.append({
        'cross': cross,
        'og': og,
        'n_lines': n_lines,
        'total': total,
        # every line of the cross carries the og
        'full': len(lines_per_cross[cross]) == n_lines,
        # if more than one entry per line, flag as complex
        'cmplx': total > n_lines,
        'left_bound': coords.min(),
        'right_bound': coords.max(),
    })

og_cross = pd.DataFrame(
    og_cross_records,
    columns=['cross', 'og', 'n_lines', 'total', 'full', 'cmplx', 'left_bound', 'right_bound'],
)

In [13]:
# Number of REA rows per 'og' value, as a Series indexed by og (sorted by descending count).
og_count = REA.value_counts('og')

In [14]:
def _single_tract_call(tree, position):
    """Return the call of the only tract overlapping `position`; NaN if zero or several overlap."""
    hits = tree[position]
    if len(hits) != 1:
        return np.nan
    (hit,) = hits
    # the interval payload is (row label in Tracts, call)
    return hit.data[1]


# One row per (line of the cross, og): tract call at each og bound plus annotation flags.
og_cn_rows = []

for (cross, og), _ in og_cross.groupby(['cross','og']):
    subg, chrom, left_bound, right_bound = og_data.loc[og, ['subg', 'chrom', 'left_bound', 'right_bound']]
    for strain in lines_per_cross[cross]:
        s_subg = f'{strain}.{subg}'
        chrom_tree = tracts_trees[s_subg][chrom]

        left_cn = _single_tract_call(chrom_tree, left_bound)
        right_cn = _single_tract_call(chrom_tree, right_bound)

        # is this og annotated in this sample, and does any of its entries have fl_tr set?
        has_annot = (s_subg, og) in rea_reindex.index
        if has_annot:
            has_fltr = rea_reindex.loc[(s_subg, og)]['fl_tr'].sum() > 0
        else:
            has_fltr = False

        og_cn_rows.append([s_subg, og, left_cn, right_cn, has_annot, has_fltr])

og_cn = pd.DataFrame(og_cn_rows, columns=['s_subg', 'og', 'left_cn', 'right_cn', 'has_annot', 'has_fltr'])
# both bounds fall in tracts with the same call (False whenever either side is NaN)
og_cn['cons_cn'] = (og_cn['left_cn'] == og_cn['right_cn'])
og_cn['og_count'] = og_count.loc[og_cn['og']].values

In [34]:
#add assembly annotation from reverse liftover
for s, df in og_cn.groupby('s_subg'):
    bed = pd.read_csv(f'/mnt/HDD3/lrma/reverse_liftover/{s}.flank_lift.bed', sep='\t', header=None).set_index(3)
    
    for i in df.index:
        og = df.loc[i, 'og']
        chrom, lb, rb = og_data.loc[og, ['chrom', 'left_bound', 'right_bound']]
               
        has_assembly_left = f'{chrom}_{lb-500:.0f}_{lb:.0f}' in bed.index
        has_assembly_right = f'{chrom}_{rb:.0f}_{rb+500:.0f}' in bed.index
        
        og_cn.loc[i, 'has_assembly_left'] = has_assembly_left
        og_cn.loc[i, 'has_assembly_right'] = has_assembly_right

In [37]:
# Among rows whose left and right tract calls agree (cons_cn), count each combination of
# left-flank assembly presence, annotation presence and fl_tr presence (unsorted).
og_cn.loc[og_cn['cons_cn']].value_counts(['has_assembly_left', 'has_annot', 'has_fltr'], sort=False)

has_assembly_left  has_annot  has_fltr
False              False      False        261
                   True       False         10
                              True          35
True               False      False        166
                   True       False        166
                              True        3841
dtype: int64

In [None]:
# 'og' values that occur in exactly one REA row.
og_count.loc[og_count==1]

In [17]:
# Rows where the tract call is 0 at both og bounds although the og is annotated in that
# sample, counted per og. The og_count filter is left disabled.
og_cn.loc[(og_cn['left_cn']==0) & (og_cn['right_cn']==0) & (og_cn['has_annot']==True)
         # & (og_cn['og_count']==1)
         ].value_counts('og')

og
og1070    10
og1006    10
og1357    10
og1068     8
og410      8
og3891     7
og1400     6
og1051     6
og144      5
og1503     3
og515      3
og1541     3
og172      2
og166      2
og1519     2
og1790     2
og1482     2
og353      2
og3886     2
og1092     2
og612      2
og3985     1
og3727     1
og1005     1
og457      1
og491      1
og6008     1
og801      1
og3658     1
og1718     1
og3452     1
og27       1
og261      1
og2593     1
og21       1
og1624     1
og1440     1
og1267     1
og1188     1
og1134     1
og1087     1
og1083     1
og903      1
dtype: int64

In [None]:
# Inspect every REA row of a single og (og1263), restricted to the display_rea columns.
REA.loc[(REA['og']=='og1263'), display_rea]

In [None]:
# Per-sample tract calls and annotation flags for the same og (og1263).
og_cn.loc[og_cn['og']=='og1263']

In [None]:
# Diagnostic figure, one panel per chromosome: 2D histogram of normalised depth (grey),
# per-bin rounded medians (red dots) and tract calls (green segments).
# NOTE(review): `depth`, `median` and `depth_median` are only defined in the non-executed
# "separate script" excerpt, so this cell fails on a fresh kernel unless they are recreated
# for one sample first.
# NOTE(review): `Tracts` holds the tracts of every sample, while `depth` is read for a
# single sample — confirm that overlaying all samples' tracts is intended. A chromosome
# present in Tracts but absent from `depth` would raise KeyError on chrom_ax_dict.
fig, axes = plt.subplots(nrows=16, figsize=[18,30])

# panel row for each chromosome label, in sorted label order (at most 16 panels)
chrom_ax_dict = dict(zip(sorted(set(depth[0])), range(16)))

for chrom, df1 in depth.groupby(0):
    ax = axes[chrom_ax_dict[chrom]]
    # Normalise on an explicit copy: writing into the groupby slice itself can raise
    # SettingWithCopyWarning.
    norm_depth = df1[[1, 2]].copy()
    norm_depth[2] = norm_depth[2]/median
    sns.histplot(x=1, y=2, data=norm_depth, bins=[np.arange(0,1.5e6,5e3), np.linspace(0, 3, 30)], cmap='binary', ax=ax, zorder=0)

for chrom, df1 in depth_median.groupby(0):
    ax = axes[chrom_ax_dict[chrom]]
    ax.scatter(df1['Bin'], df1['median'], alpha=0.5, s=6, color='red', zorder=1)

for chrom, df1 in Tracts.groupby('chrom'):
    ax = axes[chrom_ax_dict[chrom]]
    # one horizontal segment per tract at the height of its call
    for start, end, call in zip(df1['start'], df1['end'], df1['call']):
        ax.plot([start, end], [call, call], c='limegreen', lw=2, alpha=1, zorder=2)

    ax.set_title(chrom)
    ax.set_ylim(0, 5)

plt.show()
plt.close()