In [7]:
import pandas as pd
import altair as alt
import numpy as np

In [8]:
# Colour palettes for the figure panels. Each table pairs an assembly/mode
# label with its hex colour; the parallel lists consumed by the chart colour
# scales (domain = labels, range = colours) are derived from the tables so the
# two lists cannot drift out of step.
_linelle_palette = [
    ('HiFi-only (alt)', '#a6cee3'),
    ('HiFi-only (pri)', '#1f78b4'),
    ('HiC hap1', '#b2df8a'),
    ('HiC hap2', '#33a02c'),
    ('Trio hap1 (pat)', '#fb9a99'),
    ('Trio hap2 (mat)', '#e31a1c'),
]
modes_linelle = [label for label, hex_code in _linelle_palette]
colors_linelle = [hex_code for label, hex_code in _linelle_palette]

_byung_palette = [
    ('HiFi-only', '#1f78b4'),
    ('HiFi-Hic', '#33a02c'),
    ('HiFi-trio', '#e31a1c'),
    ('CLR', '#ff7f00'),
]
modes_byung = [label for label, hex_code in _byung_palette]
colors_byung = [hex_code for label, hex_code in _byung_palette]

## Linelle's graphs

In [9]:
def hapmer_graph(df, title='A'):
    """Scatter contigs by paternal vs. maternal hapmer count on log-log axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns read by the encodings below:
        ``pat_hapmer``, ``mat_hapmer``, ``Assembly`` and ``Size``.
    title : str, optional
        Chart title (the panel letter). Defaults to ``'A'``, the value that
        was previously hard-coded, so existing ``hapmer_graph(df)`` calls
        produce the same chart.

    Returns
    -------
    alt.Chart
        An 800x800 point chart. Point colour follows ``Assembly`` using the
        module-level ``modes_linelle`` / ``colors_linelle`` lists and point
        size follows ``Size``. Marks falling outside the axis domains are
        clipped.
    """
    # Both axes use the same log scale and tick marks, so state them once.
    # NOTE(review): the tick list jumps from 10,000 to 1,000,000 (there is no
    # 100,000 tick). Kept unchanged here -- confirm whether the gap is intended.
    hapmer_domain = (10, 2000000)
    hapmer_ticks = [10, 100, 1000, 10000, 1000000]

    def hapmer_channel(channel, field, axis_title):
        # A fresh Scale/Axis is built per channel instead of sharing instances.
        return channel(
            field,
            scale=alt.Scale(type='log', domain=hapmer_domain),
            axis=alt.Axis(values=hapmer_ticks),
            title=axis_title,
        )

    graph = alt.Chart(df, title=title).mark_point(clip=True).encode(
        x=hapmer_channel(alt.X, 'pat_hapmer:Q', "Paternal hapmers"),
        y=hapmer_channel(alt.Y, 'mat_hapmer:Q', "Maternal hapmers"),
        color=alt.Color(
            'Assembly',
            scale=alt.Scale(domain=modes_linelle, range=colors_linelle),
        ),
        size=alt.Size(
            'Size:Q',
            scale=alt.Scale(range=[100, 2000], domain=(10, 20000000)),
        ),
    ).properties(
        height=800,
        width=800,
    )
    return graph


#.configure_axis(
#        labelFont='Arial',
#        labelFontSize = 30,
#        titleFont='Arial',
#        titleFontSize=30
#    ).configure_legend(
#        orient='top-left',
#        fillColor='white',
#        labelFont='Arial',
#        labelFontSize=23,
#        title=None,
#        symbolSize=200,
#        symbolStrokeWidth=5,

In [10]:
# Load the tab-separated per-contig hapmer count table. The preview in the next
# cell shows the columns Assembly, Contig, pat_hapmer, mat_hapmer and Size.
blob = pd.read_csv('hicsolotrio.allcounts.10x.count',sep='\t')

In [11]:
blob.head()

Unnamed: 0,Assembly,Contig,pat_hapmer,mat_hapmer,Size
0,bTaeGut2.trim.HiC.hic.hap1.p_ctg,h1tg000001l,5,1012249,11321021
1,bTaeGut2.trim.HiC.hic.hap1.p_ctg,h1tg000002l,467388,203,14597216
2,bTaeGut2.trim.HiC.hic.hap1.p_ctg,h1tg000003l,20,654014,4314514
3,bTaeGut2.trim.HiC.hic.hap1.p_ctg,h1tg000004l,755214,172,5769299
4,bTaeGut2.trim.HiC.hic.hap1.p_ctg,h1tg000005l,849658,872,18748363


In [12]:
# Raw assembly identifier (as it appears in the `Assembly` column of the count
# table) -> display label used in the figure legend.
name_map = {
    'bTaeGut2.solo.p2': 'HiFi-only (alt)',
    'bTaeGut2.solo.p1': 'HiFi-only (pri)',
    'bTaeGut2.trim.HiC.hic.hap1.p_ctg': 'HiC hap1',
    'bTaeGut2.trim.HiC.hic.hap2.p_ctg': 'HiC hap2',
    'bTaeGut2.trio.cutadapt.20211115.dip.hap1.p_ctg': 'Trio hap1 (pat)',
    'bTaeGut2.trio.cutadapt.20211115.dip.hap2.p_ctg': 'Trio hap2 (mat)',
}

In [13]:
# Relabel raw assembly identifiers with their display names in one vectorised
# pass instead of one boolean-mask assignment per key. Values that are not keys
# of name_map are left unchanged, exactly as with the previous loop.
blob['Assembly'] = blob['Assembly'].replace(name_map)

In [14]:
# Keep only contigs carrying a meaningful number of hapmers from BOTH parents:
# more than 50 of each, and neither parent contributing more than 95% of the
# contig's combined hapmer count. Every condition is evaluated per row, so a
# single combined mask selects the same rows as filtering step by step.
blob = blob.loc[
    lambda counts: (counts['pat_hapmer'] > 50)
    & (counts['mat_hapmer'] > 50)
    & (counts['mat_hapmer'] / (counts['mat_hapmer'] + counts['pat_hapmer']) <= .95)
    & (counts['pat_hapmer'] / (counts['mat_hapmer'] + counts['pat_hapmer']) <= .95)
]

In [15]:
# Build panel A from the relabelled, filtered table (hapmer_graph titles the
# chart 'A'); the bare name on the last line makes the notebook display it.
panel_a = hapmer_graph(blob)
panel_a

In [16]:
# Load the tab-separated table plotted in panel B. The preview in the next cell
# shows the columns Copies, kmer_multiplicity, Count, Assembly and ordering.
all_2copy = pd.read_csv('all_2copy.tsv',sep='\t')

In [17]:
all_2copy.head()

Unnamed: 0,Copies,kmer_multiplicity,Count,Assembly,ordering
1,2,20,19707,Trio hap1 (pat),1
2,2,21,20797,Trio hap1 (pat),1
3,2,22,21937,Trio hap1 (pat),1
4,2,23,22992,Trio hap1 (pat),1
5,2,24,24552,Trio hap1 (pat),1


In [18]:
def line_plot(df,cats,cols,title):
    """Draw k-mer multiplicity spectra as thick lines, one colour per assembly.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns read by the encodings below:
        ``kmer_multiplicity``, ``Count`` and ``Assembly``.
    cats : list
        Assembly labels used as the colour-scale domain.
    cols : list
        Colours used as the colour-scale range.
    title : str
        Chart title.

    Returns
    -------
    alt.Chart
        An 800x800 line chart with fixed axis domains and tick values; marks
        falling outside the domains are clipped.
    """
    multiplicity_axis = alt.X(
        'kmer_multiplicity:Q',
        title='k-mer multiplicity',
        axis=alt.Axis(values=[50,60,70,80,90]),
        scale=alt.Scale(domain=(49,91)),
    )
    count_axis = alt.Y(
        'Count:Q',
        title='Count',
        axis=alt.Axis(values=[0,500000,1000000,1500000]),
        scale=alt.Scale(domain=(0,2000000)),
    )
    assembly_colour = alt.Color(
        'Assembly',
        scale=alt.Scale(domain=cats, range=cols),
    )

    spectrum = alt.Chart(df, title=title).mark_line(
        clip=True, strokeWidth=20, opacity=.75
    )
    spectrum = spectrum.encode(
        x=multiplicity_axis,
        y=count_axis,
        color=assembly_colour,
    )
    return spectrum.properties(height=800, width=800)

In [19]:
# Panel B: line plot of the all_2copy table, coloured with the Linelle palette
# and titled 'B'; the bare name on the last line displays it.
panel_b = line_plot(all_2copy,modes_linelle,colors_linelle,'B')
panel_b

In [20]:
# Load the tab-separated table plotted in panel C. The preview in the next cell
# shows the same columns as all_2copy, with Copies set to 'read-only'.
all_readonly = pd.read_csv('all_readonly.tsv',sep='\t')

In [21]:
all_readonly.head()

Unnamed: 0,Copies,kmer_multiplicity,Count,Assembly,ordering
1,read-only,20,4929927,HiC hap1,
2,read-only,21,5330840,HiC hap1,
3,read-only,22,5753258,HiC hap1,
4,read-only,23,6179953,HiC hap1,
5,read-only,24,6603988,HiC hap1,


In [22]:
# Panel C: same line plot as panel B, drawn from the all_readonly table and
# titled 'C'; the bare name on the last line displays it.
panel_c = line_plot(all_readonly,modes_linelle,colors_linelle,'C')
panel_c

In [23]:
# Panels A-C side by side. Axis, legend and title styling is applied once on
# the combined figure rather than on each panel.
(
    alt.hconcat(panel_a, panel_b, panel_c, center=True)
    .configure_axis(
        labelFont='Arial',
        labelFontSize=30,
        titleFont='Arial',
        titleFontSize=30,
    )
    .configure_legend(
        orient='top-left',
        fillColor='white',
        title=None,
        labelFont='Arial',
        labelFontSize=23,
        symbolSize=1000,
        symbolStrokeWidth=20,
    )
    .configure_title(
        font='Arial',
        fontSize=40,
        color='black',
        anchor='end',
        dx=0,
        dy=42,
    )
)

## ByungJuneKo figures

In [24]:
# Load the tab-separated table for panel D. `names` supplies the column labels
# (mode, prct), so the first line of the file is read as data.
dup = pd.read_csv('Kmer_Dupl.txt',sep='\t',names=['mode','prct'])

In [25]:
dup

Unnamed: 0,mode,prct
0,CLR,1.87237
1,HiFi-only,1.59629
2,HiFi-Hic,0.892503
3,HiFi-trio,0.726224


In [26]:
# Panel D: one bar per mode showing `prct`, in a fixed left-to-right mode order.
# The colour encoding maps each mode onto the Byung palette with the legend
# hidden; the mark-level color='green' is kept from the original definition.
panel_B_a = (
    alt.Chart(dup)
    .mark_bar(color='green', opacity=.7)
    .encode(
        x=alt.X('mode', sort=["CLR", "HiFi-only", "HiFi-Hic", "HiFi-trio"], title=None),
        y=alt.Y('prct', title="Proportion of k-mers (%)"),
        color=alt.Color(
            'mode',
            legend=None,
            scale=alt.Scale(domain=modes_byung, range=colors_byung),
        ),
    )
    .properties(
        width=800,
        height=800,
        title=alt.TitleParams(
            text='D',
            font='Arial',
            fontSize=40,
            color='black',
            anchor='end',
            dx=0,
        ),
    )
)

panel_B_a

In [27]:
# Load the tab-separated table for panel E. `names` supplies the column labels
# (mode, prct, tp), so the first line of the file is read as data; the display
# in the next cell shows tp taking the values 'Expansion' and 'Collapse'.
exp_col = pd.read_csv('Kmer_ExpCollap.txt',sep='\t',names=['mode','prct','tp'])

In [28]:
exp_col

Unnamed: 0,mode,prct,tp
0,CLR,10.8603,Expansion
1,HiFi-only,9.40783,Expansion
2,HiFi-Hic,10.1396,Expansion
3,HiFi-trio,9.55756,Expansion
4,CLR,9.8146,Collapse
5,HiFi-only,6.53933,Collapse
6,HiFi-Hic,6.37083,Collapse
7,HiFi-trio,6.79628,Collapse


In [29]:
# Panel E: bars of `prct` per `tp` category, faceted into one column per mode.
# Each facet is 200 wide and 800 tall. The earlier definition set the title
# twice (Chart(title='E') and TitleParams) and set width=800 only to override
# it with width=200 in a second .properties() call; only the values that took
# effect are kept here, so the resulting chart is unchanged.
panel_B_b = (
    alt.Chart(exp_col)
    .mark_bar(opacity=.7)
    .encode(
        column=alt.Column(
            'mode:N',
            header=alt.Header(orient="bottom", labelFontSize=40),
            title=None,
            sort=['CLR', 'HiFi-only', 'HiFi-Hic', 'HiFi-trio'],
        ),
        x=alt.X('tp:N', title=None),
        y=alt.Y('prct', title="Proportion of k-mers (%)"),
        color=alt.Color(
            'mode',
            scale=alt.Scale(domain=modes_byung, range=colors_byung),
            legend=None,
        ),
        # Opacity is additionally keyed on the tp category; its legend is hidden.
        opacity=alt.Opacity('tp:N', title=None, legend=None),
    )
    .properties(
        width=200,
        height=800,
        title=alt.TitleParams(
            text='E',
            fontSize=40,
            font='Arial',
            anchor='end',
            color='black',
            dx=-1,
        ),
    )
)

panel_B_b

In [30]:
# Load the paternal and maternal completeness tables (tab-separated). `names`
# supplies the column labels (tp, k_dup, k_com), so the first line of each file
# is read as data.
pat = pd.read_csv('Paternal_completeness.txt',sep='\t',names=['tp','k_dup','k_com'])
mat = pd.read_csv('Maternal_completeness.txt',sep='\t',names=['tp','k_dup','k_com'])

In [31]:
pat

Unnamed: 0,tp,k_dup,k_com
0,Default,0.87,79.3481
1,Rebinned,0.83,80.5694


In [32]:
mat

Unnamed: 0,tp,k_dup,k_com
0,Default,4.37,77.3701
1,Rebinned,3.71,77.1426


In [33]:
def pat_mat(df,lower,upper,lower2,upper2,title):
    """Overlay k-mer duplication (red) and completeness (blue) lines on twin y-axes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns read by the encodings below: ``tp`` (x),
        ``k_dup`` and ``k_com``.
    lower, upper : number
        y-scale domain for the ``k_dup`` (duplication) line.
    lower2, upper2 : number
        y-scale domain for the ``k_com`` (completeness) line.
    title : str
        Title of the layered chart.

    Returns
    -------
    alt.LayerChart
        A 300x800 two-layer chart whose y scales are resolved independently,
        so each line gets its own axis.
    """
    def thick_line(colour, x_channel, field, domain, y_title):
        # Both layers share the same look: a thick translucent line with large
        # filled point markers, and a y-axis title in the line's own colour.
        marker = {
            "filled": True,
            "fill": colour,
            "size":800
        }
        y_channel = alt.Y(
            field,
            scale=alt.Scale(domain=domain),
            title=y_title,
            axis=alt.Axis(titleColor=colour),
        )
        base = alt.Chart(df).mark_line(
            color=colour,
            opacity=.7,
            strokeWidth=20,
            point=marker,
        )
        return base.encode(x=x_channel, y=y_channel)

    # The x channels differ on purpose of fidelity: the duplication layer sets
    # title=None explicitly, the completeness layer leaves the title unset.
    dup_line = thick_line(
        'red', alt.X('tp',title=None), 'k_dup', (lower, upper),
        "K-mer duplications (%)",
    )
    col_line = thick_line(
        'blue', alt.X('tp'), 'k_com', (lower2, upper2),
        "K-mer completeness (%)",
    )

    layered = alt.layer(dup_line, col_line).resolve_scale(y='independent')
    return layered.properties(width=300, height=800, title=title)

In [37]:
# Panel F: the `pat` table (titled "Male") next to the `mat` table (titled
# "Female"), both drawn with the same y ranges (0-5 and 76-81) so the two
# sub-plots are directly comparable.
panel_B_c = alt.hconcat(
    pat_mat(pat, 0, 5, 76, 81, "Male"),
    pat_mat(mat, 0, 5, 76, 81, "Female"),
).properties(
    title=alt.TitleParams(
        text='F',
        font='Arial',
        fontSize=40,
        color='black',
        anchor='end',
        dx=-45,
    )
)
panel_B_c

In [38]:
# Load the tab-separated false-duplication table. `names` supplies the column
# labels (mode, prct, mb), so the first line of the file is read as data.
# Despite its name, this frame feeds the chart titled 'G' below.
panel_d = pd.read_csv('False_Duplication.txt',sep='\t',names=['mode','prct','mb'])

In [39]:
panel_d

Unnamed: 0,mode,prct,mb
0,CLR,1.29,13.8
1,HiFi-only,0.6,6.7
2,HiFi-Hic,0.21,2.4
3,HiFi-trio,0.12,1.3


In [40]:
# Panel G: bars of `prct` per mode with the `mb` value printed as white text
# on each bar.
bars = (
    alt.Chart(panel_d)
    .mark_bar(color='green', opacity=.7)
    .encode(
        x=alt.X('mode', title=None, sort=['CLR', 'HiFi-only', 'HiFi-Hic', 'HiFi-trio']),
        y=alt.Y('prct', title="Proportion of false duplications (%)"),
        color=alt.Color(
            'mode',
            legend=None,
            scale=alt.Scale(domain=modes_byung, range=colors_byung),
        ),
    )
)

# The text is anchored at the bar's value and pushed 45 px down (dy=45).
# NOTE(review): this layer titles the x axis "Mode" while the bar layer sets
# title=None -- confirm which of the two is intended for the shared axis.
labels = (
    alt.Chart(panel_d)
    .mark_text(color='white', dy=45, fontSize=80)
    .encode(
        x=alt.X('mode', title="Mode", sort=['CLR', 'HiFi-only', 'HiFi-Hic', 'HiFi-trio']),
        y=alt.Y('prct'),
        text=alt.Text('mb'),
    )
)

panel_B_d = (bars + labels).properties(
    width=800,
    height=800,
    title=alt.TitleParams(
        text='G',
        font='Arial',
        fontSize=40,
        color='black',
        anchor='end',
        dx=0,
    ),
)

panel_B_d

In [41]:
# Load the tab-separated heat-map table. `names` supplies the column labels
# (x, y, v1..v5), so the first line of the file is read as data.
panel_e_hm = pd.read_csv('False_Loss_HM_HapFiltered_Rotated.txt',sep='\t',names=['x','y','v1','v2','v3','v4','v5'])

In [42]:
# Reshape wide -> long: keep x and y as identifiers and stack v3, v4 and v5
# into `variable` / `value` columns (v1 and v2 are dropped).
# NOTE: this overwrites panel_e_hm, so the cell cannot be re-run without first
# re-running the read_csv cell above (the v* columns no longer exist afterwards).
panel_e_hm = panel_e_hm.melt(id_vars=['x','y'], value_vars=['v3','v4','v5'])

In [43]:
panel_e_hm

Unnamed: 0,x,y,variable,value
0,CLR,CLR,v3,0.0
1,CLR,HiFi-only,v3,0.047499
2,CLR,HiFi-Hic,v3,0.038318
3,CLR,HiFi-trio,v3,0.033233
4,HiFi-only,CLR,v3,0.000175
5,HiFi-only,HiFi-only,v3,0.0
6,HiFi-only,HiFi-Hic,v3,0.000617
7,HiFi-only,HiFi-trio,v3,0.000947
8,HiFi-Hic,CLR,v3,3.1e-05
9,HiFi-Hic,HiFi-only,v3,0.000112


In [44]:
# Heat map: one rectangle per (x, y) mode pair, shaded by `value` on a grey
# colour scheme, with both axes in the same fixed mode order.
# NOTE(review): panel_e_hm is melted from three value columns (v3, v4, v5), so
# each (x, y) pair presumably has three rows whose rectangles are drawn on top
# of one another -- confirm that this is intended.
hm = (
    alt.Chart(panel_e_hm)
    .mark_rect()
    .encode(
        x=alt.X('x:N', sort=["CLR", "HiFi-only", "HiFi-Hic", "HiFi-trio"], title=None),
        y=alt.Y('y:N', sort=["CLR", "HiFi-only", "HiFi-Hic", "HiFi-trio"], title=None),
        color=alt.Color('value:Q', scale=alt.Scale(scheme='greys')),
    )
    .properties(height=800, width=800)
)


hm

In [45]:
# Load the tab-separated false-loss summary table. `names` supplies the column
# labels (mode, frac, prct, mb), so the first line of the file is read as data.
panel_e_top = pd.read_csv('False_Loss_HapFiltered.txt',sep='\t',names=['mode','frac','prct','mb'])

In [46]:
panel_e_top

Unnamed: 0,mode,frac,prct,mb
0,CLR,0.039684,3.968351,42.3
1,HiFi-only,0.00058,0.05797,0.7
2,HiFi-Hic,0.000148,0.014826,0.2
3,HiFi-trio,0.003992,0.399241,4.3


In [47]:
# Marginal bar chart that sits above the heat map: `prct` per mode on a fixed
# 0-6 y scale. The x axis is suppressed (axis=None) and the y title is given as
# a two-element list so it wraps onto two lines.
bars_e = (
    alt.Chart(panel_e_top)
    .mark_bar(color='green', opacity=.7)
    .encode(
        x=alt.X('mode', sort=["CLR", "HiFi-only", "HiFi-Hic", "HiFi-trio"], title="Mode", axis=None),
        y=alt.Y('prct', scale=alt.Scale(domain=[0, 6]), title=["Proportion of", "false losses (%)"]),
        color=alt.Color(
            'mode',
            legend=None,
            scale=alt.Scale(domain=modes_byung, range=colors_byung),
        ),
    )
)

# `mb` values as black text positioned at each bar's value and shifted 60 px
# upwards (dy=-60).
labels_e = (
    alt.Chart(panel_e_top)
    .mark_text(color='black', align="center", baseline="top", dy=-60, fontSize=60)
    .encode(
        x=alt.X('mode', sort=["CLR", "HiFi-only", "HiFi-Hic", "HiFi-trio"], title="Mode", axis=None),
        y=alt.Y('prct'),
        text='mb',
    )
)

top_hist = (bars_e + labels_e).properties(
    width=800,
    height=200,
)

In [48]:
top_hist

In [49]:
# Panel H: the marginal bar chart stacked vertically on top of the heat map.
panel_B_e = alt.vconcat(top_hist, hm).properties(
    title=alt.TitleParams(
        text='H',
        font='Arial',
        fontSize=40,
        color='black',
        anchor='end',
        dx=-55,
    )
)
panel_B_e

In [50]:
# Panels D-F side by side, with one shared axis font configuration applied to
# the combined figure.
(
    alt.hconcat(panel_B_a, panel_B_b, panel_B_c)
    .configure_axis(
        labelFont='Arial',
        labelFontSize=30,
        titleFont='Arial',
        titleFontSize=30,
    )
)

In [51]:
# Panels G-H side by side, with shared axis and legend styling applied to the
# combined figure.
(
    alt.hconcat(panel_B_d, panel_B_e)
    .configure_axis(
        labelFont='Arial',
        labelFontSize=30,
        titleFont='Arial',
        titleFontSize=30,
    )
    .configure_legend(
        fillColor='white',
        title=None,
        labelFont='Arial',
        labelFontSize=40,
        symbolSize=200,
        symbolStrokeWidth=5,
    )
)