# Load Modules

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from plotly.subplots import make_subplots
from multiprocessing import Pool
from tqdm import tqdm
import networkx as nx
import os
import gzip
import glob


pio.templates.default = 'plotly_white'
pd.options.mode.chained_assignment = None

# Load Data

## Load Motif Search Results

In [87]:
# results of meme-fimo motif search in full aa_seq set
motif_search = pd.read_csv(
    "../results/global_enriched/motifs/motif_search.tsv", sep="\t",
    header = None, 
    names = [
        "motif_id", "motif_alt_id", "sequence_name", "start", "stop", 
        "strand", "score", "p-value", "q-value", "matched_sequence",
        "clique_idx"
    ]
)
motif_search

Unnamed: 0,motif_id,motif_alt_id,sequence_name,start,stop,strand,score,p-value,q-value,matched_sequence,clique_idx
0,VIPEELVEEVIP,MEME-1,t1542,1,12,+,39.3889,1.570000e-13,1.060000e-08,VIPEELVEEVIP,clique_0
1,VIPEELVEEVIP,MEME-1,t5110,1,12,+,39.3889,1.570000e-13,1.060000e-08,VIPEELVEEVIP,clique_0
2,VIPEELVEEVIP,MEME-1,t5036,1,12,+,39.3889,1.570000e-13,1.060000e-08,VIPEELVEEVIP,clique_0
3,VIPEELVEEVIP,MEME-1,t4921,1,12,+,39.3889,1.570000e-13,1.060000e-08,VIPEELVEEVIP,clique_0
4,VIPEELVEEVIP,MEME-1,t5106,2,13,+,39.3889,1.570000e-13,1.060000e-08,VIPEELVEEVIP,clique_0
...,...,...,...,...,...,...,...,...,...,...,...
32570,FLKLLNEGRYCK,MEME-2,t74657,48,59,+,20.8939,9.300000e-11,7.160000e-06,FLRLLNNEKTCK,clique_9
32571,FEKQKEKYEKEI,MEME-3,t51057,7,18,+,27.1972,9.570000e-11,4.420000e-06,FNKQKEKYKEEI,clique_9
32572,FEKQKEKYEKEI,MEME-3,t51056,32,43,+,27.1972,9.570000e-11,4.420000e-06,FNKQKEKYKEEI,clique_9
32573,FEKQKEKYEKEI,MEME-3,t56553,6,17,+,27.1549,9.870000e-11,4.520000e-06,FLKQKEKYKSEI,clique_9


## Load Enriched Set Clique Membership

In [88]:
import glob 

def seqReader(fn):
    """
    iterate through sequences and yield as generator
    """
    def openSeq(fn):
        if 'gz' in fn:
            return gzip.open(fn, 'rt')
        else:
            return open(fn, 'r')

    def num_iter(fn):
        if 'fastq' in fn or 'fq' in fn:
            return 4
        else:
            return 2

    n = num_iter(fn)

    with openSeq(fn) as f:
        while True:
            try:
                yield [next(f).strip('\n') for _ in range(n)]
            except StopIteration:
                break


def seq_set(clique_fn):
    clique_idx = clique_fn.strip().split("/")[-1].split(".")[0]
    
    sset = []
    for h,s in seqReader(clique_fn):
        sset.append({
            'clique_idx' : clique_idx.strip(), 
            'sequence_name' : h.strip(">"), 
            'sequence' : s
        })
    
    return sset

# get members of cliques (in enriched set)
clique_path = glob.glob("../results/global_enriched/cliques/*.fa")

clique_sset = []
for clique_fn in clique_path:
    sset = seq_set(clique_fn)
    for s in sset:
        clique_sset.append(s)

clique_sset = pd.DataFrame(clique_sset)

clique_sset

Unnamed: 0,clique_idx,sequence_name,sequence
0,clique_233,t215202,ENVNTTITGNDFSGGEFLWPGYTEELKAKKASEDAEKAANDAENAS...
1,clique_233,t214801,HLRDAGGNKIGPPASHAIPQMINNLVGEATQGAAEVAKKASESATA...
2,clique_289,t149561,KFFYKKVENTKNKYMNKKKFSTKSEDIINKNNNTTKGSTLGGENDL...
3,clique_289,t118859,TIEENQNNELEGTFKKLIVVVKELSDKNKELDEKEKKIKTYNGDIQ...
4,clique_214,t113780,EPNENSVVDRATDSMNLDPEKVHNENMSDPNTNTEPDASLKDDKKE...
...,...,...,...
2591,clique_78,t130135,NDHRNSMRNDHRNSMRNDQRNSMMNDQRNSMMNDQRNSMMNDQRNV...
2592,clique_78,t128340,DHRNDHRNDHRNDHRNDQRNDHRNSMRNDHRNSMRNDHRNSMRNDQ...
2593,clique_78,t130132,NNSMRGGYSMRNDQRNDQRNDQRNDHRNDHRNDHRNDHRNDHRNDH...
2594,clique_234,t209502,SGPGDVSFSSGEEPTLYLDELARPVPKPRPAKQPKPQPVKDLAGRK...


# Compare Motif and Clique Sets

In [89]:
data = []
for cm, subframe in motif_search.groupby(['clique_idx', 'motif_alt_id']):
    
    clique_idx, motif_idx = [i.strip() for i in cm]
    
    if motif_idx != "MEME-1":
        continue
    
    clique_sset_subset = clique_sset[clique_sset.clique_idx == clique_idx]
    
    found_subset = subframe.sequence_name.unique()
    enriched_subset = clique_sset_subset.sequence_name.unique()
    intersection = np.intersect1d(found_subset, enriched_subset)
    union = np.union1d(found_subset, enriched_subset)
    
    data.append({
        'clique_idx' : clique_idx,
        'found_size' : found_subset.size,
        'enriched_size' : enriched_subset.size,
        'ix_size' : intersection.size,
        'un_size' : union.size,
        'mean_score_enriched' : subframe[subframe.sequence_name.isin(enriched_subset)].score.mean(),
        'mean_score_unenriched' : subframe[~subframe.sequence_name.isin(enriched_subset)].score.mean()
    })
    
    
df = pd.DataFrame(data)
df['fraction_enriched'] = df['enriched_size'] / df['un_size']
df

Unnamed: 0,clique_idx,found_size,enriched_size,ix_size,un_size,mean_score_enriched,mean_score_unenriched,fraction_enriched
0,clique_0,526,464,439,551,34.887365,33.627006,0.842105
1,clique_1,773,169,154,788,35.923794,35.139414,0.214467
2,clique_10,2178,29,29,2178,57.971669,35.425186,0.013315
3,clique_100,4,5,4,5,53.325300,,1.000000
4,clique_101,7,5,4,8,37.407500,38.575000,0.625000
...,...,...,...,...,...,...,...,...
157,clique_92,96,6,6,96,42.743467,32.978567,0.062500
158,clique_94,5,6,5,6,51.927700,,1.000000
159,clique_95,69,6,6,69,43.479917,34.435081,0.086957
160,clique_97,14,5,4,15,34.176428,33.327816,0.333333


In [126]:
px.histogram(
    df, x = 'fraction_enriched', nbins = 45,
    histnorm = 'probability'

)

In [91]:
fig = go.Figure()

tr_e = go.Scatter(
    x = df.fraction_enriched, y = df.mean_score_enriched,
    mode = 'markers', 
    marker = dict(
        color = "red", size = df.un_size, 
        sizeref = 3 * df.un_size.max(), 
        sizemin = 10
    )
)

tr_u = go.Scatter(
    x = df.fraction_enriched, y = df.mean_score_unenriched,
    mode = 'markers', 
    marker = dict(
        color = "grey", size = df.un_size, 
        sizeref = 3 * df.un_size.max(), 
        sizemin=10  
    )
)

fig.add_trace(tr_e)
fig.add_trace(tr_u)

fig

# Plot Enrichment with Intervals

In [92]:
# subframe['enriched'] = subframe.sequence_name.isin(ix)

def plot_intervals(frame, clique_sset, clique_idx = "clique_0", meme_idx = "MEME-1"):
    
    
    subframe = frame[
        (frame.clique_idx == clique_idx) &
        (frame.motif_alt_id == meme_idx)
    ]
    
    subframe['enriched'] = subframe.sequence_name.isin(
        clique_sset[clique_sset.clique_idx == clique_idx].sequence_name
    )
    
    enrichment_frame = subframe[['sequence_name', 'enriched']].drop_duplicates()
    
    
    fig = make_subplots(
        rows = 2, cols = 1,
        vertical_spacing = 0.05, 
        subplot_titles = (
            "Unenriched Targets (n = {})".format(enrichment_frame.shape[0] - enrichment_frame.enriched.sum()), 
            "Enriched Targets (n = {})".format(enrichment_frame.enriched.sum())
        )
    )
    
    for line in subframe.iterrows():
        idx, row = line
    
        interval = np.arange(row.start, row.stop)
        tr = go.Scatter(
            x = interval,
            y = [row.sequence_name for _ in interval],
            mode = 'lines',
            line=dict(color = "red" if row.enriched else "black"),
#             line = dict(color = row.score),
            name = row.sequence_name
        )
        
        if row.enriched:
            row_add = 2
        else:
            row_add = 1
            
        fig.append_trace(tr, row = row_add, col = 1)
        

    fig.update_layout(
        height = 1000, showlegend=False,
        title_text = "Clique ({}) - Motif ({}) - Enrichment ({:.3f}) - n ({})".format(
            clique_idx, motif_idx, enrichment_frame.enriched.mean(), enrichment_frame.shape[0]
        )
    )
    
    return fig

plot_intervals(motif_search, clique_sset, clique_idx = "clique_10", meme_idx = "MEME-1")

# What drives enrichment?

In [97]:
clique_idx = "clique_10"

tf_10 = motif_search[(motif_search.clique_idx == clique_idx) & (motif_search.motif_alt_id)]
cs_10 = clique_sset[clique_sset.clique_idx == clique_idx]
tf_10['enriched'] = tf_10.sequence_name.isin(cs_10.sequence_name)

px.scatter(
    tf_10, x = 'p-value', y = 'score', color = 'enriched',
    opacity = 0.5, facet_col = 'enriched', log_x = True
)



In [104]:
def edit_distance(x):
    
    dist = 0
    for i, aa in enumerate(x.motif_id):
        if aa != x.matched_sequence[i]:
            dist += 1
    
    
    
    return dist / i



tf_10['edit_distance'] = tf_10.apply(lambda x : edit_distance(x), axis = 1)

px.scatter(
    tf_10, x = 'edit_distance', y = 'score',
    facet_col = 'enriched', opacity = 0.3
)

In [116]:
def positional_distance(x):
    pos = []
    
    for i, aa in enumerate(x.motif_id):
        if aa != x.matched_sequence[i]:
            pos.append(i)
    
    
    
    return pos


tf_10['pos_diff'] = tf_10.apply(lambda x : positional_distance(x), axis = 1)


data = []
for idx, row in tf_10.iterrows():
    
    for p in row.pos_diff:
        data.append({
            'enriched' : row.enriched,
            'edit_distance' : row.edit_distance,
            'position' : p
        })
    
    
pos_diff = pd.DataFrame(data)

In [122]:
px.histogram(
    pos_diff, x = 'position', color = 'enriched',
    histnorm = 'probability',
    facet_row = 'enriched'
)

In [125]:
tf_10[tf_10.pos_diff.apply(lambda x : 0 in x)]

Unnamed: 0,motif_id,motif_alt_id,sequence_name,start,stop,strand,score,p-value,q-value,matched_sequence,clique_idx,enriched,edit_distance,pos_diff
3215,RPQFLRWFTEWG,MEME-1,t35652,1,12,+,59.8571,1.480000e-19,1.340000e-14,TPQFLRWFTEWG,clique_10,False,0.090909,[0]
3216,RPQFLRWFTEWG,MEME-1,t54527,7,18,+,59.8571,1.480000e-19,1.340000e-14,TPQFLRWFTEWG,clique_10,False,0.090909,[0]
3217,RPQFLRWFTEWG,MEME-1,t20397,9,20,+,59.8571,1.480000e-19,1.340000e-14,TPQFLRWFTEWG,clique_10,False,0.090909,[0]
3218,RPQFLRWFTEWG,MEME-1,t64232,12,23,+,59.8571,1.480000e-19,1.340000e-14,TPQFLRWFTEWG,clique_10,False,0.090909,[0]
3219,RPQFLRWFTEWG,MEME-1,t55411,14,25,+,59.8571,1.480000e-19,1.340000e-14,TPQFLRWFTEWG,clique_10,True,0.090909,[0]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5397,TPTLEEFAK,MEME-3,t68136,54,62,+,28.6667,8.410000e-11,3.490000e-05,SPTLEKFAQ,clique_10,False,0.375000,"[0, 5, 8]"
5411,RPQFLRWFTEWG,MEME-1,t43979,13,24,+,12.3036,9.360000e-11,4.800000e-07,DDQFFRWFVEWG,clique_10,False,0.363636,"[0, 1, 4, 8]"
5412,RPQFLRWFTEWG,MEME-1,t42820,24,35,+,12.3036,9.360000e-11,4.800000e-07,EDQFSRWFKEWG,clique_10,False,0.363636,"[0, 1, 4, 8]"
5413,RPQFLRWFTEWG,MEME-1,t43978,38,49,+,12.3036,9.360000e-11,4.800000e-07,DDQFFRWFVEWG,clique_10,False,0.363636,"[0, 1, 4, 8]"
