# Load Modules

In [1]:
# IPython autoreload extension in mode 2: imported modules are reloaded before each cell runs
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from plotly.subplots import make_subplots
import networkx as nx
from multiprocessing import Pool
from tqdm import tqdm
import os
import glob

# Use the 'plotly_white' template as the default for every plotly figure
pio.templates.default = 'plotly_white'
# Setting this option to None turns off pandas' chained-assignment warning
pd.options.mode.chained_assignment = None

# Define Functions

In [2]:
def seqReader(fn):
    """
    Iterate through a sequence file and yield its records one at a time.

    The file name (not the directories leading to it) decides how the file
    is read:

    * a name containing 'gz' is opened as gzip-compressed text, anything
      else as plain text;
    * a name containing 'fastq' or 'fq' is read four lines per record,
      anything else two lines per record.

    Parameters
    ----------
    fn : str
        Path of the sequence file.

    Yields
    ------
    list of str
        The lines of one record with newlines stripped (2 or 4 items).
        A trailing, incomplete record is dropped.
    """
    # Local import: gzip is not among the imports at the top of the notebook,
    # so compressed input previously failed with a NameError.
    import gzip

    def openSeq(fn):
        # Sniff the file name only, so a directory called e.g. "gz_runs"
        # does not make a plain-text file look compressed.
        if 'gz' in os.path.basename(fn):
            return gzip.open(fn, 'rt')
        else:
            return open(fn, 'r')

    def num_iter(fn):
        # Same reasoning: only the file name decides the record length.
        name = os.path.basename(fn)
        if 'fastq' in name or 'fq' in name:
            return 4
        else:
            return 2

    n = num_iter(fn)

    with openSeq(fn) as f:
        while True:
            try:
                # next() raises StopIteration at end of file, which ends the
                # generator; a partially read record is discarded with it.
                yield [next(f).strip('\n') for _ in range(n)]
            except StopIteration:
                break

# Define Paths

## Peptide Metadata

In [3]:
# Tab-separated file with two fields per line; it is read later to build `idx_lookup`
fn_idxmap = "../data/meta/peptide_meta/idx_header_map.tab"

## Enrichment Data

In [4]:
# Enrichment run to analyse: its parameter tag and the directory holding its outputs
enrichment_params = "z10_c8"
enrichment_dir = "../data/enriched/TororoKanunguRound2/enrichments/"

# Boolean matrix and peptide list belonging to that parameter tag
input_bool, input_peptides = (
    os.path.join(enrichment_dir, enrichment_params + suffix)
    for suffix in ("_bool.csv", "_peptides.txt")
)

## Motif Data

In [28]:
# Root of the MCD-MotifSearch results, plus its motif and clique sub-directories
results_dir = "../results/mcdms_enriched/"
motif_dir, clique_dir = (
    os.path.join(results_dir, subdir) for subdir in ("motifs", "cliques")
)

# Input files, located relative to the directories above
conv_fn = os.path.join(results_dir, "{}.conv.txt".format(enrichment_params))
motif_seq_fn, motif_blast_fn = (
    os.path.join(motif_dir, name) for name in ("merged_motifs.fa", "motifs.blast.tab")
)
merged_cliques = os.path.join(clique_dir, "merged_cliques.tab")

## Output Figure Directory

In [29]:
# Directory that figure HTML files are written into
fn_figdir = "../figures/MCD"

# Load and Process Data

## Process Motifs

In [6]:
# Motif FASTA -> one (qseqid, sequence) row per record, header stripped of ">"
motif_records = [
    [header.strip('>'), sequence] for header, sequence in seqReader(motif_seq_fn)
]
motif_frame = pd.DataFrame(motif_records, columns=["qseqid", "sequence"])

# BLAST table (tab-separated)
motif_blast = pd.read_csv(motif_blast_fn, sep="\t")


def _lowest_evalue_hit(hits):
    """Return sseqid and evalue of the row with the smallest evalue in `hits`."""
    top = hits.sort_values('evalue').iloc[0]
    return pd.Series({'sseqid': top.sseqid, 'evalue': top.evalue})


# Keep, for every query motif, only its lowest-evalue hit
minimum_evalue = (
    motif_blast
    .groupby('qseqid')
    .apply(_lowest_evalue_hit)
    .reset_index()
)

# The clique index is the second "_"-separated token of the query id
minimum_evalue['clique_idx'] = (
    minimum_evalue.qseqid
    .map(lambda qid: qid.split("_")[1])
    .astype(int)
)

# Inner merge on the shared columns: motifs without a BLAST hit are dropped
motif_frame = motif_frame.merge(minimum_evalue)
motif_frame

Unnamed: 0,qseqid,sequence,sseqid,evalue,clique_idx
0,clique_0_MEME-0_1.4e-6639,VIPEELVEEVIP,eid_11776::11-1_polypeptide,0.480000,0
1,clique_0_MEME-2_5.5e-302,EVVEEVVPE,eid_11776::11-1_polypeptide,6.000000,0
2,clique_1_MEME-0_3.3e-1426,GYSGIDJIBDVL,eid_913933::RAP_protein,3.300000,1
3,clique_1_MEME-3_3.5e-125,VAKLTNSDP,eid_73821::Glycophorin-binding_protein,3.800000,1
4,clique_1_MEME-5_1.1e-022,RKENELF,eid_73540::PfEMP1_variant_1_of_strain_MC,0.640000,1
...,...,...,...,...,...
127,clique_90_MEME-0_5.4e-042,EPFPNQKHKDLD,eid_914157::glutamate-rich_protein_[Plasmodium...,0.000164,90
128,clique_93_MEME-0_3.3e-035,TNPCQLEYQWHT,eid_236266::erythrocyte_membrane_protein_1,0.000272,93
129,clique_94_MEME-0_3.4e-035,NIKGEDNIKNKG,eid_1068647::cytoadherence_linked_asexual_prot...,6.000000,94
130,clique_96_MEME-0_6.6e-025,NDTTHNSYTNKK,eid_141616::erythrocyte_membrane_protein_1,2.400000,96


## Create Conversion Table

In [7]:
# Build the conversion table from the space-separated file at `conv_fn`:
# the first field of each line maps to the second field.
# The file is opened read-only and closed as soon as parsing is done.
conv_dict = {}
with open(conv_fn, "r") as f:
    for line in f:
        fields = line.strip().split(' ')
        if fields == ['']:
            # skip blank lines (e.g. a trailing newline at the end of the file)
            continue
        conv_dict[fields[0]] = fields[1]

## Load Graph from Edgelist

In [8]:
# Read the merged clique edge list, then rename its nodes through `conv_dict`
G = nx.relabel_nodes(nx.read_edgelist(merged_cliques), conv_dict)

## Create Clique Lookup Table

In [9]:
def _clique_rows(fasta_path):
    """Yield one lookup row (clique_idx, header, seq) per record of a clique FASTA."""
    # file name -> text before the first "." -> second "_"-separated token
    stem = fasta_path.split("/")[-1].split(".")[0]
    idx = int(stem.split("_")[1])
    for header, seq in seqReader(fasta_path):
        yield {
            'clique_idx' : idx,
            "header" : header.strip(">"),
            "seq" : seq
        }


# Collect the rows of every file in the clique directory whose name ends in "fa"
clique_table = pd.DataFrame([
    row
    for fasta_path in glob.glob("{}/*fa".format(clique_dir))
    for row in _clique_rows(fasta_path)
])
clique_table

Unnamed: 0,clique_idx,header,seq
0,241,t86926,KEVLKLEFSKENIEEDKENNVSAREIDLINEMLKEDEKEQEEDDDA...
1,241,t176026,MSSHINLYDKDENINEDKDDDINENKDDDINENKDENINEDKDEDS...
2,147,t185485,SDKKLNSQSDKKLNSQSDKEINSQNDKEINSQSDKKLNSQSDKEIN...
3,147,t80872,VLSTDTDDSETDAEKTSNNSNTLHKKETPAMKYNMNIAQDEINRQN...
4,147,t80873,KETPAMKYNMNIAQDEINRQNDVSKNTTYAENNEYTSENITKPSDQ...
...,...,...,...
2389,50,t124144,YDGTFNVLNKENVQTTSQPKDDDDNGPNDPFYDGTFNVLNKENVQT...
2390,50,t124142,EETTSQPKDVDDNGPNDPFYDGTFNVLNKENVQTTSQPKDVDDNGP...
2391,50,t131353,TTKCLSNEDTLNVSNKENTTFMNTSMWNNENVQQNGEQYVQQNGEQ...
2392,248,t228923,KKKKGENVMDGNFEMNSADEKNNYMSNETKYNSRNFIYDFDHRNQD...


## Aggregate Clique Layouts

In [10]:
# Spring layout of G; the fixed seed keeps the node positions the same between runs
layout = nx.spring_layout(G, seed=42)

In [11]:
# One row per clique: number of distinct members and their mean layout position
clique_rows = []
for clique_id, members in clique_table.groupby('clique_idx'):
    unique_headers = set(members.header.values)

    # Stack the members' layout coordinates and average them column-wise
    positions = np.array([layout[node] for node in unique_headers])
    centroid_x, centroid_y = positions.mean(axis = 0)

    clique_rows.append(dict(
        clique_idx = clique_id,
        membership = len(unique_headers),
        agg_x = centroid_x,
        agg_y = centroid_y,
    ))

frame = pd.DataFrame(clique_rows)

In [12]:
group_cols = ['clique_idx', 'membership', 'agg_x', 'agg_y']


def nan_remover(x):
    """Cast the values of `x` to str and drop the ones that read 'nan'."""
    return np.array([i for i in x.astype(str) if i != 'nan'])


# The left merge keeps cliques without any motif; their motif columns are NaN
cliques_with_motifs = frame.merge(motif_frame, how = 'left')

motif_rows = []
for key, subframe in cliques_with_motifs.groupby(group_cols):
    hit_ids = nan_remover(subframe.sseqid.unique())

    # Start from the grouping key, then append the per-clique motif summaries
    row = dict(zip(group_cols, key))
    row['qseqid'] = nan_remover(subframe.qseqid.unique())
    row['sequences'] = nan_remover(subframe.sequence.unique())
    row['sseqid'] = hit_ids
    row['num_known'] = hit_ids.size
    motif_rows.append(row)

clique_motif_frame = pd.DataFrame(motif_rows).sort_values('num_known')
clique_motif_frame

Unnamed: 0,clique_idx,membership,agg_x,agg_y,qseqid,sequences,sseqid,num_known
274,274,2,0.087412,0.849061,[],[],[],0
122,122,4,0.810070,0.159737,[],[],[],0
123,123,4,-0.732529,-0.260718,[],[],[],0
124,124,4,0.244762,0.712304,[],[],[],0
213,213,2,-0.745668,-0.528340,[],[],[],0
...,...,...,...,...,...,...,...,...
11,11,27,0.181906,0.252555,"[clique_11_MEME-0_1.5e-339, clique_11_MEME-1_2...","[QHEIVEVEEILP, EDKNEKV, HTEQLDLDHKTV]","[eid_469456::glutamate-rich_protein, eid_91343...",3
18,18,22,0.250668,-0.191435,"[clique_18_MEME-0_8.1e-803, clique_18_MEME-1_1...","[PNANPNANPNAN, PNANPNA, RKPKHKKLKQPG, QGNGQGHNM]",[eid_756411::Circumsporozoite_protein_precurso...,4
20,20,20,-0.088426,-0.414727,"[clique_20_MEME-0_3.6e-104, clique_20_MEME-1_4...","[EEEDEEDIEEEN, QQSEKKSISKVD, EKELSNQ, ETNDTEDT...",[eid_1067086::mature_parasite-infected_erythro...,4
1,1,120,-0.139762,0.024525,"[clique_1_MEME-0_3.3e-1426, clique_1_MEME-3_3....","[GYSGIDJIBDVL, VAKLTNSDP, RKENELF, WLDRHRD]","[eid_913933::RAP_protein, eid_73821::Glycophor...",4


# Plot Cliques and Motifs

## Aggregate Clique Layout + Number of found motifs

In [31]:
# Flatten the per-clique arrays into comma-separated strings for the hover text
clique_motif_frame['joined_sequences'] = [
    ','.join(x) for x in clique_motif_frame.sequences
]

# Keep the part of each hit id after "::" and drop the organism tag
clique_motif_frame['joined_epitopes'] = [
    ','.join(
        i.split("::")[-1].replace("[Plasmodium_falciparum_3D7]", "") for i in x
    )
    for x in clique_motif_frame.sseqid
]

fig = go.Figure()

# One marker per clique at its aggregated layout position:
# marker area follows membership, colour follows the number of known motifs.
tr = go.Scatter(
    x = clique_motif_frame.agg_x,
    y = clique_motif_frame.agg_y,
    mode = 'markers',
    marker = dict(
        size = clique_motif_frame.membership,
        color = clique_motif_frame.num_known,
        sizemode = 'area',
        # NOTE(review): sizeref is derived from num_known although the marker
        # size is membership - confirm this scaling is intended.
        sizeref=100*max(clique_motif_frame.num_known)/(40.**2),
        sizemin = 3,
        # a different marker symbol for cliques without any known motif
        symbol = [0 if x > 0 else 100 for x in clique_motif_frame.num_known],
        colorbar = dict(title = 'known motifs'),
        colorscale = ["#f5d60f", "#C70039"]
    ),
    customdata = clique_motif_frame[['clique_idx', 'joined_sequences', 'joined_epitopes']].values,
    # the unmatched "</b>" that followed the motif list has been removed
    hovertemplate =
        '<b>Clique Number</b> : %{customdata[0]}<br>' +
        "<b>Membership</b> : %{marker.size}<br>" +
        '<b>Motifs</b> : %{customdata[1]}<br>' +
        "<extra><b>Known Epitopes</b> : %{customdata[2]}<br></extra>",
)

fig.add_trace(tr)


fig.update_layout(
    height = 1000, width = 1000,
    hoverlabel = dict(font=dict(color='white'))
)

# Create the figure directory if needed so a fresh checkout can run this cell
os.makedirs(fn_figdir, exist_ok=True)
fig.write_html(os.path.join(fn_figdir, "clique_scatter.html"))
fig

## Distribution of Number of Found Motifs

In [32]:
# Per-clique summary columns, de-duplicated
summary_cols = ['clique_idx', 'membership', 'num_known']
known_frame = clique_motif_frame[summary_cols].drop_duplicates()

# How many cliques fall under each number of known motifs
known_hist = (
    known_frame
    .groupby('num_known')
    .size()
    .reset_index(name='counts')
)

px.bar(known_hist, x='num_known', y='counts')

## Membership size of nodes with number of motifs found

In [15]:
# Clique membership (log-scaled y axis) for each number of known motifs
fig = px.box(
    known_frame,
    x='num_known',
    y='membership',
    hover_name='clique_idx',
    log_y=True,
)

fig.update_layout(height=1000)

In [16]:
# Cliques with zero known motifs, as a count and as a share of all cliques
has_no_known = known_frame.num_known.astype(int) == 0
num_unknown_motifs = has_no_known.sum()
total_cliques = known_frame.clique_idx.unique().size
unknown_fraction = num_unknown_motifs / total_cliques

print(
    "Total Unknown Clique Motifs : {} / {} ( {:.3f} )".format(
        num_unknown_motifs, total_cliques, unknown_fraction
    )
)

Total Unknown Clique Motifs : 178 / 275 ( 0.647 )


# Motif Distribution

In [17]:
import scipy


def _parse_motif_record(header, sequence):
    """Split a motif header on "_" into clique index, motif id and e-value."""
    _prefix, clique_token, motif_token, evalue_token = header.split("_")
    return {
        "clique_idx" : int(clique_token),
        "motif_id" : motif_token,
        "evalue" : float(evalue_token),
        "seq" : sequence
    }


# One row per motif in the FASTA, joined with the membership of its clique
motif_seq_frame = pd.DataFrame(
    [_parse_motif_record(h, s) for h, s in seqReader(motif_seq_fn)]
).merge(clique_motif_frame[["clique_idx", "membership"]])
motif_seq_frame

Unnamed: 0,clique_idx,motif_id,evalue,seq,membership
0,0,MEME-0,0.000000e+00,VIPEELVEEVIP,442
1,0,MEME-1,0.000000e+00,VVEEVVPEELVE,442
2,0,MEME-2,5.500000e-302,EVVEEVVPE,442
3,0,MEME-3,3.800000e-56,EELVEEV,442
4,0,MEME-4,3.500000e-51,PEELVEE,442
...,...,...,...,...,...
328,94,MEME-0,3.400000e-35,NIKGEDNIKNKG,5
329,95,MEME-0,1.300000e-10,NNDNMND,5
330,96,MEME-0,6.600000e-25,NDTTHNSYTNKK,5
331,97,MEME-0,1.800000e-38,DNIEVNNTL,5


## Correlation of Membership and Discovered Motifs

In [18]:
# Number of motif rows per clique, with the clique's membership kept alongside
n_motifs_frame = (
    motif_seq_frame
    .groupby(["clique_idx", "membership"])
    .size()
    .reset_index(name="n_motifs")
)

px.scatter(n_motifs_frame, x='membership', y="n_motifs", log_x=True)

## Distribution of E-Value with Motif Rank

In [19]:
# E-value per motif id on a log-scaled y axis, with every point drawn
px.box(
    motif_seq_frame,
    x='motif_id',
    y='evalue',
    log_y=True,
    points='all',
)

# Response Rate

## Create Patient Responses

In [20]:
# Load the CSV at `input_bool` into a DataFrame
bool_mat = pd.read_csv(input_bool)

In [21]:
# Map the second tab-separated field of each line in `fn_idxmap` to the first.
# Files are opened read-only inside `with` so the handles are closed promptly.
idx_lookup = {}
with open(fn_idxmap, "r") as idxmap_file:
    for line in idxmap_file:
        idx, name = line.strip().split("\t")
        idx_lookup[name] = idx

# Translate every peptide line through the lookup, skipping lines that contain "#"
peptide_index = []
with open(input_peptides, "r") as peptide_file:
    for line in peptide_file:
        if "#" in line:
            continue

        peptide_index.append(idx_lookup[line.strip()])

# Label the rows of the boolean matrix with the translated peptide ids
bool_mat.index = peptide_index

In [22]:
# One row per (column, index label) pair where the boolean matrix equals 1
patient_frame = pd.DataFrame([
    {'cid' : sample_id, 'header' : peptide}
    for sample_id in bool_mat.columns
    for peptide in bool_mat.index.values[bool_mat.loc[:, sample_id] == 1]
])

# Attach the clique of each header (inner merge on the shared 'header' column)
patient_frame = patient_frame.merge(clique_table[['clique_idx', 'header']])

patient_frame

Unnamed: 0,cid,header,clique_idx
0,CK3-4F42,t152479,9
1,CT3-4CX7,t152479,9
2,CT3-5779,t152479,9
3,CT3-5A4B,t152479,9
4,CT3-4CCQ,t152479,9
...,...,...,...
59395,CK3-5FAC,t146905,146
59396,CK3-9BFT,t146905,146
59397,CK3-DUL2,t146905,146
59398,CK3-G5NH,t146905,146


In [23]:
# Number of distinct cids observed for each clique
number_responses = (
    patient_frame
    .groupby('clique_idx')['cid']
    .apply(lambda cids : cids.unique().size)
    .reset_index(name='num_patients')
)

## Clique Motifs with Patient Responses

In [24]:
# Add the per-clique patient counts to the clique/motif table (merge on shared columns)
clique_response_frame = clique_motif_frame.merge(number_responses)

# Clique positions sized by membership and coloured by the number of responding
# patients, with one facet per number of known motifs
fig = px.scatter(
    clique_response_frame, x = 'agg_x', y = 'agg_y', size = 'membership',
    size_max = 50, hover_name='clique_idx', hover_data=['qseqid', 'sequences', 'sseqid'],
    color = 'num_patients', color_continuous_scale="OrRd",
    facet_col="num_known",facet_col_wrap=2
)

fig.update_layout(
    height = 1000, width = 1000
)

# Write into the notebook's configured figure directory, like the other figures,
# instead of a separate hard-coded path; create the directory if needed.
os.makedirs(fn_figdir, exist_ok=True)
fig.write_html(os.path.join(fn_figdir, "patient_responses.html"))

fig

## Correlation of number of known motifs against patient recognition

In [33]:
# Per-clique summary columns, de-duplicated
response_cols = ['clique_idx', 'membership', 'num_known', 'num_patients']
clique_responses = clique_response_frame[response_cols].drop_duplicates()

# Known-motif count against responding patients; marker size follows membership
fig = px.scatter(
    clique_responses,
    x='num_known',
    y='num_patients',
    color='num_known',
    size='membership',
    size_max=50,
    hover_name="clique_idx",
)

fig.write_html(os.path.join(fn_figdir, "motif_confidence.html"))

fig

In [26]:
# Derive the site from the id prefix. A substring test ('CT' in x) would also
# match ids that merely contain "CT" later in the string.
# NOTE(review): presumes Tororo ids start with "CT" and all others are Kanungu - confirm.
patient_frame['location'] = patient_frame.cid.apply(
    lambda x : 'Tororo' if x.startswith('CT') else 'Kanungu'
)

# Distinct responding patients per (location, clique)
patient_response_by_region = (
    patient_frame
    .groupby(['location', 'clique_idx'])['cid']
    .nunique()
    .reset_index(name='num_patients')
)

# Wide table: one row per clique, one column per location
patient_response_by_region = pd.pivot_table(
    patient_response_by_region, index = 'clique_idx',
    columns = 'location', values = 'num_patients'
).reset_index()

# A clique with no responder at a site has NaN there; treat it as 0 for the
# comparison only. Comparing against NaN is always False, which labelled
# Kanungu-only cliques as 'T'. Ties still resolve to 'T'.
region_counts = patient_response_by_region[['Kanungu', 'Tororo']].fillna(0)
patient_response_by_region['leaning'] = np.where(
    region_counts['Kanungu'] > region_counts['Tororo'], 'K', 'T'
)

patient_response_by_region

location,clique_idx,Kanungu,Tororo,leaning
0,0,42.0,67.0,T
1,1,15.0,32.0,T
2,2,29.0,48.0,T
3,3,77.0,73.0,K
4,4,54.0,20.0,K
...,...,...,...,...
270,270,,9.0,T
271,271,7.0,10.0,T
272,272,1.0,11.0,T
273,273,6.0,3.0,K


## Regional Observations of Cliques

In [34]:
# Clique positions sized by membership and coloured by number of known motifs,
# with one facet row per regional leaning.
# NOTE(review): 'num_known' is numeric, so plotly express presumably applies a
# continuous colour axis and ignores color_discrete_sequence - confirm the palette.
fig = px.scatter(
    clique_response_frame.merge(patient_response_by_region),
    x = 'agg_x', y = 'agg_y', size = 'membership',
    size_max = 50, hover_name='clique_idx', hover_data=['qseqid', 'sequences', 'sseqid'],
    facet_row = 'leaning',
    color = 'num_known',
    color_discrete_sequence=['#06021c', '#dcb825', '#dc5c25', '#dc2549']
)

# Single layout update (the second, identical height/width call was redundant)
fig.update_layout(
    height = 1000, width = 1000,
    coloraxis_colorbar=dict(
        tickvals = [0,1,2,3]
    )
)

# Write into the configured figure directory rather than the working directory
os.makedirs(fn_figdir, exist_ok=True)
fig.write_html(os.path.join(fn_figdir, "regional_clique_enrichment.html"))

fig