# Load Modules

In [100]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from plotly.subplots import make_subplots
from multiprocessing import Pool
from tqdm import tqdm
import networkx as nx
import sys

# Use the plain white plotly template for every figure created below.
pio.templates.default = 'plotly_white'
# Turn off pandas' chained-assignment check (no SettingWithCopyWarning is emitted).
pd.options.mode.chained_assignment = None

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load Metadata

In [101]:
def load_meta(meta_dir="../data/meta/"):
    """
    Load the peptide metadata table and add a ``gene_product`` column.

    Parameters
    ----------
    meta_dir : str, optional
        Directory that contains ``meta_frame.tab.gz``. Defaults to the
        notebook-relative ``../data/meta/`` location; a trailing slash is
        optional.

    Returns
    -------
    pandas.DataFrame
        The tab-separated table read from ``meta_frame.tab.gz`` with an extra
        ``gene_product`` column derived from the ``geneID`` column.
    """
    # Local import: the notebook's import cell does not bring in `os`.
    import os

    def an_geneProduct(g):
        """
        Derive a gene product label from a single ``geneID`` string.

        * no ``|`` in the id: the id is returned unchanged;
        * ``|``-delimited id mentioning ``gene_product``: the text after the
          first ``=`` of the fifth field;
        * any other ``|``-delimited id: the text before the first ``=`` of
          the third field.
        """
        if '|' not in g:
            return g
        fields = g.split('|')
        if 'gene_product' in g:
            return fields[4].split('=')[1]
        return fields[2].split('=')[0]

    print(
        "loading peptide meta data...",
        file=sys.stderr
        )

    meta = pd.read_csv(
        os.path.join(meta_dir, "meta_frame.tab.gz"), sep="\t"
    )

    # Only one column is needed, so map over that Series instead of building
    # a row object for every record with DataFrame.apply(axis=1).
    meta['gene_product'] = meta['geneID'].map(an_geneProduct)

    return meta
# Read the metadata once at notebook level; the bare name renders the frame.
meta = load_meta()
meta

loading peptide meta data...


Unnamed: 0,target_num,target,geneID,groupID,classID,sequence,aaSeq,gene_product
0,t0,PF3D7_0628100.1-p1|transcript=PF3D7_0628100.1|...,PF3D7_0628100.1-p1|transcript=PF3D7_0628100.1|...,PlasmoDB-33_Pfalciparum3D7_AnnotatedProteins_k...,PF,ATGAAGGATTTGTATTCTCTTTATGAAGATATCAGTGAAAACGACA...,MKDLYSLYEDISENDKIIISEKLKNLLNYFQDKNKSYVSEFVVVSR...,"HECT-domain_(ubiquitin-transferase),_putative"
1,t1,PF3D7_0628100.1-p1|transcript=PF3D7_0628100.1|...,PF3D7_0628100.1-p1|transcript=PF3D7_0628100.1|...,PlasmoDB-33_Pfalciparum3D7_AnnotatedProteins_k...,PF,TTGCTCAATTATTTTCAAGACAAGAACAAGAGCTATGTCTCGGAGT...,LLNYFQDKNKSYVSEFVVVSRTRFFKSISNYGEFLLLQSSSRVISS...,"HECT-domain_(ubiquitin-transferase),_putative"
2,t2,PF3D7_0628100.1-p1|transcript=PF3D7_0628100.1|...,PF3D7_0628100.1-p1|transcript=PF3D7_0628100.1|...,PlasmoDB-33_Pfalciparum3D7_AnnotatedProteins_k...,PF,AAGTCGATATCAAATTACGGTGAGTTTCTGCTTTTGCAGAGTTCAA...,KSISNYGEFLLLQSSSRVISSYEHILRLLHQAKVYLEFVRCLKLNN...,"HECT-domain_(ubiquitin-transferase),_putative"
3,t3,PF3D7_0628100.1-p1|transcript=PF3D7_0628100.1|...,PF3D7_0628100.1-p1|transcript=PF3D7_0628100.1|...,PlasmoDB-33_Pfalciparum3D7_AnnotatedProteins_k...,PF,TTGAGGCTATTGCACCAGGCAAAGGTCTATCTGGAATTTGTCCGCT...,LRLLHQAKVYLEFVRCLKLNNCSVIDNEKYEEEISKFINYLESKRK...,"HECT-domain_(ubiquitin-transferase),_putative"
4,t4,PF3D7_0628100.1-p1|transcript=PF3D7_0628100.1|...,PF3D7_0628100.1-p1|transcript=PF3D7_0628100.1|...,PlasmoDB-33_Pfalciparum3D7_AnnotatedProteins_k...,PF,GACAATGAGAAATATGAAGAAGAAATTAGCAAGTTCATCAACTATC...,DNEKYEEEISKFINYLESKRKKYYYHICQILIKFIVSSLSDNTIDN...,"HECT-domain_(ubiquitin-transferase),_putative"
...,...,...,...,...,...,...,...,...
238063,t238063,tr|G0WJE6|G0WJE6_EBVG_Latent_membrane_protein_...,tr|G0WJE6|G0WJE6_EBVG_Latent_membrane_protein_...,EBV_KK,VIR,GCTGCAGCGGCGGCTGAGGTAGAGAATAAAGGAGGTGACCGGGATC...,AAAAAEVENKGGDRDPPSMTDGGGGDPHLPTLLLGTFGSRGDDDDP...,G0WJE6_EBVG_Latent_membrane_protein_1(LMP-1)_c...
238064,t238064,tr|G0WJE0|G0WJE0_EBVG_Latent_membrane_protein_...,tr|G0WJE0|G0WJE0_EBVG_Latent_membrane_protein_...,EBV_KK,VIR,GCTGCGGCAGCTGCCGAGGTAGAGAATAAAGGCGGTGATCGTGACC...,AAAAAEVENKGGDRDPPSMTDGGGGDPHLPTLLLGTPDSGGDDDDP...,G0WJE0_EBVG_Latent_membrane_protein_1(LMP-1)_c...
238065,t238065,tr|Q9YPF9|Q9YPF9_EBVG_Reacc_protein_OS=Epstein...,tr|Q9YPF9|Q9YPF9_EBVG_Reacc_protein_OS=Epstein...,EBV_KK,VIR,GCTGCTGCAGCGGCTATGGGAATACTGTGCGTCATGTTAACTGGTG...,AAAAAMGILCVMLTGESLYCSPDSGGGGNGKSYTLLIFFDTTFNCY...,Q9YPF9_EBVG_Reacc_protein_OS
238066,t238066,PFL0030c_D1_seq1,PFL0030c_D1,3D7_var-PfEMP1_sequences_ST,PF,GCGGCAGCCGCCGCGGCAATGGACAAGTCTTCGATCGCGAACAAAA...,AAAAAAMDKSSIANKIEAYLGAKSDDSKIDQSLKADPSEVQYYGSG...,PFL0030c_D1


# Load Graph

In [147]:
def prepare_graph(fn, header_nodes=('node1', 'node2'), max_distance=7):
    """
    Read a tab-separated edge file as both a DataFrame and a networkx graph.

    Parameters
    ----------
    fn : str
        Path to a tab-separated file with three columns (node, node,
        distance). The first line is skipped as a header by the pandas read.
    header_nodes : iterable, optional
        Node names that networkx creates from the header line; they are
        dropped from the graph. Defaults to ``('node1', 'node2')``.
    max_distance : int, optional
        Constant used to rescale the raw distance. Defaults to 7.

    Returns
    -------
    edge_list : pandas.DataFrame
        Columns ``source``, ``target``, ``distance`` holding the raw values.
    G : networkx.Graph
        Graph whose edges carry the rescaled integer ``distance`` attribute.
    """
    edge_list = pd.read_csv(
        fn, sep="\t", names=['source', 'target', 'distance'], skiprows=1
    )

    # read_edgelist has no header option, so the header line is parsed as an
    # ordinary edge; reading the distance as str keeps that line from failing.
    G = nx.read_edgelist(
        fn, delimiter="\t", data=[('distance', str)]
    )

    # Drop the pseudo-nodes that came from the header. remove_nodes_from
    # silently ignores names that are absent from the graph.
    G.remove_nodes_from(header_nodes)

    # NOTE(review): int() truncates the ratio, so every raw distance from 1
    # up to max_distance becomes 0 and only a raw distance of 0 becomes 1.
    # Confirm that a binary weight (rather than a fractional one) is intended.
    for _, _, attrs in G.edges(data=True):
        attrs['distance'] = int(
            (max_distance - int(attrs['distance'])) / max_distance
        )

    return edge_list, G

# Edge file to analyse, given relative to the notebook's working directory.
fn = "../results/aa1000_md1_8.tab"
edge_list, G = prepare_graph(fn)
# Bare name renders the raw edge table.
edge_list

Unnamed: 0,source,target,distance
0,t100281,t103012,1
1,t100281,t10821,1
2,t100281,t122975,1
3,t100281,t92185,1
4,t100458,t177697,1
...,...,...,...
3046,t99855,t202855,0
3047,t99855,t221048,1
3048,t99855,t28139,1
3049,t99855,t43125,1


# Cluster

In [148]:
from sklearn.cluster import Birch, SpectralClustering

# nx.to_numpy_matrix was removed in networkx 3.0 and returned the deprecated
# np.matrix type; to_numpy_array gives a plain ndarray whose rows/columns
# follow the order of G.nodes().
# NOTE(review): the default edge-data key is 'weight', so the 'distance'
# attribute is not used here and every edge contributes 1 -- confirm intended.
adjacency_matrix = nx.to_numpy_array(G)

# Fixed random_state so the cluster labels are reproducible across re-runs.
b = SpectralClustering(affinity = 'precomputed', random_state = 0)
cluster_labels = b.fit_predict(adjacency_matrix)


Graph is not fully connected, spectral embedding may not work as expected.



# Plot Network

## Plotting Functions

In [149]:
def plot_network(graph, highlight=(0, 1, 2, 3, 13), seed=None):
    """
    Draw a graph as a plotly scatter figure using a spring layout.

    Parameters
    ----------
    graph : networkx graph
        Graph to draw; it is not modified.
    highlight : iterable, optional
        Nodes drawn in red; every other node is black. Defaults to the
        node ids ``0, 1, 2, 3, 13``.
    seed : int or None, optional
        Forwarded to ``nx.spring_layout`` so positions can be reproduced.
        ``None`` keeps the layout randomly initialised.

    Returns
    -------
    plotly.graph_objects.Figure
        A 1000x1000 figure holding the node marker trace followed by the
        edge line trace.
    """
    layout = nx.spring_layout(graph, seed=seed)

    edge_x = []
    edge_y = []
    for edge in graph.edges():
        x0, y0 = layout[edge[0]]
        x1, y1 = layout[edge[1]]
        # The trailing None breaks the line so separate edges are not joined.
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    node_list = list(graph.nodes())

    # reshape keeps an (N, 2) array even for an empty graph, where
    # np.vstack would raise on an empty sequence.
    node_positions = np.array(
        [layout[n] for n in node_list], dtype=float
    ).reshape(-1, 2)

    highlighted = set(highlight)
    node_colors = [
        'red' if n in highlighted else 'black' for n in node_list
    ]

    node_trace = go.Scatter(
        x = node_positions[:,0],
        y = node_positions[:,1],
        mode = 'markers',
        marker = dict(color = node_colors)
    )

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.25, color='#888'),
        mode='lines'
    )

    fig = go.Figure()
    fig.add_trace(node_trace)
    fig.add_trace(edge_trace)

    fig.update_layout(height = 1000, width = 1000)

    return fig

def plot_network_distance(G, cluster_labels, seed=None):
    """
    Draw a graph with a distance-weighted spring layout, coloured by cluster.

    Parameters
    ----------
    G : networkx graph
        Graph whose edges all carry a ``distance`` attribute; it is used as
        the layout weight and as the edge hover text.
    cluster_labels : sequence
        One label per node, in the order of ``G.nodes()``; used as marker
        colour and node hover text.
    seed : int or None, optional
        Forwarded to ``nx.spring_layout`` so positions can be reproduced.
        ``None`` keeps the layout randomly initialised.

    Returns
    -------
    plotly.graph_objects.Figure
        A 1000x1000 figure holding the node marker trace followed by the
        edge line trace.

    Raises
    ------
    KeyError
        If an edge has no ``distance`` attribute.
    """
    layout = nx.spring_layout(G, weight = 'distance', seed=seed)

    edge_x = []
    edge_y = []
    edge_distance = []
    for u, v, attrs in G.edges(data=True):
        x0, y0 = layout[u]
        x1, y1 = layout[v]
        dist = attrs['distance']
        # The trailing None breaks the line so separate edges are not joined.
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        # Hover text is matched to points by position, and each edge adds
        # three points, so the label is repeated to stay on its own edge.
        edge_distance.extend([dist, dist, None])

    node_list = list(G.nodes())

    # reshape keeps an (N, 2) array even for an empty graph, where
    # np.vstack would raise on an empty sequence.
    node_positions = np.array(
        [layout[n] for n in node_list], dtype=float
    ).reshape(-1, 2)

    node_trace = go.Scatter(
        x = node_positions[:,0],
        y = node_positions[:,1],
        mode = 'markers',
        marker = dict(color = cluster_labels),
        hovertext=cluster_labels
    )

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.25, color='#888'),
        hovertext=edge_distance,
        mode='lines'
    )

    fig = go.Figure()
    fig.add_trace(node_trace)
    fig.add_trace(edge_trace)

    fig.update_layout(height = 1000, width = 1000)

    return fig

def plot_network_gene_product(graph, meta, seed=None):
    """
    Draw a graph coloured by neighbour count, with gene products on hover.

    Parameters
    ----------
    graph : networkx graph
        Graph to draw; it is not modified.
    meta : pandas.DataFrame
        Table with ``target_num`` and ``gene_product`` columns; node ids are
        looked up in ``target_num`` to find their hover label.
    seed : int or None, optional
        Forwarded to ``nx.spring_layout`` so positions can be reproduced.
        ``None`` keeps the layout randomly initialised.

    Returns
    -------
    plotly.graph_objects.Figure
        A 1000x1000 figure (legend hidden) holding the node marker trace
        followed by the edge line trace.
    """
    gp_lookup = dict(zip(meta['target_num'], meta['gene_product']))

    layout = nx.spring_layout(graph, seed=seed)

    edge_x = []
    edge_y = []
    for edge in graph.edges():
        x0, y0 = layout[edge[0]]
        x1, y1 = layout[edge[1]]
        # The trailing None breaks the line so separate edges are not joined.
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    node_list = list(graph.nodes())

    # reshape keeps an (N, 2) array even for an empty graph, where
    # np.vstack would raise on an empty sequence.
    node_positions = np.array(
        [layout[n] for n in node_list], dtype=float
    ).reshape(-1, 2)

    # Number of neighbours per node; len(graph[n]) counts a self-loop once,
    # like len(list(graph.neighbors(n))).
    node_colors = [len(graph[n]) for n in node_list]

    # Nodes missing from the metadata fall back to their own id instead of
    # aborting the whole plot with a KeyError.
    node_hover = [gp_lookup.get(n, str(n)) for n in node_list]

    node_trace = go.Scatter(
        x = node_positions[:,0],
        y = node_positions[:,1],
        mode = 'markers',
        marker = dict(color = node_colors, colorbar = dict(title = "Number of Neighbors")),
        hovertext=node_hover
    )

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.25, color='#888'),
        mode='lines'
    )

    fig = go.Figure()
    fig.add_trace(node_trace)
    fig.add_trace(edge_trace)

    fig.update_layout(height = 1000, width = 1000, showlegend=False)

    return fig

## Plot Network w/ Clusters

In [150]:
# Full graph: marker colour is the neighbour count, hover shows the gene product.
plot_network_gene_product(G, meta)

In [151]:
# Enumerate the maximal cliques of G and show how their sizes are distributed.
cliques = [*nx.find_cliques(G)]
clique_sizes = list(map(len, cliques))
px.histogram(clique_sizes)

In [155]:
def create_new_g_obj(graph, nodes):
    """
    Build an independent ``nx.Graph`` from the subgraph induced by ``nodes``.

    Only the node ids and the edge endpoints of the induced subgraph are
    copied into the new graph.
    """
    induced = graph.subgraph(nodes)

    standalone = nx.Graph()
    standalone.add_nodes_from(induced.nodes())
    standalone.add_edges_from(induced.edges())
    return standalone

def clique_minimization(G):
    """
    Greedily split a graph into cliques, largest first.

    Works on a copy of ``G``: repeatedly picks the largest maximal clique
    among the remaining nodes, stores it as a standalone graph (via
    ``create_new_g_obj``) and removes its nodes, until no nodes are left.

    Parameters
    ----------
    G : networkx.Graph
        Input graph; it is not modified.

    Returns
    -------
    list
        The extracted clique graphs, in extraction order.
    """
    remaining = G.copy()
    subgraphs = []

    while remaining.number_of_nodes() > 0:
        # max() returns the first clique of maximal size, the same tie-break
        # as np.argmax over the size list, without storing every clique.
        largest_clique = max(nx.find_cliques(remaining), key=len)
        subgraphs.append(create_new_g_obj(remaining, largest_clique))
        remaining.remove_nodes_from(largest_clique)

    return subgraphs

In [153]:
def ConcatenateGraphs(subgraphs):
    """
    Merge several graphs into one new ``nx.Graph``.

    The node ids and edge endpoints of each graph in ``subgraphs`` are added
    to a fresh graph in turn, and that graph is returned.
    """
    merged = nx.Graph()

    for part in subgraphs:
        merged.add_nodes_from(part.nodes())
        merged.add_edges_from(part.edges())

    return merged

# `subgraphs` does not appear to be assigned by any earlier cell, so compute
# it here; otherwise this cell fails on a fresh kernel (Restart & Run All).
subgraphs = clique_minimization(G)
all_sg = ConcatenateGraphs(subgraphs)

In [154]:
# Same plot for the graph rebuilt from the extracted clique subgraphs.
plot_network_gene_product(all_sg, meta)