In [1]:
import os, sys
sys.path.append(os.getcwd() + "/../src")

%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import util
import networkx as nx
import handl
import random

In [3]:
homologs_path = "../data/homologs/sc-sp/sp-sc-homologs.txt" 
sp_ppi_path = "../data/ppi/biogrid/sp/sp-biogrid.v3.4.157-ppi-std.tsv" 
sc_ppi_path = "../data/ppi/biogrid/sc/sc-biogrid.v3.4.157-ppi-std.tsv" 
sc_G = util.simple_two_core(nx.read_edgelist(sc_ppi_path, encoding='ascii'), verbose=False)
sp_G = util.simple_two_core(nx.read_edgelist(sp_ppi_path, encoding='ascii'), verbose=False)
homologs = handl.homologs_in_graphs(sp_G, sc_G, util.read_homolog_list(homologs_path))


In [4]:
random.shuffle(homologs)
sp_homs, sc_homs = zip(*homologs)
sp_nodes, sc_nodes = list(sorted(sp_G.nodes())), list(sorted(sc_G.nodes()))

In [5]:
n_landmarks = 400

In [6]:
def load_sls(fp):
    return pd.read_csv(fp, sep='\t', dtype={'Gene A': str, 'Gene B': str, 'Score': float})
def pair2outcome(df):
    INCONCLUSIVE = 'Inconclusive'
    return dict( (frozenset([u, v]), c) for u, v, c in zip(df['Gene A'], df['Gene B'], df['Category']) if c != INCONCLUSIVE )

In [7]:
def summary_statistics(homs, pair2outcome, nodes, exclude=set()):
    assert(isinstance(homs, set))
    SL = 'SL'
    NON_SL = 'Non-SL'
    n_self_pairs = 0
    n_0hom_SLs, n_0hom_non_SLs = 0, 0
    n_1hom_SLs, n_1hom_non_SLs = 0, 0
    n_2hom_SLs, n_2hom_non_SLs = 0, 0
    for pair, outcome in pair2outcome.items():
        pairs = [p for p in pair]
        if len(pairs) != 2: 
            n_self_pairs += 1
            continue
        p1, p2 = pairs
        
        # skip if ps are not in nodes or are in exclude set
        if (p1 not in nodes or p2 not in nodes): continue
        if (p1 in exclude or p2 in exclude): continue
        
        if p1 in homs and p2 in homs:
            if outcome == SL: n_2hom_SLs += 1
            else: n_2hom_non_SLs += 1
        elif (p1 in homs) ^ (p2 in homs):
            if outcome == SL: n_1hom_SLs += 1
            else: n_1hom_non_SLs += 1
        else:
            if outcome == SL: n_0hom_SLs += 1
            else: n_0hom_non_SLs += 1
    print('Self pairs:', n_self_pairs)
    return n_0hom_SLs, n_0hom_non_SLs,\
            n_1hom_SLs, n_1hom_non_SLs,\
            n_2hom_SLs, n_2hom_non_SLs,\
            n_0hom_SLs + n_1hom_SLs + n_2hom_SLs, \
            n_0hom_non_SLs + n_1hom_non_SLs + n_2hom_non_SLs,
        

### Collins
Collins SL data

In [8]:
#print(len(collins_pair2outcome))
collins_fp = '../data/gi/collins/sc/collins-sc-emap-gis-std.tsv'
collins_df = load_sls(collins_fp)
collins_pair2outcome = pair2outcome(collins_df)
collins_results = list(zip(summary_statistics(set(sc_homs), collins_pair2outcome, set(sc_nodes)), 
                      ['# 0 hom SLs', '# 0 hom non SLs','# 1 hom SLs',
                       '#1 hom non SLs','# 2 hom SLs', '#2 hom non SLs',
                       '# total SLs', '# total non SLs']))

for r in collins_results: print(r)

Self pairs: 2
(4125, '# 0 hom SLs')
(81806, '# 0 hom non SLs')
(2545, '# 1 hom SLs')
(36605, '#1 hom non SLs')
(442, '# 2 hom SLs')
(3862, '#2 hom non SLs')
(7112, '# total SLs')
(122273, '# total non SLs')


Collins SL Data minus 400 landmarks

In [9]:
#print(len(collins_pair2outcome))
collins_fp = '../data/gi/collins/sc/collins-sc-emap-gis-std.tsv'
collins_df = load_sls(collins_fp)
collins_pair2outcome = pair2outcome(collins_df)
collins_results = list(zip(summary_statistics(set(sc_homs), collins_pair2outcome, set(sc_nodes), set(sc_homs[:400])), 
                      ['# 0 hom SLs', '# 0 hom non SLs','# 1 hom SLs',
                       '#1 hom non SLs','# 2 hom SLs', '#2 hom non SLs',
                       '# total SLs', '# total non SLs']))

for r in collins_results: print(r)

Self pairs: 2
(4125, '# 0 hom SLs')
(81806, '# 0 hom non SLs')
(1434, '# 1 hom SLs')
(20691, '#1 hom non SLs')
(120, '# 2 hom SLs')
(1262, '#2 hom non SLs')
(5679, '# total SLs')
(103759, '# total non SLs')


### Roguev

In [10]:
roguev_fp = '../data/gi/roguev/sp/roguev-sp-emap-gis-std.tsv'
roguev_df = load_sls(roguev_fp)
roguev_pair2outcome = pair2outcome(roguev_df)
roguev_results = list(zip(summary_statistics(set(sp_homs), roguev_pair2outcome, set(sp_nodes)), 
                      ['# 0 hom SLs', '# 0 hom non SLs','# 1 hom SLs',
                       '#1 hom non SLs','# 2 hom SLs', '#2 hom non SLs',
                       '# total SLs', '# total non SLs']))

for r in roguev_results: print(r)

Self pairs: 2
(1102, '# 0 hom SLs')
(9633, '# 0 hom non SLs')
(1164, '# 1 hom SLs')
(9645, '#1 hom non SLs')
(290, '# 2 hom SLs')
(2380, '#2 hom non SLs')
(2556, '# total SLs')
(21658, '# total non SLs')


In [11]:
roguev_fp = '../data/gi/roguev/sp/roguev-sp-emap-gis-std.tsv'
roguev_df = load_sls(roguev_fp)
roguev_pair2outcome = pair2outcome(roguev_df)
roguev_results = list(zip(summary_statistics(set(sp_homs), roguev_pair2outcome, set(sp_nodes),set(sp_homs[:400])), 
                      ['# 0 hom SLs', '# 0 hom non SLs','# 1 hom SLs',
                       '#1 hom non SLs','# 2 hom SLs', '#2 hom non SLs',
                       '# total SLs', '# total non SLs']))

for r in roguev_results: print(r)

Self pairs: 2
(1102, '# 0 hom SLs')
(9633, '# 0 hom non SLs')
(644, '# 1 hom SLs')
(5674, '#1 hom non SLs')
(107, '# 2 hom SLs')
(812, '#2 hom non SLs')
(1853, '# total SLs')
(16119, '# total non SLs')


### Biogrid Sp V3.4

In [13]:
biogrid_sp_fp = '../data/gi/biogrid/sp/sp-biogrid.v3.4.157-sls-std.tsv'
biogrid_sp_df = load_sls(biogrid_sp_fp)
biogrid_sp_pair2outcome = pair2outcome(biogrid_sp_df)
biogrid_sp_results = list(zip(summary_statistics(set(sp_homs), biogrid_sp_pair2outcome, set(sp_nodes)), 
                      ['# 0 hom SLs', '# 0 hom non SLs','# 1 hom SLs',
                       '#1 hom non SLs','# 2 hom SLs', '#2 hom non SLs',
                       '# total SLs', '# total non SLs']))

for r in biogrid_sp_results: print(r)

Self pairs: 1
(241, '# 0 hom SLs')
(0, '# 0 hom non SLs')
(292, '# 1 hom SLs')
(0, '#1 hom non SLs')
(125, '# 2 hom SLs')
(0, '#2 hom non SLs')
(658, '# total SLs')
(0, '# total non SLs')


In [14]:
biogrid_sp_fp = '../data/gi/biogrid/sp/sp-biogrid.v3.4.157-sls-std.tsv'
biogrid_sp_df = load_sls(biogrid_sp_fp)
biogrid_sp_pair2outcome = pair2outcome(biogrid_sp_df)
biogrid_sp_results = list(zip(summary_statistics(set(sp_homs), biogrid_sp_pair2outcome, set(sp_nodes), set(sp_nodes[:400])), 
                      ['# 0 hom SLs', '# 0 hom non SLs','# 1 hom SLs',
                       '#1 hom non SLs','# 2 hom SLs', '#2 hom non SLs',
                       '# total SLs', '# total non SLs']))

for r in biogrid_sp_results: print(r)

Self pairs: 1
(150, '# 0 hom SLs')
(0, '# 0 hom non SLs')
(231, '# 1 hom SLs')
(0, '#1 hom non SLs')
(103, '# 2 hom SLs')
(0, '#2 hom non SLs')
(484, '# total SLs')
(0, '# total non SLs')


In [16]:
biogrid_sc_fp = '../data/gi/biogrid/sc/sc-biogrid.v3.4.157-sls-std.tsv'
biogrid_sc_df = load_sls(biogrid_sc_fp)
biogrid_sc_pair2outcome = pair2outcome(biogrid_sc_df)
biogrid_sc_results = list(zip(summary_statistics(set(sc_homs), biogrid_sc_pair2outcome, set(sc_nodes)), 
                      ['# 0 hom SLs', '# 0 hom non SLs','# 1 hom SLs',
                       '#1 hom non SLs','# 2 hom SLs', '#2 hom non SLs',
                       '# total SLs', '# total non SLs']))

for r in biogrid_sc_results: print(r)

Self pairs: 4
(6021, '# 0 hom SLs')
(0, '# 0 hom non SLs')
(6124, '# 1 hom SLs')
(0, '#1 hom non SLs')
(1380, '# 2 hom SLs')
(0, '#2 hom non SLs')
(13525, '# total SLs')
(0, '# total non SLs')


In [17]:
biogrid_sc_fp = '../data/gi/biogrid/sc/sc-biogrid.v3.4.157-sls-std.tsv'
biogrid_sc_df = load_sls(biogrid_sc_fp)
biogrid_sc_pair2outcome = pair2outcome(biogrid_sc_df)
biogrid_sc_results = list(zip(summary_statistics(set(sc_homs), biogrid_sc_pair2outcome, set(sc_nodes), set(sc_nodes[:400])), 
                      ['# 0 hom SLs', '# 0 hom non SLs','# 1 hom SLs',
                       '#1 hom non SLs','# 2 hom SLs', '#2 hom non SLs',
                       '# total SLs', '# total non SLs']))

for r in biogrid_sc_results: print(r)

Self pairs: 4
(5242, '# 0 hom SLs')
(0, '# 0 hom non SLs')
(5357, '# 1 hom SLs')
(0, '#1 hom non SLs')
(1132, '# 2 hom SLs')
(0, '#2 hom non SLs')
(11731, '# total SLs')
(0, '# total non SLs')


In [21]:
def sl_data_homologs(A_df, B_df, homologs):
    INCONCLUSIVE = 'Inconclusive'
    A_pairs = [(u,v) for u, v, c in zip(A_df['Gene A'], A_df['Gene B'], A_df['Category']) if c != INCONCLUSIVE ]
    B_pairs = [(u,v) for u, v, c in zip(B_df['Gene A'], B_df['Gene B'], B_df['Category']) if c != INCONCLUSIVE ]
    A1_genes, A2_genes = zip(*A_pairs)
    A_genes = set(A1_genes) | set(A2_genes)
    B1_genes, B2_genes = zip(*B_pairs)
    B_genes = set(B1_genes) | set(B2_genes)
    
    homs = [ (u,v) for u, v in homologs if u not in A_genes and v not in B_genes] 
    return homs
    

In [35]:
biogrid_homs = sl_data_homologs(biogrid_sp_df, biogrid_sc_df, homologs)
collins_roguev_homs = sl_data_homologs(roguev_df, collins_df, homologs)

In [36]:
print(len(biogrid_homs))
print(len(collins_roguev_homs))

123
625


In [37]:
len(homologs)

775

In [43]:
def sl_data_genes(df):
    INCONCLUSIVE = 'Inconclusive'
    A_pairs = [(u,v) for u, v, c in zip(df['Gene A'], df['Gene B'], df['Category']) if c != INCONCLUSIVE ]
    A1_genes, A2_genes = zip(*A_pairs)
    A_genes = set(A1_genes) | set(A2_genes)
    return A_genes

In [44]:
len(sl_data_genes(collins_df))

661

In [45]:
len(sl_data_genes(roguev_df))

540

In [46]:
len(sl_data_genes(biogrid_sc_df))

3507

In [47]:
len(sl_data_genes(biogrid_sp_df))

531