In [1]:
import os, sys
import urllib
import zipfile
import gzip
import pandas as pd

from collections import defaultdict
from goatools.anno.gaf_reader import GafReader
from goatools.obo_parser import GODag

import networkx as nx

data_dir = "../data/"

In [2]:
def download_and_unzip(download_url_link, dir_path, zipped_filename,destination_dir_name, unzip=True):
    #https://www.tutorialsbuddy.com/download-and-unzip-a-zipped-file-in-python
    print("Download starting")

    urllib.request.urlretrieve(
        download_url_link, os.path.join(dir_path, zipped_filename)
    )
    print("Download complete")

    if unzip:
        print("unzipping file starting")
    
        if zipped_filename.endswith(".zip"):
            with zipfile.ZipFile(os.path.join(dir_path, zipped_filename), "r") as zip_file:
                zip_file.extractall(os.path.join(dir_path, destination_dir_name))
        elif zipped_filename.endswith(".gz"):
            print("zipfile")
            with gzip.GzipFile(os.path.join(dir_path, zipped_filename), "rb") as zip_file:
                with open(os.path.join(dir_path, destination_dir_name, zipped_filename.replace(".gz", "")), "wb") as fout:
                    fout.write(zip_file.read())
        else:
            raise NotImplementedError("NO CASE")
            
    
    print("unzipping complete")

In [3]:
if not os.path.exists(os.path.join(data_dir, "9606.protein.links.full.txt.gz")):
    download_and_unzip("https://stringdb-static.org/download/protein.links.full.v11.5/9606.protein.links.full.v11.5.txt.gz", ".", os.path.join(data_dir, "9606.protein.links.full.txt.gz"), ".")
        

In [4]:
df = pd.read_csv(os.path.join(data_dir, "9606.protein.links.full.txt"), sep=" ")

In [5]:
df.head()

Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
0,9606.ENSP00000000233,9606.ENSP00000379496,0,0,0,0,0,0,54,0,0,0,0,103,85,155
1,9606.ENSP00000000233,9606.ENSP00000314067,0,0,0,0,0,0,0,0,180,0,0,0,61,197
2,9606.ENSP00000000233,9606.ENSP00000263116,0,0,0,0,0,0,62,0,152,0,0,0,101,222
3,9606.ENSP00000000233,9606.ENSP00000361263,0,0,0,0,0,0,0,0,161,0,0,47,58,181
4,9606.ENSP00000000233,9606.ENSP00000409666,0,0,0,0,0,60,63,0,213,0,0,0,72,270


In [6]:
df.columns

Index(['protein1', 'protein2', 'neighborhood', 'neighborhood_transferred',
       'fusion', 'cooccurence', 'homology', 'coexpression',
       'coexpression_transferred', 'experiments', 'experiments_transferred',
       'database', 'database_transferred', 'textmining',
       'textmining_transferred', 'combined_score'],
      dtype='object')

In [7]:
df[df.protein1.str.contains("ENSP00000225831") |df.protein2.str.contains("ENSP00000225831") ].sort_values("combined_score", ascending=False)

Unnamed: 0,protein1,protein2,neighborhood,neighborhood_transferred,fusion,cooccurence,homology,coexpression,coexpression_transferred,experiments,experiments_transferred,database,database_transferred,textmining,textmining_transferred,combined_score
3125481,9606.ENSP00000292301,9606.ENSP00000225831,0,0,0,0,0,0,62,679,78,800,0,989,634,999
575537,9606.ENSP00000225831,9606.ENSP00000292301,0,0,0,0,0,0,62,679,78,800,0,989,634,999
3900117,9606.ENSP00000306512,9606.ENSP00000225831,0,0,0,0,0,140,188,280,0,900,0,914,143,995
576240,9606.ENSP00000225831,9606.ENSP00000306512,0,0,0,0,0,140,188,280,0,900,0,914,143,995
9353673,9606.ENSP00000385675,9606.ENSP00000225831,0,0,0,0,0,250,64,0,0,900,0,915,216,994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576492,9606.ENSP00000225831,9606.ENSP00000358391,0,0,0,0,0,0,77,0,0,0,0,117,0,150
575839,9606.ENSP00000225831,9606.ENSP00000351742,0,0,0,0,0,0,0,0,0,0,0,135,58,150
6182829,9606.ENSP00000351742,9606.ENSP00000225831,0,0,0,0,0,0,0,0,0,0,0,135,58,150
577007,9606.ENSP00000225831,9606.ENSP00000237696,0,0,0,0,0,88,0,0,0,0,0,107,0,150


In [9]:
use_evidences = ['fusion', 'coexpression','experiments','database','textmining']

In [10]:
subdf = df[["protein1", "protein2"]+use_evidences]
subdf

Unnamed: 0,protein1,protein2,fusion,coexpression,experiments,database,textmining
0,9606.ENSP00000000233,9606.ENSP00000379496,0,0,0,0,103
1,9606.ENSP00000000233,9606.ENSP00000314067,0,0,0,0,0
2,9606.ENSP00000000233,9606.ENSP00000263116,0,0,0,0,0
3,9606.ENSP00000000233,9606.ENSP00000361263,0,0,0,0,47
4,9606.ENSP00000000233,9606.ENSP00000409666,0,60,0,0,0
...,...,...,...,...,...,...,...
11938493,9606.ENSP00000485678,9606.ENSP00000354800,0,213,0,0,0
11938494,9606.ENSP00000485678,9606.ENSP00000308270,0,152,0,0,0
11938495,9606.ENSP00000485678,9606.ENSP00000335660,0,182,0,0,0
11938496,9606.ENSP00000485678,9606.ENSP00000300127,0,155,0,0,0


In [11]:
subdf["score"] = subdf[use_evidences].max(axis=1)/1000
subdf = subdf[subdf.score > 0]
subdf

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subdf["score"] = subdf[use_evidences].max(axis=1)/1000


Unnamed: 0,protein1,protein2,fusion,coexpression,experiments,database,textmining,score
0,9606.ENSP00000000233,9606.ENSP00000379496,0,0,0,0,103,0.103
3,9606.ENSP00000000233,9606.ENSP00000361263,0,0,0,0,47,0.047
4,9606.ENSP00000000233,9606.ENSP00000409666,0,60,0,0,0,0.060
5,9606.ENSP00000000233,9606.ENSP00000324287,0,49,0,0,723,0.723
7,9606.ENSP00000000233,9606.ENSP00000392206,0,0,0,0,242,0.242
...,...,...,...,...,...,...,...,...
11938493,9606.ENSP00000485678,9606.ENSP00000354800,0,213,0,0,0,0.213
11938494,9606.ENSP00000485678,9606.ENSP00000308270,0,152,0,0,0,0.152
11938495,9606.ENSP00000485678,9606.ENSP00000335660,0,182,0,0,0,0.182
11938496,9606.ENSP00000485678,9606.ENSP00000300127,0,155,0,0,0,0.155


In [11]:
#only retain interactions with at least high confidence (0.7) (medium=0.4, low=0.15, high=0.9)
subdf = subdf[subdf.score >= 0.7]
subdf

Unnamed: 0,protein1,protein2,fusion,coexpression,experiments,database,textmining,score
5,9606.ENSP00000000233,9606.ENSP00000324287,0,49,0,0,723,0.723
144,9606.ENSP00000000233,9606.ENSP00000158762,0,0,0,0,723,0.723
187,9606.ENSP00000000233,9606.ENSP00000440005,0,77,679,900,850,0.900
401,9606.ENSP00000000233,9606.ENSP00000356737,0,0,270,0,882,0.882
944,9606.ENSP00000000233,9606.ENSP00000429900,0,0,0,0,739,0.739
...,...,...,...,...,...,...,...,...
11938200,9606.ENSP00000485663,9606.ENSP00000309474,0,722,0,0,0,0.722
11938211,9606.ENSP00000485663,9606.ENSP00000248342,0,172,984,900,885,0.984
11938226,9606.ENSP00000485663,9606.ENSP00000416255,0,0,800,0,413,0.800
11938234,9606.ENSP00000485663,9606.ENSP00000220849,0,396,979,900,811,0.979


In [12]:
all_ensp_proteins = set()
allStringProts = set(subdf.protein1)
allStringProts.update(subdf.protein2)

for x in allStringProts:
    all_ensp_proteins.add(x.split(".")[1])

In [13]:
len(all_ensp_proteins)

15327

In [14]:
#biomart exported ensembl peptide + hgnc symbol
martDF = pd.read_csv(os.path.join(data_dir, "oct2014_mart_export.txt"), sep="\t")
martDF

Unnamed: 0,Ensembl Gene ID,Ensembl Transcript ID,Ensembl Protein ID,HGNC symbol
0,ENSG00000197468,ENST00000508957,,
1,ENSG00000231049,ENST00000435337,,OR52B5P
2,ENSG00000276385,ENST00000618935,,
3,ENSG00000275151,ENST00000614589,,
4,ENSG00000228913,ENST00000432676,ENSP00000410416,UBD
...,...,...,...,...
209058,LRG_94,LRG_94t1,LRG_94p1,PRF1
209059,LRG_96,LRG_96t1,LRG_96p1,RAB27A
209060,LRG_97,LRG_97t1,LRG_97p1,RAC2
209061,LRG_98,LRG_98t1,LRG_98p1,RAG1


In [15]:
ensemblProt2Gene = defaultdict(set)
for ri, row in martDF[~pd.isna(martDF["Ensembl Protein ID"])].iterrows():
    
    protid = "9606.{}".format(row["Ensembl Protein ID"])
    geneid = row["HGNC symbol"]

    ensemblProt2Gene[protid].add(geneid)

In [16]:
kg = nx.DiGraph()

In [17]:
for ri, row in subdf.iterrows():
    src = row["protein1"]
    tgt = row["protein2"]
    
    if not src in ensemblProt2Gene:
        continue
    if not tgt in ensemblProt2Gene:
        continue
    
    src = ensemblProt2Gene[src]
    tgt = ensemblProt2Gene[tgt]
    
    string_scores = {}
    for sc in use_evidences + ["score"]:
        string_scores[sc] = row[sc]
       
    for s in src:
        for t in tgt: 
            kg.add_edge( s, t, type="interacts", string_scores=string_scores, source="STRINGDB", score=0 )
    

In [18]:
print(kg)

DiGraph with 15197 nodes and 324299 edges
