### COVID-19 related protein-protein interactions from https://thebiogrid.org/news/227



In [25]:
# Source data: downloaded from https://thebiogrid.org/news/227 on April 19.

import pandas as pd

# The file is tab-delimited, hence the explicit delimiter.
biogrid = pd.read_csv(
    "../data/BIOGRID-CORONAVIRUS-3.5.184.tab3.txt",
    delimiter="\t",
)


In [26]:
# Of all the BioGRID columns, keep only those that can be moved to Wikidata.

columns_to_keep = [
    "Official Symbol Interactor A",
    "Official Symbol Interactor B",
    "Experimental System",
    "Experimental System Type",
    "Author",
    "Publication Source",
    "Organism Interactor A",
    "Organism Interactor B",
    "Source Database",
]

# Column-only selection: every row is retained.
biogrid_tiny = biogrid.loc[:, columns_to_keep]

Official Symbols + organism => Identification of interacting proteins on Wikidata. 

Publication Source	=> Parse and use to find reference on Wikidata. If not on Wikidata, add reference. Reference will be added alongside BioGrid reference

Experimental system => Reconcile and match using the property "determination method" (P459).

Every statement is referenced with:

- stated in Q4914619 (BioGRID)
- retrieved: today's date
- reference URL

Filter by experimental system type = "physical".



In [27]:
%run ../../wikidata_parser.py
ncbi_ids = set(biogrid_tiny["Organism Interactor A"])

ncbi_to_wikidata = {}

for ncbi_id in ncbi_ids:
    ncbi_to_wikidata[ncbi_id] = get_wikidata_items_for_property_value_pair('P685', ncbi_id)[0]

print(ncbi_to_wikidata)

{9606: 'Q15978631', 10090: 'Q83310', 1335626: 'Q4902157', 10029: 'Q2539773', 694009: 'Q278567', 2697049: 'Q82069695'}


In [28]:
# Write the reduced table to the intermediate folder as CSV.
biogrid_tiny.to_csv(path_or_buf="../intermediate/biogrid_tiny.csv")

In [29]:
# Let's start with only the SARS-CoV-2 -- human interactions from the Gordon et al. paper.
# These rows are meant to be reconciled via OpenRefine.
# Not all of these proteins were previously on UniProt. Let's look.
is_gordon = biogrid_tiny["Author"] == "Gordon DE (2020)"
biogrid_gordon = biogrid_tiny.loc[is_gordon]


In [31]:
# Join the lookup table from protein_wikidata_dictionary.csv onto the Gordon
# rows by the symbol of interactor A; rows without a match are kept (left join).
reference_ids_sars = pd.read_csv("protein_wikidata_dictionary.csv")
biogrid_gordon = pd.merge(
    biogrid_gordon,
    reference_ids_sars,
    how="left",
    on="Official Symbol Interactor A",
)

In [32]:
# Drop every row that has a missing value in any column, then preview the result.
biogrid_gordon = biogrid_gordon.dropna(axis=0, how="any")
biogrid_gordon.head(n=1)

Unnamed: 0,Official Symbol Interactor A,Official Symbol Interactor B,Experimental System,Experimental System Type,Author,Publication Source,Organism Interactor A,Organism Interactor B,Source Database,Column 1,qid_sarscov2,taxon
0,E,AP3B1,Affinity Capture-MS,physical,Gordon DE (2020),DOI:10.1101/2020.03.22.002386,2697049,9606,BIOGRID,envelope protein [SARS-CoV-2],Q88655710,Q82069695
1,E,BRD4,Affinity Capture-MS,physical,Gordon DE (2020),DOI:10.1101/2020.03.22.002386,2697049,9606,BIOGRID,envelope protein [SARS-CoV-2],Q88655710,Q82069695
2,E,BRD2,Affinity Capture-MS,physical,Gordon DE (2020),DOI:10.1101/2020.03.22.002386,2697049,9606,BIOGRID,envelope protein [SARS-CoV-2],Q88655710,Q82069695
3,E,CWC27,Affinity Capture-MS,physical,Gordon DE (2020),DOI:10.1101/2020.03.22.002386,2697049,9606,BIOGRID,envelope protein [SARS-CoV-2],Q88655710,Q82069695
4,E,ZC3H18,Affinity Capture-MS,physical,Gordon DE (2020),DOI:10.1101/2020.03.22.002386,2697049,9606,BIOGRID,envelope protein [SARS-CoV-2],Q88655710,Q82069695


In [34]:
# Lookup table from hgnc_to_protein_qid.csv, with exact duplicate rows removed.
reference_ids_humans = pd.read_csv("hgnc_to_protein_qid.csv").drop_duplicates()

# Keep only the fifth "/"-separated piece of each encoded_protein value
# (presumably the QID at the end of an entity URI -- TODO confirm).
reference_ids_humans["encoded_protein"] = reference_ids_humans["encoded_protein"].map(
    lambda uri: uri.split("/")[4]
)

# Match interactor B symbols against the genesymbol column; unmatched rows are kept.
biogrid_gordon = pd.merge(
    biogrid_gordon,
    reference_ids_humans,
    how="left",
    left_on="Official Symbol Interactor B",
    right_on="genesymbol",
)

In [38]:
# Discard rows that are missing a value in any column after the second merge.
biogrid_gordon = biogrid_gordon.dropna(axis=0, how="any")

In [39]:
import datetime
from datetime import date, timedelta

# Today's date as a QuickStatements time literal; the "/11" suffix marks day precision.
today = date.today()
today_in_wikidata_format = today.strftime("+%Y-%m-%dT00:00:00Z/11")

sars_qid = "Q82069695"

# Qualifier and reference fragments shared by every generated statement.
determination_method = "|P459|" + "Q91149819"
stated_in_1 = "|S248|" + "Q4914619" #BioGRID
stated_in_2 = "|S248|" + "Q91031402" # A SARS-CoV-2-Human Protein-Protein Interaction Map Reveals Drug Targets and Potential Drug-Repurposing

stated_in = stated_in_1 + stated_in_2
reference_url = "|S854|" + '"https://thebiogrid.org/news/227"' + "|S813|" + today_in_wikidata_format

# Everything that follows the statement value is identical for both directions.
qualifier_and_references = determination_method + reference_url + stated_in

# The context manager guarantees the file is flushed and closed even if a row
# raises part-way through; the previous bare open() never closed the handle.
with open("gordon_qs.txt", "w+") as f:
    for index, row in biogrid_gordon.iterrows():
        sars_protein_qid = row["qid_sarscov2"]
        human_protein_qid = row["encoded_protein"]

        # Viral protein -> human protein.
        forward_statement = sars_protein_qid + "|P129|" + human_protein_qid + qualifier_and_references + "\n"
        # NOTE(review): the reverse statement points at sars_qid (the
        # organism-level item) rather than at sars_protein_qid -- confirm
        # that this is intended.
        reverse_statement = human_protein_qid + "|P129|" + sars_qid + qualifier_and_references + "\n"

        quickstatements_commands = forward_statement + reverse_statement

        f.write(quickstatements_commands)

Now let's do the same for other interactions in this dataset.

In [42]:
# All interactions whose author entry is not the Gordon et al. dataset.
is_not_gordon = biogrid_tiny["Author"] != "Gordon DE (2020)"
biogrid_not_gordon = biogrid_tiny.loc[is_not_gordon]

In [139]:
from wikidataintegrator import wdi_core

def get_wikidata_item_by_pubmed_id(pubmed_id):
    """Look up the Wikidata item whose P698 value equals ``pubmed_id``.

    Thin wrapper: delegates to ``get_wikidata_item`` and returns its result
    unchanged.
    """
    return get_wikidata_item("P698", pubmed_id)

def get_wikidata_item_by_doi(doi):
    """Look up the Wikidata item whose P356 value equals ``doi``.

    Thin wrapper: delegates to ``get_wikidata_item`` and returns its result
    unchanged.
    """
    # NOTE(review): the DOI is forwarded exactly as given; confirm whether the
    # stored P356 values follow a particular letter case.
    return get_wikidata_item("P356", doi)

def get_wikidata_item(wikidata_property, value):
    """Return the QID of the first Wikidata item matching a property/value pair.

    Parameters
    ----------
    wikidata_property : str
        Property id used with the ``wdt:`` prefix, e.g. ``"P698"``.
    value : str
        Literal to match; it is embedded in the query as a quoted string.

    Returns
    -------
    str
        The fifth "/"-separated segment of the first matched item's URI, or
        the string ``"not found"`` when the result has no usable first binding.
    """
    # Escape backslashes and double quotes so the value cannot terminate the
    # SPARQL string literal early.
    literal = str(value).replace("\\", "\\\\").replace('"', '\\"')
    query = (
        "SELECT DISTINCT ?item WHERE { ?item wdt:"
        + wikidata_property
        + ' "' + literal + '" }'
    )
    query_result = wdi_core.WDItemEngine.execute_sparql_query(query)

    # Only lookup failures mean "no match"; a bare except would also swallow
    # KeyboardInterrupt and SystemExit.
    try:
        match = query_result['results']['bindings'][0]
    except (KeyError, IndexError, TypeError):
        return "not found"

    item_uri = match["item"]["value"]
    return item_uri.split("/")[4]

def get_wikidata_dictionary_for_sources(publication_sources):
    """Map each publication source string to the QID found for it.

    Parameters
    ----------
    publication_sources : iterable of str
        Strings of the form ``"PUBMED:<id>"`` or ``"DOI:<doi>"``.

    Returns
    -------
    dict
        ``{source: qid}``. Sources mentioning neither ``PUBMED`` nor ``DOI``
        map to the string ``"not found"``; otherwise the value is whatever the
        corresponding lookup helper returns.
    """
    source_qids = {}
    for source in publication_sources:
        if "PUBMED" in source:
            # Split on the first ":" only, so the identifier is kept whole.
            pubmed_id = source.split(":", 1)[1]
            paper_qid = get_wikidata_item_by_pubmed_id(pubmed_id)
        elif "DOI" in source:
            # A DOI may itself contain ":"; splitting once avoids truncating it.
            doi = source.split(":", 1)[1]
            paper_qid = get_wikidata_item_by_doi(doi)
        else:
            paper_qid = "not found"

        # Echo each processed source.
        print(source)

        source_qids[source] = paper_qid

    return source_qids


In [140]:
# Keep only interactions in which both partners belong to organism 9606 or
# 2697049 (the ids that appear for human and SARS-CoV-2 rows in this dataset).
organisms_of_interest = ["9606", "2697049"]

# Compare as strings so the filter works whether the organism columns were
# parsed as integers or as text.
interactor_a_selected = (
    biogrid_not_gordon["Organism Interactor A"].astype(str).isin(organisms_of_interest)
)
interactor_b_selected = (
    biogrid_not_gordon["Organism Interactor B"].astype(str).isin(organisms_of_interest)
)

# Both masks are built from the same frame and applied together, so the mask
# index always matches the frame being filtered. The copy makes the result an
# independent frame that can be given new columns later.
biogrid_not_gordon_hs_and_sc2 = biogrid_not_gordon[
    interactor_a_selected & interactor_b_selected
].copy()

  


In [141]:
# Remove rows in which both interactors carry the same official symbol.
symbol_a = biogrid_not_gordon_hs_and_sc2["Official Symbol Interactor A"]
symbol_b = biogrid_not_gordon_hs_and_sc2["Official Symbol Interactor B"]
biogrid_not_gordon_hs_and_sc2 = biogrid_not_gordon_hs_and_sc2.loc[symbol_a != symbol_b]

In [145]:
# Resolve every distinct publication source once.
publication_sources = list(
    set(biogrid_not_gordon_hs_and_sc2.loc[:, "Publication Source"])
)
source_to_wikidata = get_wikidata_dictionary_for_sources(publication_sources)


PUBMED:32155444
PUBMED:27519799
DOI:10.1101/2020.02.16.951723
DOI:10.1101/2020.03.16.994236
DOI:10.1101/2020.03.29.013490
DOI:10.1101/2020.02.17.951848
DOI:10.1101/2020.03.16.993386
DOI:10.1101/2020.03.29.20041962
PUBMED:30102747
PUBMED:32075877
PUBMED:32203189
PUBMED:32132184
PUBMED:32225176
DOI:10.1101/2020.03.14.988345


In [146]:
# Translate each row's publication source through the lookup built above and
# store the result in a new column.
source_qid_per_row = biogrid_not_gordon_hs_and_sc2["Publication Source"].map(source_to_wikidata)
biogrid_not_gordon_hs_and_sc2["source_qid"] = source_qid_per_row

In [147]:
# Display the filtered interactions together with the new source_qid column.
biogrid_not_gordon_hs_and_sc2

Unnamed: 0,Official Symbol Interactor A,Official Symbol Interactor B,Experimental System,Experimental System Type,Author,Publication Source,Organism Interactor A,Organism Interactor B,Source Database,source_qid
0,RCHY1,CAMK2D,Affinity Capture-Western,physical,Ma-Lauer Y (2016),PUBMED:27519799,9606,9606,BIOGRID,Q37258617
333,ACE2,S,Co-crystal Structure,physical,Yan R (2020),PUBMED:32132184,9606,2697049,BIOGRID,Q87726414
335,ACE2,SLC6A19,Co-crystal Structure,physical,Yan R (2020),PUBMED:32132184,9606,9606,BIOGRID,Q87726414
336,ACE2,SLC6A19,Reconstituted Complex,physical,Yan R (2020),PUBMED:32132184,9606,9606,BIOGRID,Q87726414
338,S,ACE2,Reconstituted Complex,physical,Wrapp D (2020),PUBMED:32075877,2697049,9606,BIOGRID,Q89108866
339,S,ACE2,Co-crystal Structure,physical,Wrapp D (2020),PUBMED:32075877,2697049,9606,BIOGRID,Q89108866
341,S,ACE2,Reconstituted Complex,physical,Walls AC (2020),PUBMED:32155444,2697049,9606,BIOGRID,Q87973551
353,MASP2,N,Reconstituted Complex,physical,Gao T (2020),DOI:10.1101/2020.03.29.20041962,9606,2697049,BIOGRID,Q91208180
357,MASP2,MBL2,Reconstituted Complex,physical,Gao T (2020),DOI:10.1101/2020.03.29.20041962,9606,9606,BIOGRID,Q91208180
358,MASP2,C4A,Biochemical Activity,physical,Gao T (2020),DOI:10.1101/2020.03.29.20041962,9606,9606,BIOGRID,Q91208180
