### COVID-19 related protein-protein interactions from https://thebiogrid.org/news/227



In [1]:
# Data downloaded from https://thebiogrid.org/news/227 on April 19.

import pandas as pd

# Load the BioGRID coronavirus interaction table; the file is tab-delimited.
biogrid = pd.read_csv(
    "../data/BIOGRID-CORONAVIRUS-3.5.184.tab3.txt",
    delimiter="\t",
)

# Print the column list with non-null counts and dtypes of the loaded frame.
biogrid.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 491 entries, 0 to 490
Data columns (total 35 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   #BioGRID Interaction ID             491 non-null    int64 
 1   Entrez Gene Interactor A            491 non-null    object
 2   Entrez Gene Interactor B            491 non-null    int64 
 3   BioGRID ID Interactor A             491 non-null    int64 
 4   BioGRID ID Interactor B             491 non-null    int64 
 5   Systematic Name Interactor A        491 non-null    object
 6   Systematic Name Interactor B        491 non-null    object
 7   Official Symbol Interactor A        491 non-null    object
 8   Official Symbol Interactor B        491 non-null    object
 9   Synonyms Interactor A               491 non-null    object
 10  Synonyms Interactor B               491 non-null    object
 11  Experimental System                 491 non-null    object

In [2]:
# Of all BioGRID columns, keep only those that can be transferred to Wikidata.
columns_to_keep = [
    # interacting partners
    "Official Symbol Interactor A",
    "Official Symbol Interactor B",
    # how the interaction was detected
    "Experimental System",
    "Experimental System Type",
    # provenance
    "Author",
    "Publication Source",
    # organisms of both partners
    "Organism Interactor A",
    "Organism Interactor B",
    # originating database
    "Source Database",
]

# Reduced view of the BioGRID table used by the rest of the notebook.
biogrid_tiny = biogrid[columns_to_keep]

Official Symbols + organism => Identification of interacting proteins on Wikidata. 

Publication Source => Parse and use it to find the reference on Wikidata. If the publication is not on Wikidata yet, add it. This reference will be added alongside the BioGRID reference.

Experimental System => Reconcile and match using the property "determination method" (P459).

Each statement's reference will contain:
- stated in (P248): Q4914619 (BioGRID)
- retrieved (P813): today's date
- reference URL (P854)

Filter to rows where "Experimental System Type" is "physical".



In [3]:
%run ../../wikidata_parser.py
# Distinct organism identifiers found in the "Organism Interactor A" column.
ncbi_ids = set(biogrid_tiny["Organism Interactor A"])

# Map each identifier to the first Wikidata item returned for property P685.
# NOTE(review): get_wikidata_items_for_property_value_pair is presumably defined
# by the wikidata_parser.py script run at the top of this cell -- confirm.
ncbi_to_wikidata = {
    taxon_id: get_wikidata_items_for_property_value_pair('P685', taxon_id)[0]
    for taxon_id in ncbi_ids
}

print(ncbi_to_wikidata)

{9606: 'Q15978631', 10090: 'Q83310', 1335626: 'Q4902157', 10029: 'Q2539773', 694009: 'Q278567', 2697049: 'Q82069695'}


In [4]:
# Write the reduced table to the intermediate directory as CSV (row index included).
biogrid_tiny.to_csv(path_or_buf="../intermediate/biogrid_tiny.csv")

In [74]:
# Start with only the SARS-CoV-2 -- human interactions from the Gordon et al. paper.
# They are printed below so they can be reconciled via OpenRefine.
# Not all of these proteins were previously on UniProt, so let's take a look.
is_gordon_row = biogrid_tiny["Author"].eq("Gordon DE (2020)")
biogrid_gordon = biogrid_tiny[is_gordon_row]


In [75]:
# Print every distinct interactor A symbol once (set order is arbitrary).
interactor_a_symbols = set(biogrid_gordon["Official Symbol Interactor A"])
for symbol in interactor_a_symbols:
    print(symbol)

nsp15
nsp11
S
ORF9b
nsp1
ORF6
ORF3a
nsp4
ORF9c
N
nsp6
ORF3b
nsp2
nsp14
nsp7
nsp13
ORF7a
ORF10
nsp9
E
nsp8
ORF8
nsp12
M
nsp10
nsp5


In [76]:
# Load the lookup table keyed by interactor A symbol and attach its columns to
# every Gordon et al. row; rows without a match are kept (left join).
reference_ids_sars = pd.read_csv("protein_wikidata_dictionary.csv")
biogrid_gordon = pd.merge(
    biogrid_gordon,
    reference_ids_sars,
    how="left",
    on="Official Symbol Interactor A",
)

In [93]:
# Discard every row that has a missing value in any column, then preview the result.
biogrid_gordon = biogrid_gordon.dropna(axis="index", how="any")
biogrid_gordon.head(n=5)

Unnamed: 0,Official Symbol Interactor A,Official Symbol Interactor B,Experimental System,Experimental System Type,Author,Publication Source,Organism Interactor A,Organism Interactor B,Source Database,Column 1,qid_sarscov2,taxon,genesymbol,encoded_protein,encoded_proteinLabel
0,E,AP3B1,Affinity Capture-MS,physical,Gordon DE (2020),DOI:10.1101/2020.03.22.002386,2697049,9606,BIOGRID,envelope protein [SARS-CoV-2],Q88655710,Q82069695,AP3B1,Q21106367,Adaptor related protein complex 3 subunit beta 1
1,E,BRD4,Affinity Capture-MS,physical,Gordon DE (2020),DOI:10.1101/2020.03.22.002386,2697049,9606,BIOGRID,envelope protein [SARS-CoV-2],Q88655710,Q82069695,BRD4,Q21100441,Bromodomain containing 4
2,E,BRD2,Affinity Capture-MS,physical,Gordon DE (2020),DOI:10.1101/2020.03.22.002386,2697049,9606,BIOGRID,envelope protein [SARS-CoV-2],Q88655710,Q82069695,BRD2,Q21100450,Bromodomain containing 2
3,E,CWC27,Affinity Capture-MS,physical,Gordon DE (2020),DOI:10.1101/2020.03.22.002386,2697049,9606,BIOGRID,envelope protein [SARS-CoV-2],Q88655710,Q82069695,CWC27,Q21108518,CWC27 spliceosome associated cyclophilin
4,E,ZC3H18,Affinity Capture-MS,physical,Gordon DE (2020),DOI:10.1101/2020.03.22.002386,2697049,9606,BIOGRID,envelope protein [SARS-CoV-2],Q88655710,Q82069695,ZC3H18,Q21134430,Zinc finger CCCH-type containing 18


In [78]:
# Print every distinct experimental system among the remaining interactions.
experimental_systems = set(biogrid_gordon["Experimental System"])
for system_name in experimental_systems:
    print(system_name)

Affinity Capture-MS


In [79]:
# Gene symbol -> encoded protein lookup table, with exact duplicate rows removed.
reference_ids_humans = pd.read_csv("hgnc_to_protein_qid.csv").drop_duplicates()

# Keep only the fifth "/"-separated piece of each value.
# NOTE(review): presumably the bare QID at the end of a Wikidata entity URL -- confirm.
reference_ids_humans["encoded_protein"] = reference_ids_humans["encoded_protein"].map(
    lambda entity_url: entity_url.split("/")[4]
)

# Attach the lookup columns to each interaction by matching interactor B's
# symbol against the gene symbol; rows without a match are kept (left join).
biogrid_gordon = pd.merge(
    biogrid_gordon,
    reference_ids_humans,
    how="left",
    left_on="Official Symbol Interactor B",
    right_on="genesymbol",
)

In [81]:
# Print the merged frame's column list with non-null counts and dtypes,
# which shows how many rows are missing values in the merged-in lookup columns.
biogrid_gordon.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 257 entries, 0 to 256
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Official Symbol Interactor A  257 non-null    object
 1   Official Symbol Interactor B  257 non-null    object
 2   Experimental System           257 non-null    object
 3   Experimental System Type      257 non-null    object
 4   Author                        257 non-null    object
 5   Publication Source            257 non-null    object
 6   Organism Interactor A         257 non-null    int64 
 7   Organism Interactor B         257 non-null    int64 
 8   Source Database               257 non-null    object
 9   Column 1                      257 non-null    object
 10  qid_sarscov2                  257 non-null    object
 11  taxon                         257 non-null    object
 12  genesymbol                    244 non-null    object
 13  encoded_protein     

In [98]:
import datetime
from datetime import date, timedelta

# Generate QuickStatements commands for every interaction in `biogrid_gordon`
# and write them to "gordon_qs.txt". Each interaction yields two P129
# statements (one in each direction), both carrying the same qualifier and
# reference.
# NOTE(review): P129 is presumably "physically interacts with" -- confirm.

today = date.today()
# Date literal with the "/11" precision suffix, used for the S813 value.
today_in_wikidata_format = today.strftime("+%Y-%m-%dT00:00:00Z/11")

# Qualifier: determination method (P459).
determination_method = "|P459|" + "Q91149819"
# Reference: stated in (S248). This cell previously assigned Q4914619 (BioGRID)
# first and immediately overwrote it; only the effective value is kept.
stated_in = "|S248|" + "Q87917582" # A SARS-CoV-2-Human Protein-Protein Interaction Map Reveals Drug Targets and Potential Drug-Repurposing
# Reference: reference URL (S854) followed by the retrieval date (S813).
reference_url = "|S854|" + '"https://thebiogrid.org/news/227"' + "|S813|" + today_in_wikidata_format

# Part shared by every generated command.
statement_suffix = determination_method + reference_url + stated_in

skipped_rows = 0

# The context manager guarantees the file is flushed and closed, even on error.
with open("gordon_qs.txt", "w+") as f:
    for index, row in biogrid_gordon.iterrows():
        sars_protein_qid = row["qid_sarscov2"]
        human_protein_qid = row["encoded_protein"]

        # Both merges are left joins, so either QID can be missing (NaN);
        # concatenating NaN with a string would raise TypeError.
        if pd.isna(sars_protein_qid) or pd.isna(human_protein_qid):
            skipped_rows += 1
            continue

        # One statement per direction of the interaction.
        quickstatements_commands = (
            sars_protein_qid + "|P129|" + human_protein_qid + statement_suffix + "\n"
            + human_protein_qid + "|P129|" + sars_protein_qid + statement_suffix + "\n"
        )

        f.write(quickstatements_commands)

if skipped_rows:
    print(f"Skipped {skipped_rows} interaction(s) with a missing Wikidata QID.")