# Comparing RBP Analyses from Ferrarini et al. and [Schmidt et al.](https://www.nature.com/articles/s41564-020-00846-z)

In [1]:
!pip install biopython
!pip install pyensembl

Collecting biopython
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 4.5 MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.79
Collecting pyensembl
  Downloading pyensembl-1.9.4.tar.gz (55 kB)
[K     |████████████████████████████████| 55 kB 2.2 MB/s 
[?25hCollecting typechecks>=0.0.2
  Downloading typechecks-0.1.0.tar.gz (3.4 kB)
Collecting datacache>=1.1.4
  Downloading datacache-1.1.5.tar.gz (13 kB)
Collecting memoized-property>=1.0.2
  Downloading memoized-property-1.0.3.tar.gz (5.0 kB)
Collecting gtfparse>=1.1.0
  Downloading gtfparse-1.2.1.tar.gz (12 kB)
Collecting serializable
  Downloading serializable-0.2.1.tar.gz (8.4 kB)
Collecting tinytimer
  Downloading tinytimer-0.0.0.tar.gz (2.1 kB)
Collecting progressbar33>=2.4
  Downloading progressbar33-2.4.tar.gz (10 kB)
Collecting mock
  Downloading mock-4.0.3-py3-none-any.whl (28 kB)
Collecting simplejson
  

In [2]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
from pathlib import Path

CONF_DATA = Path("/content/drive/MyDrive/ecbm-e4060-covid-interactions/DATA/Hackathon/output/ki-results")
LIT_DATA = Path("/content/drive/MyDrive/ecbm-e4060-covid-interactions/DATA/Literature")
HACK_DATA = Path("/content/drive/MyDrive/ecbm-e4060-covid-interactions/DATA/Hackathon")

## Comparing Our Output to Schmidt et al.

In [4]:
genome_hits_file = "genome_hits.tsv"
f_utr_hits_file = "f_utr_hits.tsv"
t_utr_hits_file = "t_utr_hits.tsv"
genome_neg_hits_file = "genome_neg_hits.tsv"

In [5]:
import pandas as pd

genome_hits = pd.read_csv(CONF_DATA / genome_hits_file, sep="\t")
f_utr_hits = pd.read_csv(CONF_DATA / f_utr_hits_file, sep="\t")
t_utr_hits = pd.read_csv(CONF_DATA / t_utr_hits_file, sep="\t")
genome_neg_hits = pd.read_csv(CONF_DATA / genome_neg_hits_file, sep="\t")

In [6]:
total_hits = pd.concat([genome_hits, f_utr_hits, t_utr_hits, genome_neg_hits])
total_hits.Gene_name.unique()

array(['HNRNPL', 'FUS', 'MBNL1', 'SRSF1', 'RBMY1A1', 'ZFP36', 'SRSF10',
       'PTBP1', 'YBX2', 'SRSF3', 'PABPC1', 'PABPN1', 'SART3', 'PABPC4',
       'ZNF638', 'PABPC5', 'PABPC3', 'CELF2', 'YBX1', 'ZRANB2', 'KHDRBS3',
       'LIN28A', 'PPIE', 'HNRNPA1', 'NOVA2', 'NOVA1', 'CELF6', 'ELAVL1',
       'TIA1', 'CELF4', 'HNRNPDL', 'RBFOX1'], dtype=object)

In [7]:
total_hits

Unnamed: 0,Gene_name,strand,N,mean_count,sd_count,z,pval,qval
1,HNRNPL,+,632,387.5538,20.243979,12.075008,7.157531e-34,7.085956e-32
2,FUS,+,140,61.4582,7.443073,10.552335,2.4774969999999998e-26,1.226361e-24
3,MBNL1,+,682,402.2202,29.912798,9.353181,4.252541e-21,1.403339e-19
4,SRSF1,+,335,221.235,15.190995,7.488976,3.470646e-14,8.589849e-13
5,RBMY1A1,+,107,55.879,7.316165,6.987404,1.400092e-12,2.772182e-11
6,ZFP36,+,609,469.4026,20.582304,6.782399,5.909821e-12,9.751205e-11
7,SRSF10,+,88,42.4038,7.098959,6.422941,6.683299e-11,9.452095e-10
8,PTBP1,+,3151,2828.7736,54.425648,5.920488,1.604938e-09,1.986111e-08
9,YBX2,+,51,24.172,4.91964,5.453245,2.472947e-08,2.720242e-07
10,SRSF3,+,74,38.8242,6.483896,5.425103,2.896061e-08,2.8671e-07


In [8]:
#The SARS-CoV-2 RNA–protein interactome in infected human cells, December 2020
#supplementary table 1 (Proteins detected by quantitative mass spectrometry in 
#SARS-CoV-2 RNA and RMRP RNA antisense purifications in infected human cells.)

st1 = LIT_DATA / "41564_2020_846_MOESM2_ESM-1.xlsx"
st1_pd = pd.read_excel(st1, index_col=0, skiprows=2)
#st1_pd[(st1_pd['adj.P.Val.SCoV2.over.RMRP'] < 0.05) & (st1_pd['logFC.SCoV2.over.RMRP'] > 0)]

In [9]:
total_hits[total_hits.Gene_name.isin(st1_pd.geneSymbol)].Gene_name.nunique()

9

In [10]:
conf_schmidt = total_hits[total_hits.Gene_name.isin(st1_pd.geneSymbol)]

## Comparing Ferrarini et al. Output to Schmidt et al.

Before performing this analysis, I placed the significantly enriched RBPs from Ferrarini et al.'s paper (listed on pages 10 and 11) in TSV files.

In [11]:
ferrarini_data = pd.read_csv(HACK_DATA / "enriched_rbp_list.tsv", sep="\t", index_col=0)

In [12]:
ferrarini_data[ferrarini_data.Gene_name.isin(st1_pd.geneSymbol)].Gene_name.nunique()

10

In [13]:
ferrarini_schmidt = ferrarini_data[ferrarini_data.Gene_name.isin(st1_pd.geneSymbol)]

In [14]:
set(ferrarini_schmidt.Gene_name) - set(conf_schmidt.Gene_name)

{'HNRNPA2B1'}

In [15]:
ferrarini_schmidt

Unnamed: 0,Gene_name,Binding_site_location
0,HNRNPL,Genome
3,SRSF1,Genome
7,PTBP1,Genome
8,SRSF3,Genome
10,PABPC1,Genome
13,PABPC4,Genome
18,YBX1,Genome
23,PABPC4,3'UTR
25,PABPC1,3'UTR
29,HNRNPA2B1,3'UTR


## Grabbing Protein Sequences from Schmidt et al. Data

In [16]:
relaxed_fdr = st1_pd[(st1_pd['adj.P.Val.SCoV2.over.RMRP'] < 0.2) & (st1_pd['logFC.SCoV2.over.RMRP'] > 0)]

This above criteria were used by Schmidt et al. to identify the 104 RBPs they say bind to the SARS-CoV-2 genome. `relaxed_fdr` contains 12 extra RBPs that come from the SARS-CoV-2 species instead of Homo Sapiens. The `accession_number` field can be used to obtain their sequences using Entrez.

In [17]:
relaxed_fdr

Unnamed: 0_level_0,adj.P.Val.SCoV2.over.RMRP,Log.P.Value.SCoV2.over.RMRP,logFC.SCoV2.over.RMRP,P.Value.SCoV2.over.RMRP,RMRP rep2 ratio,RMRP rep1 ratio,SCoV2 rep2 ratio,SCoV2 rep1 ratio,accession_number,geneSymbol,numSpectraProteinObserved,numPepsUnique,numPepsUniqueSubgroupSpecificCI,accession_numbers,species,percentCoverage,groupNum,subgroupNum,scoreUnique,scoreUniqueSubgroupSpecificCI,totalIntensity,entry_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
NP_001120665.1,0.000111,60.516031,3.939615,8.879672e-07,,-1.783388,2.424939,1.887515,NP_001120665.1,CNBP,1,9,1,NP_001120665.1|NP_001120667.1,HOMO SAPIENS,52.2,79,79.2,123.53,14.78,1330000,cellular nucleic acid-binding protein isoform...
YP_009742613.1,0.000290,54.270089,3.407615,3.741029e-06,,-0.205388,3.379939,3.024515,YP_009742613.1,nsp6,1,1,1,YP_009742613.1,SARSCOV2_NC_045512.2,2.4,613,613.1,14.80,14.80,17500000,mat_peptide:nsp6:ORF1a_polyprotein:ORF1ab:GN=...
NP_055935.4,0.136832,13.459390,3.356758,4.508800e-02,-2.980673,-0.552388,1.665939,1.514515,NP_055935.4,ALMS1,2,2,2,NP_055935.4,HOMO SAPIENS,0.4,538,538.1,17.67,17.67,101000000,Alstrom syndrome protein 1 GN=ALMS1
NP_001129125.1,0.002919,40.563732,2.976258,8.782675e-05,-1.431673,-0.306388,2.542939,1.671515,NP_001129125.1,PABPC4,13,16,8,NP_001129125.1|NP_001129126.1|NP_003810.1|NP_0...,HOMO SAPIENS,24.8,18,18.2,256.13,128.11,1810000000,polyadenylate-binding protein 4 isoform 1 GN=...
YP_009725310.1,0.002303,41.804432,2.620615,6.600195e-05,,-0.079388,2.759939,2.322515,YP_009725310.1,nsp15,1,1,1,YP_009725310.1,SARSCOV2_NC_045512.2,2.6,617,617.1,14.66,14.66,3760000,mat_peptide:endoRNAse:ORF1b_polyprotein:ORF1a...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NP_001005335.1,0.156721,12.721526,0.757758,5.343766e-02,-0.386673,-0.358388,0.475939,0.294515,NP_001005335.1,HNRNPL,4,2,2,NP_001005335.1|NP_001524.2,HOMO SAPIENS,4.3,336,336.1,30.65,30.65,93200000,heterogeneous nuclear ribonucleoprotein L iso...
NP_001092104.1,0.179775,11.964295,0.741258,6.361660e-02,-0.056673,-0.082388,0.906939,0.436515,NP_001092104.1,RBM47,17,8,6,NP_001092104.1|NP_061900.2,HOMO SAPIENS,16.3,44,44.2,121.76,91.94,625000000,RNA-binding protein 47 isoform a GN=RBM47
NP_001129171.1,0.183259,11.803415,0.740758,6.601741e-02,0.167327,-0.204388,0.922939,0.521515,NP_001129171.1,YWHAZ,3,3,2,NP_001129171.1,HOMO SAPIENS,12.6,186,186.3,43.35,28.63,153000000,14-3-3 protein zeta/delta GN=YWHAZ
NP_000166.2,0.181670,11.901205,0.727758,6.454751e-02,0.094327,0.062612,0.967939,0.644515,NP_000166.2,GPI,2,2,2,NP_000166.2|NP_001171651.1|NP_001276718.1|NP_0...,HOMO SAPIENS,3.4,365,365.1,28.14,28.14,113000000,glucose-6-phosphate isomerase isoform 2 GN=GPI


### Positive Examples

In [18]:
relaxed_fdr.accession_number.unique()

array(['NP_001120665.1', 'YP_009742613.1', 'NP_055935.4',
       'NP_001129125.1', 'YP_009725310.1', 'NP_001138898.1',
       'YP_009724397.2', 'NP_003746.2', 'YP_009724390.1',
       'NP_001166299.1', 'NP_001164274.1', 'NP_005889.3',
       'YP_009724391.1', 'NP_003747.1', 'NP_001004317.1',
       'NP_001269581.1', 'NP_004550.2', 'NP_001018494.1',
       'NP_001185747.1', 'NP_001071105.1', 'NP_150093.1', 'NP_055121.1',
       'NP_036429.2', 'NP_001153146.1', 'YP_009725307.1', 'NP_002559.2',
       'YP_009725311.1', 'YP_009724393.1', 'NP_055205.2',
       'NP_001284478.1', 'NP_001287750.1', 'YP_009742616.1',
       'NP_001230648.1', 'NP_001231903.1', 'NP_001007554.1',
       'NP_001123551.1', 'NP_000961.2', 'NP_071496.1', 'NP_002943.2',
       'NP_002889.1', 'NP_001005.1', 'NP_001180345.1', 'NP_057190.2',
       'NP_001020241.1', 'NP_001005386.1', 'NP_005498.1', 'NP_001559.1',
       'NP_009109.3', 'NP_001107565.1', 'NP_002127.1', 'NP_001295206.1',
       'NP_001276978.1', 'NP_00100722

The following is a list of RBPs from [Sundararaman et al.](https://www.sciencedirect.com/science/article/pii/S1097276516000964?via%3Dihub).

In [19]:
rbp_list_file = LIT_DATA / "1-s2.0-S1097276516000964-mmc2.xlsx"
rbp_list = pd.read_excel(rbp_list_file, sheet_name="Sheet2_1072_RBP_compilation")
rbp_list.columns = ["ensembl_ID", "gene_symbol", "gene_description"]
rbp_list

Unnamed: 0,ensembl_ID,gene_symbol,gene_description
0,ENSG00000090861,AARS,alanyl-tRNA synthetase
1,ENSG00000275700,AATF,apoptosis antagonizing transcription factor
2,ENSG00000204574,ABCF1,"ATP-binding cassette, sub-family F (GCN20), me..."
3,ENSG00000146109,ABT1,activator of basal transcription 1
4,ENSG00000167315,ACAA2,acetyl-CoA acyltransferase 2
...,...,...,...
1067,ENSG00000124201,ZNFX1,"zinc finger, NFX1-type containing 1"
1068,ENSG00000132485,ZRANB2,"zinc finger, RAN-binding domain containing 2"
1069,ENSG00000212643,ZRSR1,"zinc finger (CCCH type), RNA-binding motif and..."
1070,ENSG00000169249,ZRSR2,"zinc finger (CCCH type), RNA-binding motif and..."


I subset for those not identified by Schmidt et al. I'll randomly select 118 of these to be treated as the negative set.

In [20]:
negative_total = rbp_list[~rbp_list.gene_symbol.isin(st1_pd.geneSymbol)]
negative_total

Unnamed: 0,ensembl_ID,gene_symbol,gene_description
0,ENSG00000090861,AARS,alanyl-tRNA synthetase
1,ENSG00000275700,AATF,apoptosis antagonizing transcription factor
2,ENSG00000204574,ABCF1,"ATP-binding cassette, sub-family F (GCN20), me..."
3,ENSG00000146109,ABT1,activator of basal transcription 1
7,ENSG00000164113,ADAD1,adenosine deaminase domain containing 1 (testi...
...,...,...,...
1067,ENSG00000124201,ZNFX1,"zinc finger, NFX1-type containing 1"
1068,ENSG00000132485,ZRANB2,"zinc finger, RAN-binding domain containing 2"
1069,ENSG00000212643,ZRSR1,"zinc finger (CCCH type), RNA-binding motif and..."
1070,ENSG00000169249,ZRSR2,"zinc finger (CCCH type), RNA-binding motif and..."


In [21]:
sampled = negative_total.sample(n=118)
sampled

Unnamed: 0,ensembl_ID,gene_symbol,gene_description
475,ENSG00000086504,MRPL28,mitochondrial ribosomal protein L28
705,ENSG00000265241,RBM8A,RNA binding motif protein 8A
531,ENSG00000130935,NOL11,nucleolar protein 11
685,ENSG00000119707,RBM25,RNA binding motif protein 25
50,ENSG00000224470,ATXN1L,ataxin 1-like
...,...,...,...
466,ENSG00000133606,MKRN1,makorin ring finger protein 1
75,ENSG00000162642,C1orf52,chromosome 1 open reading frame 52
708,ENSG00000144642,RBMS3,"RNA binding motif, single stranded interacting..."
822,ENSG00000156304,SCAF4,SR-related CTD-associated factor 4


Need accession numbers for these...

In [22]:
list(sampled.ensembl_ID)

['ENSG00000086504',
 'ENSG00000265241',
 'ENSG00000130935',
 'ENSG00000119707',
 'ENSG00000224470',
 'ENSG00000113460',
 'ENSG00000033030',
 'ENSG00000104413',
 'ENSG00000055483',
 'ENSG00000130985',
 'ENSG00000080802',
 'ENSG00000114416',
 'ENSG00000151923',
 'ENSG00000055044',
 'ENSG00000149016',
 'ENSG00000100764',
 'ENSG00000105793',
 'ENSG00000180098',
 'ENSG00000118197',
 'ENSG00000131503',
 'ENSG00000198242',
 'ENSG00000132773',
 'ENSG00000164548',
 'ENSG00000148840',
 'ENSG00000205323',
 'ENSG00000119285',
 'ENSG00000163239',
 'ENSG00000139675',
 'ENSG00000170748',
 'ENSG00000129317',
 'ENSG00000075292',
 'ENSG00000118246',
 'ENSG00000197181',
 'ENSG00000106638',
 'ENSG00000113742',
 'ENSG00000137876',
 'ENSG00000134987',
 'ENSG00000166197',
 'ENSG00000126749',
 'ENSG00000132603',
 'ENSG00000168003',
 'ENSG00000086758',
 'ENSG00000183684',
 'ENSG00000138709',
 'ENSG00000003756',
 'ENSG00000134627',
 'ENSG00000188613',
 'ENSG00000149658',
 'ENSG00000171056',
 'ENSG00000108298',


In [25]:
# This code came from UniProt docs
import urllib.parse
import urllib.request

url = 'https://www.uniprot.org/uploadlists/'

params = {
'from': 'ENSEMBL_ID',
'to': 'SWISSPROT',
'format': 'tab',
'query': " ".join(list(sampled.ensembl_ID))
}

data = urllib.parse.urlencode(params)
data = data.encode('utf-8')
req = urllib.request.Request(url, data)
with urllib.request.urlopen(req) as f:
   response = f.read()
response = response.decode('utf-8')
print(response)

From	To
ENSG00000086504	Q13084
ENSG00000265241	Q9Y5S9
ENSG00000130935	Q9H8H0
ENSG00000119707	P49756
ENSG00000224470	P0C7T5
ENSG00000113460	Q8TDN6
ENSG00000033030	Q6NZY4
ENSG00000104413	Q6NXG1
ENSG00000055483	Q9P275
ENSG00000130985	P22314
ENSG00000080802	O95628
ENSG00000114416	P51114
ENSG00000151923	Q01085
ENSG00000055044	Q9Y2X3
ENSG00000149016	Q9H6E5
ENSG00000100764	P62191
ENSG00000105793	A4D1E9
ENSG00000180098	Q9NX07
ENSG00000118197	Q5T1V6
ENSG00000131503	Q8IWZ3
ENSG00000198242	P62750
ENSG00000132773	Q96GM8
ENSG00000164548	Q13595
ENSG00000148840	Q5VV67
ENSG00000205323	P82979
ENSG00000119285	Q9H583
ENSG00000163239	Q5VZ19
ENSG00000139675	Q32P51
ENSG00000170748	O75526
ENSG00000129317	Q9H0K6
ENSG00000075292	Q14966
ENSG00000118246	Q9NYY8
ENSG00000197181	Q8TC59
ENSG00000106638	Q9Y4P3
ENSG00000113742	Q17RY0
ENSG00000137876	Q9UHA3
ENSG00000134987	Q8NI36
ENSG00000166197	Q14978
ENSG00000126749	Q92979
ENSG00000132603	Q9Y221
ENSG00000168003	P08195
ENSG00000086758	Q7Z6Z7
ENSG00000138709	Q659C4
ENS

In [26]:
swiss_prot_ids = []
for line in response.strip().split("\n")[1:]:
    #print(line)
    sp_id = line.split("\t")[1]
    swiss_prot_ids.append(sp_id)
swiss_prot_ids

['Q13084',
 'Q9Y5S9',
 'Q9H8H0',
 'P49756',
 'P0C7T5',
 'Q8TDN6',
 'Q6NZY4',
 'Q6NXG1',
 'Q9P275',
 'P22314',
 'O95628',
 'P51114',
 'Q01085',
 'Q9Y2X3',
 'Q9H6E5',
 'P62191',
 'A4D1E9',
 'Q9NX07',
 'Q5T1V6',
 'Q8IWZ3',
 'P62750',
 'Q96GM8',
 'Q13595',
 'Q5VV67',
 'P82979',
 'Q9H583',
 'Q5VZ19',
 'Q32P51',
 'O75526',
 'Q9H0K6',
 'Q14966',
 'Q9NYY8',
 'Q8TC59',
 'Q9Y4P3',
 'Q17RY0',
 'Q9UHA3',
 'Q8NI36',
 'Q14978',
 'Q92979',
 'Q9Y221',
 'P08195',
 'Q7Z6Z7',
 'Q659C4',
 'P52756',
 'Q7Z3Z4',
 'Q8WY41',
 'Q9BYJ9',
 'Q9BT81',
 'P84098',
 'Q5U5Q3',
 'O75607',
 'O60869',
 'Q15477',
 'Q9HAV4',
 'Q96KR1',
 'Q7Z478',
 'Q8NDT2',
 'Q15428',
 'Q96M93',
 'P18124',
 'Q9NX74',
 'P62979',
 'P05388',
 'Q8N5A5',
 'Q9UBU9',
 'Q1ED39',
 'Q99848',
 'Q5VWX1',
 'Q86TM3',
 'P23246',
 'Q587J7',
 'Q9NQC3',
 'Q9BRU9',
 'O43399',
 'Q9NPE3',
 'Q15696',
 'Q9NV31',
 'Q9Y324',
 'P42285',
 'O76021',
 'O75525',
 'Q9BQ75',
 'Q8WVV9',
 'Q9H8H2',
 'Q14684',
 'Q9Y3U8',
 'Q13148',
 'Q8NBP7',
 'O60812',
 'O75475',
 'Q96T37',

In [27]:
import os
import sys

from urllib.request import urlretrieve

import Bio
from Bio import SeqIO, SearchIO, Entrez
from Bio.Seq import Seq
from Bio.SeqUtils import GC
from Bio.Blast import NCBIWWW
from Bio.Data import CodonTable

In [28]:
from Bio import Entrez
Entrez.email = "sg4040@columbia.edu"

In [29]:
#retrieve S protein sequences 
handle = Entrez.efetch(db="protein", id=list(relaxed_fdr.accession_number), rettype="fasta")

#save sequences to fasta file
fasta_file = "positive_proteins.fasta"
with open(fasta_file, "w") as fo:
    fo.write(handle.read())

#check that everything worked
!cat positive_proteins.fasta

>NP_001120665.1 CCHC-type zinc finger nucleic acid binding protein isoform 2 [Homo sapiens]
MSSNECFKCGRSGHWARECPTGGGRGRGMRSRGRGGFTSDRGFQFVSSSLPDICYRCGESGHLAKDCDLQ
EDEACYNCGRGGHIAKDCKEPKREREQCCYNCGKPGHLARDCDHADEQKCYSCGEFGHIQKDCTKVKCYR
CGETGHVAINCSKTSEVNCYRCGESGHLARECTIEATA

>YP_009742613.1 nsp6 [Severe acute respiratory syndrome coronavirus 2]
SAVKRTIKGTHHWLLLTILTSLLVLVQSTQWSLFFFLYENAFLPFAMGIIAMSAFAMMFVKHKHAFLCLF
LLPSLATVAYFNMVYMPASWVMRIMTWLDMVDTSLSGFKLKDCVMYASAVVLLILMTARTVYDDGARRVW
TLMNVLTLVYKVYYGNALDQAISMWALIISVTSNYSGVVTTVMFLARGIVFMCVEYCPIFFITGNTLQCI
MLVYCFLGYFCTCYFGLFCLLNRYFRLTLGVYDYLVSTQEFRYMNSQGLLPPKNSIDAFKLNIKLLGVGG
KPCIKVATVQ

>NP_055935.4 Alstrom syndrome protein 1 isoform 1 [Homo sapiens]
MEPEDLPWPGELEEEEEEEEEEEEEEEEEAAAAAAANVDDVVVVEEVEEEAGRELDSDSHYGPQHLESID
DEEDEEAKAWLQAHPGRILPPLSPPQHRYSEGERTSLEKIVPLTCHVWQQIVYQGNSRTQISDTNVVCLE
TTAQRGSGDDQKTESWHCLPQEMDSSQTLDTSQTRFNVRTEDTEVTDFPSLEEGILTQSENQVKEPNRDL
FCSPLLVIQDSFASPDLPLLTCLTQDQEFAPDSLFHQSELSFAPLRGIPDKSEDTEWSSRPSEVSEALFQ
ATAEVASDLA

In [55]:
handle = Entrez.efetch(db="protein", id=swiss_prot_ids, rettype="fasta")

#save sequences to fasta file
fasta_file = "negative_proteins.fasta"
with open(fasta_file, "w") as fo:
    fo.write(handle.read())

#check that everything worked
!cat negative_proteins.fasta

>sp|Q13084.4|RM28_HUMAN RecName: Full=39S ribosomal protein L28, mitochondrial; Short=L28mt; Short=MRP-L28; AltName: Full=Melanoma antigen p15; AltName: Full=Melanoma-associated antigen recognized by T-lymphocytes; AltName: Full=Mitochondrial large ribosomal subunit protein bL28m; Flags: Precursor
MPLHKYPVWLWKRLQLREGICSRLPGHYLRSLEEERTPTPVHYRPHGAKFKINPKNGQRERVEDVPIPIY
FPPESQRGLWGGEGWILGQIYANNDKLSKRLKKVWKPQLFEREFYSEILDKKFTVTVTMRTLDLIDEAYG
LDFYILKTPKEDLCSKFGMDLKRGMLLRLARQDPQLHPEDPERRAAIYDKYKEFAIPEEEAEWVGLTLEE
AIEKQRLLEEKDPVPLFKIYVAELIQQLQQQALSEPAVVQKRASGQ

>sp|Q9Y5S9.1|RBM8A_HUMAN RecName: Full=RNA-binding protein 8A; AltName: Full=Binder of OVCA1-1; Short=BOV-1; AltName: Full=RNA-binding motif protein 8A; AltName: Full=RNA-binding protein Y14; AltName: Full=Ribonucleoprotein RBM8A
MADVLDLHEAGGEDFAMDEDGDESIHKLKEKAKKRKGRGFGSEEGSRARMREDYDSVEQDGDEPGPQRSV
EGWILFVTGVHEEATEEDIHDKFAEYGEIKNIHLNLDRRTGYLKGYTLVEYETYKEAQAAMEGLNGQDLM
GQPISVDWCFVRGPPKGKRRGGRRRSRSPDRRRR

>sp|Q9H8H0.1|NOL11_HUMAN RecName

In [56]:
# Analysis of the fasta file
from Bio import SeqIO
record = SeqIO.parse(open(fasta_file, 'r'),"fasta")

updated_seqs = []
for seq in record:
    seq_descr = seq.description
    full_name_section = seq_descr.split(";")[0]
    full_name = full_name_section.split("Full=")[1]
    seq.description = full_name
    print("ID: {}, Length: {}, Description: {}".format(seq.id, len(seq), seq.description))
    updated_seqs.append(seq)

ID: sp|Q13084.4|RM28_HUMAN, Length: 256, Description: 39S ribosomal protein L28, mitochondrial
ID: sp|Q9Y5S9.1|RBM8A_HUMAN, Length: 174, Description: RNA-binding protein 8A
ID: sp|Q9H8H0.1|NOL11_HUMAN, Length: 719, Description: Nucleolar protein 11
ID: sp|P49756.3|RBM25_HUMAN, Length: 843, Description: RNA-binding protein 25
ID: sp|P0C7T5.1|ATX1L_HUMAN, Length: 689, Description: Ataxin-1-like
ID: sp|Q8TDN6.2|BRX1_HUMAN, Length: 353, Description: Ribosome biogenesis protein BRX1 homolog
ID: sp|Q6NZY4.2|ZCHC8_HUMAN, Length: 707, Description: Zinc finger CCHC domain-containing protein 8
ID: sp|Q6NXG1.2|ESRP1_HUMAN, Length: 681, Description: Epithelial splicing regulatory protein 1
ID: sp|Q9P275.4|UBP36_HUMAN, Length: 1123, Description: Ubiquitin carboxyl-terminal hydrolase 36
ID: sp|P22314.3|UBA1_HUMAN, Length: 1058, Description: Ubiquitin-like modifier-activating enzyme 1
ID: sp|O95628.3|CNOT4_HUMAN, Length: 575, Description: CCR4-NOT transcription complex subunit 4
ID: sp|P51114.3|FXR1_

In [58]:
with open("negative_proteins.fasta", "w") as output_handle:
    SeqIO.write(updated_seqs, output_handle, "fasta")

## Next Steps
* Figuring out how to train a [transformer model](https://github.com/facebookresearch/esm) or [protein-based BERT model](https://github.com/nadavbra/protein_bert).
* But first, save the FASTA sequences to files!
* It is best not to run these lines again since they'll change the analysis. Consider this the final selection of negative training data for the model.

In [59]:
#!cp negative_proteins.fasta /content/drive/MyDrive/ecbm-e4060-covid-interactions/DATA/ML_Extension/negative_proteins.fasta

In [33]:
#!cp positive_proteins.fasta /content/drive/MyDrive/ecbm-e4060-covid-interactions/DATA/ML_Extension/positive_proteins.fasta