# Reformat the STR, IDR and ELM annotation files so they can be intersected with bedtools

In [1]:
import numpy as np
import pandas as pd

## ELM data reformatting

In [2]:
# Read the raw human ELM instance table (tab-separated). The first five lines
# of the file are skipped so that parsing starts at the header row.
df_elm = pd.read_csv(
    "../data/raw/elm_instances_homo_sapiens.tsv",
    skiprows=5,
    sep="\t",
)
df_elm

Unnamed: 0,Accession,ELMType,ELMIdentifier,ProteinName,Primary_Acc,Accessions,Start,End,References,Methods,InstanceLogic,PDB,Organism
0,ELMI002256,CLV,CLV_C14_Caspase3-7,ATN1_HUMAN,P54259,P54259 Q99495 Q99621 Q9UEK7,103,107,10085113 9535906,cleavage reaction; mutation analysis; western ...,true positive,,Homo sapiens
1,ELMI001933,CLV,CLV_C14_Caspase3-7,ATN1_HUMAN,P54259,P54259 Q99495 Q99621 Q9UEK7,106,110,10085113 9535906,cleavage reaction; mutation analysis; western ...,true positive,,Homo sapiens
2,ELMI001928,CLV,CLV_C14_Caspase3-7,CADH1_HUMAN,P12830,P12830 Q13799 Q14216 Q15855 Q16194 Q4PJ14,747,751,11076937,cleavage reaction; confocal microscopy; edman ...,true positive,,Homo sapiens
3,ELMI001931,CLV,CLV_C14_Caspase3-7,CASP6_HUMAN,P55212,P55212 Q9BQE7,20,24,8900201,cleavage reaction; mutation analysis; western ...,true positive,,Homo sapiens
4,ELMI001929,CLV,CLV_C14_Caspase3-7,CASP6_HUMAN,P55212,P55212 Q9BQE7,176,180,8900201,cleavage reaction; mutation analysis; western ...,true positive,,Homo sapiens
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2181,ELMI001349,TRG,TRG_NLS_MonoExtN_4,SPAST_HUMAN,Q9UBP0,Q9UBP0 A7E2A7 Q9UPR9,7,13,15147984,colocalization; mutation analysis,true positive,,Homo sapiens
2182,ELMI002587,TRG,TRG_NLS_MonoExtN_4,UNG_HUMAN,P13051,P13051 A8K5M6 B2R8Y1 O00637 O00719 Q93028,15,21,9753728,confocal microscopy; mutation analysis,true positive,,Homo sapiens
2183,ELMI000008,TRG,TRG_PTS1,AMACR_HUMAN,Q9UHK6,Q9UHK6 A5YM47 B8Y916 B8Y918 O43673 Q96GH1 Q9Y3Q1,379,382,11060344,mutation analysis,true positive,,Homo sapiens
2184,ELMI000007,TRG,TRG_PTS1,CATA_HUMAN,P04040,P04040 A8K6C0 B2RCZ9 D3DR07 Q2M1U4 Q4VXX5 Q9BW...,524,527,8769411,mutation analysis; two hybrid,true positive,,Homo sapiens


In [3]:
# Keep only the ELM instances whose InstanceLogic is exactly "true positive".
df_elm_filt = df_elm.loc[lambda elm: elm["InstanceLogic"].eq("true positive")]
df_elm_filt

Unnamed: 0,Accession,ELMType,ELMIdentifier,ProteinName,Primary_Acc,Accessions,Start,End,References,Methods,InstanceLogic,PDB,Organism
0,ELMI002256,CLV,CLV_C14_Caspase3-7,ATN1_HUMAN,P54259,P54259 Q99495 Q99621 Q9UEK7,103,107,10085113 9535906,cleavage reaction; mutation analysis; western ...,true positive,,Homo sapiens
1,ELMI001933,CLV,CLV_C14_Caspase3-7,ATN1_HUMAN,P54259,P54259 Q99495 Q99621 Q9UEK7,106,110,10085113 9535906,cleavage reaction; mutation analysis; western ...,true positive,,Homo sapiens
2,ELMI001928,CLV,CLV_C14_Caspase3-7,CADH1_HUMAN,P12830,P12830 Q13799 Q14216 Q15855 Q16194 Q4PJ14,747,751,11076937,cleavage reaction; confocal microscopy; edman ...,true positive,,Homo sapiens
3,ELMI001931,CLV,CLV_C14_Caspase3-7,CASP6_HUMAN,P55212,P55212 Q9BQE7,20,24,8900201,cleavage reaction; mutation analysis; western ...,true positive,,Homo sapiens
4,ELMI001929,CLV,CLV_C14_Caspase3-7,CASP6_HUMAN,P55212,P55212 Q9BQE7,176,180,8900201,cleavage reaction; mutation analysis; western ...,true positive,,Homo sapiens
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2181,ELMI001349,TRG,TRG_NLS_MonoExtN_4,SPAST_HUMAN,Q9UBP0,Q9UBP0 A7E2A7 Q9UPR9,7,13,15147984,colocalization; mutation analysis,true positive,,Homo sapiens
2182,ELMI002587,TRG,TRG_NLS_MonoExtN_4,UNG_HUMAN,P13051,P13051 A8K5M6 B2R8Y1 O00637 O00719 Q93028,15,21,9753728,confocal microscopy; mutation analysis,true positive,,Homo sapiens
2183,ELMI000008,TRG,TRG_PTS1,AMACR_HUMAN,Q9UHK6,Q9UHK6 A5YM47 B8Y916 B8Y918 O43673 Q96GH1 Q9Y3Q1,379,382,11060344,mutation analysis,true positive,,Homo sapiens
2184,ELMI000007,TRG,TRG_PTS1,CATA_HUMAN,P04040,P04040 A8K6C0 B2RCZ9 D3DR07 Q2M1U4 Q4VXX5 Q9BW...,524,527,8769411,mutation analysis; two hybrid,true positive,,Homo sapiens


In [4]:
# Display the column names of the filtered ELM table.
df_elm_filt.columns

Index(['Accession', 'ELMType', 'ELMIdentifier', 'ProteinName', 'Primary_Acc',
       'Accessions', 'Start', 'End', 'References', 'Methods', 'InstanceLogic',
       'PDB', 'Organism'],
      dtype='object')

In [5]:
# Reshape the ELM table into interval form: protein accession, start and end
# come first, followed by the ELM annotation columns. Rows are ordered by
# accession and start position and the index is renumbered from zero.
df_elm_filt = (
    df_elm_filt
    .rename(columns={"Primary_Acc": "uniprot", "Start": "start", "End": "end"})
    .loc[:, ["uniprot", "start", "end", "Accession", "ELMType", "ELMIdentifier"]]
    .sort_values(["uniprot", "start"])
    .reset_index(drop=True)
)
df_elm_filt

Unnamed: 0,uniprot,start,end,Accession,ELMType,ELMIdentifier
0,A2VCK8,3,21,ELMI001654,LIG,LIG_Actin_WH2_2
1,A5D8V6,202,209,ELMI002658,LIG,LIG_EF_ALG2_ABM_1
2,A6NGB9,420,430,ELMI002697,LIG,LIG_WH1
3,A9UF02,115,122,ELMI002050,DOC,DOC_ANK_TNKS_1
4,B9EIS4,18,24,ELMI001419,MOD,MOD_PKA_2
...,...,...,...,...,...,...
2060,Q9Y6V0,4269,4275,ELMI003084,DOC,DOC_MAPK_JIP1_4
2061,Q9Y6W5,437,453,ELMI001661,LIG,LIG_Actin_WH2_1
2062,Q9Y6W6,19,25,ELMI003078,DOC,DOC_MAPK_JIP1_4
2063,Q9Y6X2,446,451,ELMI002800,LIG,LIG_SUMO_SIM_par_1


In [6]:
# Write the reformatted ELM intervals as a tab-separated file without the index.
df_elm_filt.to_csv(
    "../data/reformat/elm_instances_homo_sapiens_reformat.tsv",
    index=False,
    sep="\t",
)

## STR data reformatting

In [7]:
# Genomic STR -> protein mapping table. low_memory=False makes pandas parse
# the file in a single pass instead of in internal chunks.
df_strs = pd.read_csv(
    "../data/raw/edb79_gen_to_prot_mapping.tsv",
    low_memory=False,
    sep="\t",
)

# Lookup table from protein ID to UniProt entry; its "From" / "Entry" columns
# are renamed to "protein_id" / "uniprot" so it can be joined on "protein_id".
df_ensp_uniprot_mapping = (
    pd.read_csv("../data/uniprot_mapping/ensp_uniprot_mapping.tsv", sep="\t")
    .rename(columns={"From": "protein_id", "Entry": "uniprot"})
)

df_strs

Unnamed: 0,seq_name,seq_start,seq_end,db_id,gen_msa,tx_id,exon_id,exon_rank,tx_start,tx_end,...,cds_width,protein_start,protein_end,protein_width,protein_id,gene_id,gene_name,seq_strand,protein_seq,prot_rel_msa
0,1,69457,69471,1099235.0,"CTACA,CTACA,CTACA",ENST00000335137,ENSE00002319515,1.0,367.0,381.0,...,15.0,123.0,127.0,5.0,ENSP00000334393,ENSG00000186092,OR4F5,+,LHYTT,"CTA,CAC,TAC,ACT,ACA"
1,1,187143,187154,1099249.0,"CTG,CTG,CTG,CTG",ENST00000623083,ENSE00003759679,8.0,1014.0,1025.0,...,12.0,338.0,342.0,5.0,ENSP00000485442,ENSG00000279457,FO538757.2,-,DSSSS,"-CT,GCT,GCT,GCT,G--"
2,1,187143,187154,1099249.0,"CTG,CTG,CTG,CTG",ENST00000624735,ENSE00003755532,11.0,1277.0,1288.0,...,12.0,347.0,351.0,5.0,ENSP00000485351,ENSG00000279457,FO538757.2,-,DSSSS,"-CT,GCT,GCT,GCT,G--"
3,1,187770,187777,1099253.0,"CT,CT,CT,CT",ENST00000623083,ENSE00003757255,6.0,656.0,663.0,...,8.0,219.0,221.0,3.0,ENSP00000485442,ENSG00000279457,FO538757.2,-,KRE,"CTC,TCT,CT-"
4,1,187770,187777,1099253.0,"CT,CT,CT,CT",ENST00000623834,ENSE00003757255,7.0,653.0,660.0,...,8.0,218.0,220.0,3.0,ENSP00000485222,ENSG00000279457,FO538757.2,-,KRE,"CTC,TCT,CT-"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34490,Y,24842482,24842496,1239967.0,"TCC,TCC,TCC,TCC,ACC",ENST00000449750,ENSE00001709278,6.0,703.0,717.0,...,15.0,132.0,137.0,6.0,ENSP00000389300,ENSG00000205916,DAZ4,+,NPPPPP,"--T,CCT,CCT,CCT,CCA,CC-"
34491,Y,24853330,24853344,1239968.0,"TCC,TCC,TCC,TCC,ACC",ENST00000382296,ENSE00001609022,11.0,891.0,905.0,...,15.0,297.0,302.0,6.0,ENSP00000371733,ENSG00000205916,DAZ4,+,NPPPPP,"--T,CCT,CCT,CCT,CCA,CC-"
34492,Y,24853330,24853344,1239968.0,"TCC,TCC,TCC,TCC,ACC",ENST00000382314,ENSE00001609022,11.0,1101.0,1115.0,...,15.0,297.0,302.0,6.0,ENSP00000371751,ENSG00000205916,DAZ4,+,NPPPPP,"--T,CCT,CCT,CCT,CCA,CC-"
34493,Y,25623970,25623977,1240064.0,"GA,GA,GA,GA",ENST00000306609,ENSE00001854309,1.0,1809.0,1816.0,...,8.0,510.0,512.0,3.0,ENSP00000302968,ENSG00000172288,CDY1,+,ERE,"GAG,AGA,GA-"


In [8]:
# Restrict the STR table to the protein-level coordinates, the identifiers and
# the two repeat alignments that are carried through to the output file.
df_strs = df_strs.loc[
    :,
    ["protein_id", "gene_id", "protein_start", "protein_end", "gen_msa", "prot_rel_msa"],
]
df_strs


Unnamed: 0,protein_id,gene_id,protein_start,protein_end,gen_msa,prot_rel_msa
0,ENSP00000334393,ENSG00000186092,123.0,127.0,"CTACA,CTACA,CTACA","CTA,CAC,TAC,ACT,ACA"
1,ENSP00000485442,ENSG00000279457,338.0,342.0,"CTG,CTG,CTG,CTG","-CT,GCT,GCT,GCT,G--"
2,ENSP00000485351,ENSG00000279457,347.0,351.0,"CTG,CTG,CTG,CTG","-CT,GCT,GCT,GCT,G--"
3,ENSP00000485442,ENSG00000279457,219.0,221.0,"CT,CT,CT,CT","CTC,TCT,CT-"
4,ENSP00000485222,ENSG00000279457,218.0,220.0,"CT,CT,CT,CT","CTC,TCT,CT-"
...,...,...,...,...,...,...
34490,ENSP00000389300,ENSG00000205916,132.0,137.0,"TCC,TCC,TCC,TCC,ACC","--T,CCT,CCT,CCT,CCA,CC-"
34491,ENSP00000371733,ENSG00000205916,297.0,302.0,"TCC,TCC,TCC,TCC,ACC","--T,CCT,CCT,CCT,CCA,CC-"
34492,ENSP00000371751,ENSG00000205916,297.0,302.0,"TCC,TCC,TCC,TCC,ACC","--T,CCT,CCT,CCT,CCA,CC-"
34493,ENSP00000302968,ENSG00000172288,510.0,512.0,"GA,GA,GA,GA","GAG,AGA,GA-"


In [9]:
# Attach the UniProt accession to every STR via an inner join on "protein_id"
# (STRs without a mapping entry are dropped), put the interval columns first,
# rename them to start/end, order rows by accession and start position, and
# cast the coordinates from float to int.
# NOTE(review): a protein_id that occurs more than once in the mapping table
# would duplicate STR rows here - confirm the mapping is one-to-one.
df_strs = (
    pd.merge(df_strs, df_ensp_uniprot_mapping, on="protein_id", how="inner")
    .loc[:, ["uniprot", "protein_start", "protein_end", "protein_id", "gene_id", "gen_msa", "prot_rel_msa"]]
    .rename(columns={"protein_start": "start", "protein_end": "end"})
    .sort_values(["uniprot", "start"])
    .astype({"start": int, "end": int})
    .reset_index(drop=True)
)

df_strs


Unnamed: 0,uniprot,start,end,protein_id,gene_id,gen_msa,prot_rel_msa
0,A0A087WVF3,277,279,ENSP00000478426,ENSG00000274419,"TC,TC,TC,TC","-TC,TCT,CTC"
1,A0A087WXS9,277,279,ENSP00000481258,ENSG00000274933,"GA,GA,GA,GA","GAG,AGA,GA-"
2,A0A087X179,277,279,ENSP00000483965,ENSG00000278599,"TC,TC,TC,TC","-TC,TCT,CTC"
3,A0A087X1G2,277,279,ENSP00000484181,ENSG00000273513,"TC,TC,TC,TC","-TC,TCT,CTC"
4,A0A096LP49,29,32,ENSP00000456543,ENSG00000260220,"AC,AC,AC,AC,AC","--A,CAC,ACA,CAC"
...,...,...,...,...,...,...,...
24603,Q9Y6Y8,757,760,ENSP00000358071,ENSG00000107651,"TG,TG,TG,TG,TG","--T,GTG,TGT,GTG"
24604,Q9Y6Y8,862,865,ENSP00000358071,ENSG00000107651,"TC,TC,TC,TC","--T,CTC,TCT,C--"
24605,Q9Y6Y8,910,914,ENSP00000358071,ENSG00000107651,"AAG,AAG,AAG,AAG","-AA,GAA,GAA,GAA,G--"
24606,Q9Y6Z7,40,43,ENSP00000332723,ENSG00000184374,"CA,CA,CA,CA,CA","--C,ACA,CAC,ACA"


In [10]:
# Write the reformatted STR intervals as a tab-separated file without the index.
df_strs.to_csv(
    "../data/reformat/edb79_gen_to_prot_mapping_reformat.tsv",
    index=False,
    sep="\t",
)

## Protein STR -> genome mapping reformatting

In [2]:
# Read the protein STR -> genome mapping table (tab-separated), which also
# carries the h_div_prot / h_div_gen divergence columns.
df_prot_strs = pd.read_csv(
    "../data/raw/edb79_prot_to_gen_mapping_with_div.tsv",
    sep="\t",
)
df_prot_strs

Unnamed: 0,uniprotid,protein_length,protein_msa,repeat_region_length,protein_start,protein_end,str_id,str_seq,gene_id,tx_ids,protein_ids,seq_strand,seq_name,gen_starts,gen_ends,gen_seq,prot_unit_size,h_div_prot,h_div_gen
0,A0A075B6H5,130.0,"L,L,L,L,L,L,L",7,21,27,,,,,,,,,,,,,
1,A0A075B6H7,,"SG,SG,SG",6,84,89,,,,,,,,,,,,,
2,A0A075B6I3,,"L,L,L,L,L,L",6,6,11,A0A075B6I3_1,LLLLLL,ENSG00000211641,ENST00000390286,ENSP00000374821,+,22,22201690,22201707,CTCCTCCTCCTGCTCCTC,1.0,0.000000,0.277778
3,A0A075B6J1,123.0,"L,L,L,L,L,L",6,6,11,,,,,,,,,,,,,
4,A0A075B6N2,111.0,"L,L,L,L,L,L,L",7,2,8,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3760,Q9Y6V0,5142.0,"A,A,A,A,A,A",6,17,22,Q9Y6V0_1,AAAAAA,ENSG00000186472,"ENST00000333891,ENST00000423517","ENSP00000334319,ENSP00000388393",-,7,83162527,83162544,AGCCGCTGCGGCCGCCGC,1.0,0.000000,0.666667
3761,Q9Y6V0,5142.0,"P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,P,L,...",26,2405,2430,Q9Y6V0_2,PPPPPPPPPPPPPPPPPPPPPPLPPP,ENSG00000186472,"ENST00000333891,ENST00000423517","ENSP00000334319,ENSP00000388393",-,7,82953663,82953740,TGGTGGAGGAAGTGGTGGGGGAGGAGGGGGTGGTGGTGGAGGAGGA...,1.0,0.073964,0.639053
3762,Q9Y6W5,498.0,"P,P,P,P,P,P",6,333,338,Q9Y6W5_1,PPPPPP,ENSG00000158195,ENST00000618852,ENSP00000483313,-,1,27410017,27410034,AGGCGGTGGTGGAGGTGG,1.0,0.000000,0.611111
3763,Q9Y6W5,498.0,"P,P,P,P,P,P,P,P",8,364,371,Q9Y6W5_2,PPPPPPPP,ENSG00000158195,ENST00000618852,ENSP00000483313,-,1,27409918,27409941,TGGTGGTGGAGGAGGAGGTGGAGG,1.0,0.000000,0.500000


In [18]:
# Keep only protein STRs that
#   (a) have no missing value in any column,
#   (b) have a repeat unit of length one (first unit listed in `protein_msa`),
#   (c) have h_div_prot == 0.0, and
#   (d) have a `gen_seq` without a comma (presumably a comma separates several
#       genomic segments - confirm against the upstream mapping script).
#
# The comma check uses `.str.contains` because `in` / `not in` inside
# `DataFrame.query` is a membership test, not a substring test; the former
# "',' not in gen_seq" query clause therefore never filtered on substrings and
# has been removed in favour of a single explicit filter.
#
# NOTE(review): `.dropna()` discards rows with a NaN in *any* column, including
# `protein_length`; the raw table contains rows with a genomic mapping but no
# protein length. Confirm that dropping those rows is intended.
df_prot_strs_filt = (
    df_prot_strs
    .dropna()
    .assign(unit_size=lambda x: x.protein_msa.str.split(",").str[0].str.len())
    .query("unit_size == 1 and h_div_prot == 0.0")
    .loc[lambda x: ~x.gen_seq.str.contains(",", regex=False)]
    .reset_index(drop=True)
)
df_prot_strs_filt

Unnamed: 0,uniprotid,protein_length,protein_msa,repeat_region_length,protein_start,protein_end,str_id,str_seq,gene_id,tx_ids,protein_ids,seq_strand,seq_name,gen_starts,gen_ends,gen_seq,prot_unit_size,h_div_prot,h_div_gen,unit_size
0,A0A096LP55,91.0,"E,E,E,E,E,E,E,E,E",9,17,25,A0A096LP55_1,EEEEEEEEE,ENSG00000233954,ENST00000483273,ENSP00000485401,-,1,15807575,15807601,TTCCTCCTCTTCCTCTTCCTCCTCCTC,1.0,0.0,0.444444,1
1,A0AV96,593.0,"A,A,A,A,A,A,A,A,A,A,A,A,A,A",14,489,502,A0AV96_2,AAAAAAAAAAAAAA,ENSG00000163694,"ENST00000381793,ENST00000295971","ENSP00000371212,ENSP00000295971",-,4,40432687,40432728,AGCGGCTGCGGCGGCTGCGGCCGCGGCTGCGGCGGCAGCAGC,1.0,0.0,0.653061,1
2,A0AV96,593.0,"A,A,A,A,A,A",6,559,564,A0AV96_3,AAAAAA,ENSG00000163694,"ENST00000381793,ENST00000295971","ENSP00000371212,ENSP00000295971",-,4,40425994,40426011,GGCGGCCGCGGCTGCCGC,1.0,0.0,0.611111,1
3,A0JLT2,244.0,"P,P,P,P,P,P,P,P",8,28,35,A0JLT2_1,PPPPPPPP,ENSG00000156603,ENST00000337672,ENSP00000337340,-,11,57712075,57712098,AGGAGGCGGTGGCGGGGGTGGAGG,1.0,0.0,0.718750,1
4,A0MZ66,631.0,"P,P,P,P,P,P,P,P,P",9,353,361,A0MZ66_1,PPPPPPPPP,ENSG00000187164,"ENST00000615301,ENST00000392903,ENST00000355371","ENSP00000480109,ENSP00000376636,ENSP00000347532",-,10,116927821,116927847,TGGTGGTGGAGGAGGAGGTGGTGGAGG,1.0,0.0,0.493827,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2299,Q9Y6U7,377.0,"P,P,P,P,P,P,P,P,P",9,12,20,Q9Y6U7_1,PPPPPPPPP,ENSG00000099999,ENST00000382363,ENSP00000371800,-,22,30387254,30387280,CGGAGGCGGCGGCGGAGGCGGCGGCGG,1.0,0.0,0.345679,1
2300,Q9Y6V0,5142.0,"A,A,A,A,A,A",6,17,22,Q9Y6V0_1,AAAAAA,ENSG00000186472,"ENST00000333891,ENST00000423517","ENSP00000334319,ENSP00000388393",-,7,83162527,83162544,AGCCGCTGCGGCCGCCGC,1.0,0.0,0.666667,1
2301,Q9Y6W5,498.0,"P,P,P,P,P,P",6,333,338,Q9Y6W5_1,PPPPPP,ENSG00000158195,ENST00000618852,ENSP00000483313,-,1,27410017,27410034,AGGCGGTGGTGGAGGTGG,1.0,0.0,0.611111,1
2302,Q9Y6W5,498.0,"P,P,P,P,P,P,P,P",8,364,371,Q9Y6W5_2,PPPPPPPP,ENSG00000158195,ENST00000618852,ENSP00000483313,-,1,27409918,27409941,TGGTGGTGGAGGAGGAGGTGGAGG,1.0,0.0,0.500000,1


In [22]:
# Reduce the filtered protein STRs to interval form (uniprot, start, end) plus
# the protein repeat alignment and the genomic sequence, then write the result
# as a tab-separated file without the index.
df_prot_strs_filt = (
    df_prot_strs_filt
    .rename(columns={"uniprotid": "uniprot", "protein_start": "start", "protein_end": "end"})
    .loc[:, ["uniprot", "start", "end", "protein_msa", "gen_seq"]]
)
df_prot_strs_filt.to_csv(
    "../data/reformat/edb79_prot_to_gen_mapping_with_div_reformat.tsv",
    index=False,
    sep="\t",
)

In [33]:
# Widen every protein STR interval by a fixed flank on both sides and write the
# result to a separate file. The widened start is clamped so it never drops
# below 0.
# NOTE(review): the widened end is not capped at the protein length, and the
# lower bound of 0 should be checked against the coordinate convention of the
# files this one is intersected with.
FLANK = 10  # number of positions added on each side of the interval
df_prot_strs_filt_wider = df_prot_strs_filt.assign(
    start=lambda x: (x.start - FLANK).clip(lower=0),
    end=lambda x: x.end + FLANK,
)
df_prot_strs_filt_wider.to_csv(
    "../data/reformat/edb79_prot_to_gen_mapping_with_div_reformat_wider.tsv",
    sep="\t",
    index=False,
)


## IDR data reformatting

In [11]:
# Read the intrinsic-disorder intervals. The file has no header row, so the
# three columns are named explicitly.
df_idrs = pd.read_csv(
    "../data/raw/swissprot_intrinsic_disorder_mapping.tsv",
    names=["uniprot", "start", "end"],
    header=None,
    sep="\t",
)
df_idrs

Unnamed: 0,uniprot,start,end
0,sp|A0A075B6H9|LV469_HUMAN,73,92
1,sp|A0A075B6J9|LV218_HUMAN,20,52
2,sp|A0A075B6Y9|TJA42_HUMAN,1,20
3,sp|A0A087WSX0|LV545_HUMAN,68,92
4,sp|A0A087WUL8|NBPFJ_HUMAN,1047,1085
...,...,...,...
26892,sp|Q9Y6Z7|COL10_HUMAN,40,107
26893,sp|Q9YNA8|GAK19_HUMAN,170,189
26894,sp|Q9YNA8|GAK19_HUMAN,223,264
26895,sp|Q9YNA8|GAK19_HUMAN,598,640


In [12]:
# The identifiers look like "sp|<accession>|<entry name>"; keep only the
# second "|"-separated field so the column holds the bare accession.
df_idrs = df_idrs.assign(
    uniprot=df_idrs["uniprot"].map(lambda fasta_id: fasta_id.split("|")[1])
)
df_idrs

Unnamed: 0,uniprot,start,end
0,A0A075B6H9,73,92
1,A0A075B6J9,20,52
2,A0A075B6Y9,1,20
3,A0A087WSX0,68,92
4,A0A087WUL8,1047,1085
...,...,...,...
26892,Q9Y6Z7,40,107
26893,Q9YNA8,170,189
26894,Q9YNA8,223,264
26895,Q9YNA8,598,640


In [13]:
# Write the reformatted IDR intervals as a tab-separated file without the index.
df_idrs.to_csv(
    "../data/reformat/swissprot_intrinsic_disorder_mapping_reformat.tsv",
    index=False,
    sep="\t",
)