In [692]:
from pathlib import Path

import json
from typing import Optional
from Bio import SeqIO

import pandas as pd
import numpy as np

In [754]:
from ppi_utils.reduce import dedup_pairs
from ppi_utils.pairs import estimate_bias, estimate_bias_per_species
from ppi_utils.api import uniprot_api_fetch
from ppi_utils.extract import ppis_to_hashes
from ppi_utils.general import get_seq_hash, get_ids, to_fasta

In [982]:
with Path('/home/quirin/PYTHON/prona/huintaf/'
          'huintaf2_plddts.json').open('r') as f:
    plds = json.load(f)

next(iter(plds.items()))

('ENSG00000000005-ENSG00000061656',
 {'ida': 'ENSG00000000005',
  'acontacts': [21,
   24,
   24,
   25,
   25,
   25,
   25,
   25,
   26,
   28,
   28,
   28,
   28,
   28,
   28,
   29,
   29,
   29,
   29,
   29,
   29,
   29,
   29,
   29,
   30,
   30,
   31,
   31,
   32,
   32,
   32,
   32,
   32,
   32,
   32,
   32,
   32,
   33,
   33,
   33,
   33,
   35,
   35,
   35,
   35,
   36,
   36,
   36,
   36,
   36,
   36,
   36,
   36,
   36,
   36,
   36,
   36,
   36,
   36,
   37,
   37,
   38,
   39,
   39,
   39,
   39,
   39,
   39,
   39,
   39,
   39,
   39,
   39,
   39,
   40,
   40,
   40,
   40,
   40,
   41,
   41,
   42,
   42,
   42,
   42,
   43,
   43,
   43,
   43,
   43,
   43,
   43,
   43,
   43,
   44,
   44,
   44,
   45,
   45,
   45,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   46,
   47,
   47,
   47,
   47,
   47,
   47,
   47,
   47,
   47,
   48,
   48,
   48,
   48,
   48,
   48,
   48,
   48,
   49,
   49,
   49,
   50,
   50,
   50,
   50,

## load AlphaFold pLDDT (keys)

In [367]:
with Path('/home/quirin/PYTHON/prona/huintaf/'
          'huintaf2_plddts.json').open('r') as f:
    plddts = set(json.load(f))

In [368]:
next(iter(plddts))

'ENSG00000013016-ENSG00000110047'

## load the table with all 65484 structures

In [366]:
huintaf_dir = Path('/home/quirin/PYTHON/huintaf2')

In [109]:
hua = pd.read_csv(huintaf_dir / 'leo' /
                  'table_AF2_HURI_HuMap_UNIQUE.csv', sep=',')
len(hua)

65484

In [110]:
hua.head()

Unnamed: 0,unique_ID,duplicates,id1,id2,NumRes,IF_plDDT,plDDT,pDockQ,DockQall,PDB,...,structure_file,NumDiso1.90,NumDiso1.70.90,NumDiso1.50.70,NumDiso1.50,NumDiso2.90,NumDiso2.70.90,NumDiso2.50.70,NumDiso2.50,NumOverlap
0,A0A024R0Y4_O14964,1,O14964,A0A024R0Y4,92,67.09186,67.37455,0.077304,,,...,ENSG00000185359-ENSG00000276234.pdb,114.0,253.0,60.0,350.0,72.0,286.0,42.0,43.0,2.0
1,A0A024R0Y4_O15287,1,O15287,A0A024R0Y4,35,68.53878,77.09663,0.038752,,,...,ENSG00000221829-ENSG00000276234.pdb,130.0,339.0,74.0,79.0,41.0,308.0,66.0,28.0,0.0
2,A0A024R0Y4_O60573,1,O60573,A0A024R0Y4,40,59.69327,78.52047,0.034888,,,...,ENSG00000135930-ENSG00000276234.pdb,105.0,58.0,15.0,67.0,118.0,263.0,28.0,34.0,0.0
3,A0A024R0Y4_O75431,1,O75431,A0A024R0Y4,51,76.82288,81.05951,0.075548,,,...,ENSG00000128654-ENSG00000276234.pdb,183.0,56.0,5.0,19.0,42.0,294.0,70.0,37.0,0.0
4,A0A024R0Y4_O75528,1,O75528,A0A024R0Y4,362,90.26991,76.77518,0.730883,,,...,ENSG00000171148-ENSG00000276234.pdb,134.0,119.0,52.0,127.0,180.0,210.0,22.0,31.0,0.0


In [111]:
mp = hua[['id1', 'id2', 'pDockQ',
          'Dataset_max', 'structure_file']].copy()
mp.columns = ['ida', 'idb', 'pdockq', 'dataset', 'pdbfile']
assert len(mp.drop_duplicates()) == len(mp)

In [112]:
mp = dedup_pairs(mp)
# mp is now sorted horizontally
len(mp)

65484

In [113]:
mp.head()

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile
0,A0A024R0Y4,O14964,0.077304,HURI,ENSG00000185359-ENSG00000276234.pdb
1,A0A024R0Y4,O15287,0.038752,HURI,ENSG00000221829-ENSG00000276234.pdb
2,A0A024R0Y4,O60573,0.034888,HURI,ENSG00000135930-ENSG00000276234.pdb
3,A0A024R0Y4,O75431,0.075548,HURI,ENSG00000128654-ENSG00000276234.pdb
4,A0A024R0Y4,O75528,0.730883,HURI,ENSG00000171148-ENSG00000276234.pdb


## load the table with humap probabilities

In [114]:
humap_dir = Path.cwd().parent / 'ppi_data/hu.MAP'
assert humap_dir.is_dir()
humap_dir

PosixPath('/home/quirin/PYTHON/ppi/ppi_data/hu.MAP')

In [115]:
wumap = pd.read_csv(humap_dir / 'humap2_ppis_ACC_20200821'
                                '.pairsWprob',
                    sep='\t', header=None,
                    names=['ida', 'idb', 'prob'])
len(wumap)

17526311

In [116]:
wumap.head()

Unnamed: 0,ida,idb,prob
0,P46926,Q8TDQ7,1.0
1,P43631,Q14954,1.0
2,P43631,Q14953,1.0
3,P43631,P43629,1.0
4,100287045,Q86YD7,1.0


### filter the original hu.MAP scores a bit

In [117]:
wumap = wumap.loc[wumap.prob >= .5]
len(wumap)
# now that's a relief!

11043

In [118]:
# also drop weird numeric IDs
wumap = wumap.loc[(~wumap.ida.str.isnumeric())
                  & (~wumap.idb.str.isnumeric())]
len(wumap)

10868

In [119]:
wumap = dedup_pairs(wumap)
# wumap is now sorted horizontally
len(wumap)

10868

### glue the humap scores to the overall results frame

In [398]:
mpp = pd.merge(mp, wumap, on=['ida', 'idb'], how='left')
mpp.prob = mpp.prob.fillna(1)
mpp[['lena', 'lenb']] = 0, 0
assert len(mpp) == 65484

### filter: only the pdbfiles we have

In [435]:
mpp.pdbfile = mpp.pdbfile.str.split(';')
l1 = len(mpp)
mpp = mpp.explode('pdbfile').drop_duplicates()
l2 = len(mpp)
mpp.pdbfile = mpp.pdbfile.apply(lambda s: s.split('.')[0])
mpp = mpp.loc[mpp.pdbfile.isin(plddts)]
mpp.pdbfile += '.pdb'
l3 = len(mpp)
print(f'{l1} -> {l2} -> {l3}')
mpp.head()

65999 -> 65999 -> 65999


Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,lena,lenb
0,A0A024R0Y4,O14964,0.077304,HURI,ENSG00000185359-ENSG00000276234.pdb,1.0,0,0
1,A0A024R0Y4,O15287,0.038752,HURI,ENSG00000221829-ENSG00000276234.pdb,1.0,0,0
2,A0A024R0Y4,O60573,0.034888,HURI,ENSG00000135930-ENSG00000276234.pdb,1.0,0,0
3,A0A024R0Y4,O75431,0.075548,HURI,ENSG00000128654-ENSG00000276234.pdb,1.0,0,0
4,A0A024R0Y4,O75528,0.730883,HURI,ENSG00000171148-ENSG00000276234.pdb,1.0,0,0


## load the file of random pairs

In [436]:
r2k = pd.read_csv(huintaf_dir / 'data/random.csv', sep=',')
print(len(r2k))
r2k[['ida', 'idb']] = r2k.Name.str.split('-', expand=True)
r2k = r2k.rename(columns=dict(pDockQ='pdockq', Name='pdbfile',
                              len1='lena', len2='lenb'))
r2k.pdbfile += '.pdb'
r2k[['dataset', 'prob']] = '1849 random', 0.
r2k = r2k[mpp.columns]
# sort the IDs horizontally
r2k.loc[r2k.ida > r2k.idb, ['ida', 'idb']] = r2k.loc[
    r2k.ida > r2k.idb, ['idb', 'ida']].values
r2k = r2k.sort_values(by=['ida', 'idb']).drop_duplicates()
print(len(r2k))
r2k.head()

1849
1849


Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,lena,lenb
0,O00139,P40429,0.041981,1849 random,O00139-P40429.pdb,0.0,706,203
468,O00165,P39023,0.255203,1849 random,P39023-O00165.pdb,0.0,403,279
502,O00182,P40429,0.134777,1849 random,P40429-O00182.pdb,0.0,203,355
1,O00231,P31946,0.034794,1849 random,O00231-P31946.pdb,0.0,422,246
1175,O00231,P62913,0.031393,1849 random,P62913-O00231.pdb,0.0,178,422


## load our colabfold data

In [437]:
ppi_dir = Path('.').resolve().parent / 'ppi_data/v2.1/1:1_small'
ppi_dir, ppi_dir.is_dir()

(PosixPath('/home/quirin/PYTHON/ppi/ppi_data/v2.1/1:1_small'), True)

In [438]:
_hashing = pd.read_csv(ppi_dir / 'crc_hashes.tsv', sep='\t', header=None).sort_values(by=[1, 0]).reset_index(drop=True)
_hashing

Unnamed: 0,0,1
0,Q99KX1,CRC-00097529ECAB0123
1,P63039,CRC-0014B58B77D0127B
2,Q9BWG6-1,CRC-001576EA78C61C85
3,P36614,CRC-001980A1CC7646D7
4,Q8NET5,CRC-0026DBDE1FEAD2B3
...,...,...
9920,O94374,CRC-FFD4919E936E0941
9921,P26616,CRC-FFD61212F709EE3A
9922,P0A8A8,CRC-FFEC71975B4492AB
9923,Q9Y7M1,CRC-FFEDB9673A3CB066


In [439]:
_hashing.loc[_hashing.iloc[:, 0].duplicated(keep=False)]
# not with versionized uniprot IDs

Unnamed: 0,0,1


In [440]:
_hashing.loc[_hashing.iloc[:, 1].duplicated(
    keep=False)].sort_values(by=[1, 0])

Unnamed: 0,0,1
38,P68369,CRC-00F8429A4A10E5FE
39,P68370,CRC-00F8429A4A10E5FE
131,P63017,CRC-03A27B30E6C076ED
132,P63018,CRC-03A27B30E6C076ED
274,P62259,CRC-07817CCBD1F75B26
...,...,...
9804,Q8IE57,CRC-FCADC4C9D752D8CB
9808,O00308-1,CRC-FCCD75CBA61F2204
9809,O00308-4,CRC-FCCD75CBA61F2204
9918,A0A5K1K911,CRC-FFD0BEF99046D3AC


In [441]:
# this always picks the last entry in the table above -
# meaning the latest
_map = dict(zip(_hashing.iloc[:, 1], _hashing.iloc[:, 0]))
_map['CRC-FFD0BEF99046D3AC']

'Q8IDC4'

In [824]:
ours = pd.read_csv('colabfold_scores.tsv', sep='\t', header=0)
ours['min_chezod'] = ours[['za', 'zb']].min(axis=1)
ours['max_diso'] = ours[['da', 'db']].max(axis=1)
ours['max_chezod'] = ours[['za', 'zb']].max(axis=1)
ours['avg_chezod'] = ours[['za', 'zb']].mean(axis=1)
ours['dataset'] = ours.label.apply(lambda l: ['514 HuRI $-$', '516 HuRI $+$'][l])
ours['prob'] = ours.label.astype(float)
ours['pdbfile'] = ours.apply(lambda s: f'results_chunk_{s.chunk}/predictions/{s["index"]}_unrelaxed_rank_1_model_{s.model}.pdb', axis=1)
ours[['crca', 'crcb']] = ours[['ida', 'idb']]

ours.ida = ours.crca.apply(lambda s: _map.get(s))
ours.idb = ours.crcb.apply(lambda s: _map.get(s))
ours.head()

Unnamed: 0,ida,idb,label,index,lena,lenb,chunk,model,ptm,iptm,...,capri,min_chezod,max_diso,max_chezod,avg_chezod,dataset,prob,pdbfile,crca,crcb
0,Q9BWG6-1,Q4VC12-1,1,0,230,460,5,4,0.64,0.43,...,acceptable,6.533676,0.591304,10.537396,8.535536,516 HuRI $+$,1.0,results_chunk_5/predictions/0_unrelaxed_rank_1...,CRC-001576EA78C61C85,CRC-045C9C9CAE0FD639
1,Q9BWG6-1,Q8N0S2-1,1,1,230,351,5,2,0.27,0.08,...,incorrect,6.533676,0.591304,8.990527,7.762102,516 HuRI $+$,1.0,results_chunk_5/predictions/1_unrelaxed_rank_1...,CRC-001576EA78C61C85,CRC-12E45FF6696080F8
2,Q9BWG6-1,P17568,1,2,230,137,5,4,0.42,0.41,...,incorrect,6.533676,0.591304,10.560456,8.547066,516 HuRI $+$,1.0,results_chunk_5/predictions/2_unrelaxed_rank_1...,CRC-001576EA78C61C85,CRC-2743716544288776
3,Q9BWG6-1,O95995-1,1,3,230,478,5,5,0.26,0.09,...,incorrect,6.533676,0.591304,11.606744,9.07021,516 HuRI $+$,1.0,results_chunk_5/predictions/3_unrelaxed_rank_1...,CRC-001576EA78C61C85,CRC-2EDBC7981868EB12
4,Q9BWG6-1,Q8IWZ5-1,1,4,230,723,5,2,0.43,0.11,...,incorrect,6.533676,0.591304,9.997539,8.265607,516 HuRI $+$,1.0,results_chunk_5/predictions/4_unrelaxed_rank_1...,CRC-001576EA78C61C85,CRC-40897D5AF9B186A9


## prep an AF2 frame: huri, humap, random - not ours yet

In [904]:
af2 = pd.concat((mpp, r2k)).drop_duplicates()
assert len(af2) == len(mpp) + len(r2k)
print(len(af2))
af2.head()

67848


Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,lena,lenb
0,A0A024R0Y4,O14964,0.077304,HURI,ENSG00000185359-ENSG00000276234.pdb,1.0,0,0
1,A0A024R0Y4,O15287,0.038752,HURI,ENSG00000221829-ENSG00000276234.pdb,1.0,0,0
2,A0A024R0Y4,O60573,0.034888,HURI,ENSG00000135930-ENSG00000276234.pdb,1.0,0,0
3,A0A024R0Y4,O75431,0.075548,HURI,ENSG00000128654-ENSG00000276234.pdb,1.0,0,0
4,A0A024R0Y4,O75528,0.730883,HURI,ENSG00000171148-ENSG00000276234.pdb,1.0,0,0


In [905]:
af2.dataset.value_counts()

HURI           55638
HuMap          10361
1849 random     1849
Name: dataset, dtype: int64

In [906]:
# add some annotations
capris = ['incorrect', 'acceptable', 'medium', 'high']
num_capris = ['$< 0.23$', '$0.23 - 0.5$', '$> 0.5$']

In [907]:
af2['capri'] = af2.pdockq.apply(
    lambda pq: 'incorrect' if pq < .23 else 'acceptable' if pq < .5 else 'medium' if pq < .8 else 'high')
af2.loc[af2.dataset == 'HURI', 'dataset'] = 'HuRI'
af2.loc[af2.dataset == 'HuMap', 'dataset'] = 'hu.MAP'
af2.head()

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,lena,lenb,capri
0,A0A024R0Y4,O14964,0.077304,HuRI,ENSG00000185359-ENSG00000276234.pdb,1.0,0,0,incorrect
1,A0A024R0Y4,O15287,0.038752,HuRI,ENSG00000221829-ENSG00000276234.pdb,1.0,0,0,incorrect
2,A0A024R0Y4,O60573,0.034888,HuRI,ENSG00000135930-ENSG00000276234.pdb,1.0,0,0,incorrect
3,A0A024R0Y4,O75431,0.075548,HuRI,ENSG00000128654-ENSG00000276234.pdb,1.0,0,0,incorrect
4,A0A024R0Y4,O75528,0.730883,HuRI,ENSG00000171148-ENSG00000276234.pdb,1.0,0,0,medium


In [908]:
af2_ids = get_ids(af2)
len(af2_ids), list(af2_ids)[:4]

(9950, ['Q99525', 'A0AVT1', 'O95848', 'Q6ZTN6'])

## load seqs for HuRI and hu.MAP

In [909]:
def proc(df: pd.DataFrame, name: Optional[str] = None
         ) -> [pd.DataFrame, dict[str, str]]:
    assert df.shape[1] == 2, 'pass two columns!'
    df = df.copy()
    df.columns = ['uniprotkb', 'seq']
    df = df.loc[~df.seq.isna()]
    assert not len(df.loc[df.uniprotkb.duplicated()]), 'duplicates in ID column!'
    df['crc64'] = df.seq.apply(get_seq_hash)
    df['len'] = df.seq.apply(len)
    if name:
        df['dataset'] = name
    d = dict(zip(df.crc64, df.seq))
    df = df.drop(columns='seq')
    return df, d

In [910]:
af2_seqs = dict()

In [911]:
humap = pd.read_csv(huintaf_dir /
                    'data/HuMap-uniprot.tab', sep='\t')
humap.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,Sequence,Cross-reference (PDB),Gene ontology (cellular component),Subcellular location [CC]
0,A0A075B759,PAL4E_HUMAN,reviewed,Peptidyl-prolyl cis-trans isomerase A-like 4E ...,PPIAL4E,Homo sapiens (Human),164.0,MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...,,cytoplasm [GO:0005737]; intracellular membrane...,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000250|U...
1,A0A0B4J2A2,PAL4C_HUMAN,reviewed,Peptidyl-prolyl cis-trans isomerase A-like 4C ...,PPIAL4C,Homo sapiens (Human),164.0,MVNSVVFFDITVDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...,,cytoplasm [GO:0005737]; intracellular membrane...,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000250|U...
2,A0AV02,S12A8_HUMAN,reviewed,Solute carrier family 12 member 8 (Cation-chlo...,SLC12A8 CCC9,Homo sapiens (Human),714.0,MTQMSQVQELFHEAAQQDALAQPQPWWKTQLFMWEPVLFGTWDGVF...,,integral component of membrane [GO:0016021],SUBCELLULAR LOCATION: Membrane {ECO:0000305}; ...
3,A0AV96,RBM47_HUMAN,reviewed,RNA-binding protein 47 (RNA-binding motif prot...,RBM47,Homo sapiens (Human),593.0,MTAEDSTAAMSSDSAAGSSAKVPEGVAGAPNEAALLALMERTGYSM...,2DIS;,nucleus [GO:0005634],SUBCELLULAR LOCATION: Nucleus {ECO:0000250}.
4,A0AVF1,IFT56_HUMAN,reviewed,Intraflagellar transport protein 56 (Tetratric...,TTC26 IFT56,Homo sapiens (Human),554.0,MMLSRAKPAVGRGVQHTDKRKKKGRKIPKLEELLSKRDFTGAITLL...,,centrosome [GO:0005813]; ciliary basal body [G...,"SUBCELLULAR LOCATION: Cell projection, cilium ..."


In [912]:
humap, af2_seqs['hu.MAP'] = proc(
    humap[['Entry', 'Sequence']], 'hu.MAP')

In [913]:
huri = pd.read_csv(huintaf_dir / 'data/HuRI-uniprot.tab', sep='\t')
huri.head()

Unnamed: 0,Entry,Entry name,Status,Protein names,Gene names,Organism,Length,Sequence,Cross-reference (PDB),Gene ontology (cellular component),Subcellular location [CC]
0,A0A024R0Y4,A0A024R0Y4_HUMAN,unreviewed,Transcriptional adapter,TADA2A TADA2L hCG_28187,Homo sapiens (Human),443.0,MDRLGSFSNDPSDKPPCRGCSSYLMEPYIKCAECGPPPFFLCLQCF...,,ATAC complex [GO:0140672]; mitotic spindle [GO...,SUBCELLULAR LOCATION: Nucleus {ECO:0000256|PIR...
1,A0A024R2X5,A0A024R2X5_HUMAN,,Deleted.,,,,,,,
2,A0A024R644,A0A024R644_HUMAN,unreviewed,"Ceroid-lipofuscinosis, neuronal 5, isoform CRA_a",CLN5 hCG_28176,Homo sapiens (Human),407.0,MRRNLRLGPSSGADAQGQGAPRPGLAAPRMLLPPASQASRGSGSTG...,6R99;,lysosome [GO:0005764],
3,A0A024RA52,A0A024RA52_HUMAN,unreviewed,Proteasome subunit alpha type,PSMA2 hCG_19883,Homo sapiens (Human),234.0,MAERGYSFSLTTFSPSGKLVQIEYALAAVAGGAPSVGIKAANGVVL...,,cytosol [GO:0005829]; nucleus [GO:0005634]; pr...,SUBCELLULAR LOCATION: Cytoplasm {ECO:0000256|R...
4,A0A024RAC6,A0A024RAC6_HUMAN,unreviewed,Elongin-A,ELOA TCEB3 hCG_37551,Homo sapiens (Human),772.0,MAAESALQVVEKLQARLAANPDPKKLLKYLKKLSTLPITVDILAET...,,elongin complex [GO:0070449],SUBCELLULAR LOCATION: Nucleus {ECO:0000256|ARB...


In [914]:
huri, af2_seqs['HuRI'] = proc(
    huri[['Entry', 'Sequence']], 'HuRI')

In [973]:
huri_ensg = pd.read_csv(huintaf_dir / 'data/HuRI/seqs.csv')
huri_ensg.head()

Unnamed: 0,Id,Length,Sequence,Entry,Entry name,Cross-reference (PANTHER),Cross-reference (PDB),Gene names
0,ENSG00000000005,317.0,MAKNPPENCEDCHILNAEAFKSKKICKSLKICGLVFGILALTLIVL...,Q9H2S6,TNMD_HUMAN,PTHR14064;,,TNMD CHM1L UNQ771/PRO1565
1,ENSG00000000419,260.0,MASLEVSRSPRRSRRELEVRSPRQNKYSVLLPTYNERENLPLIVWL...,O60762,DPM1_HUMAN,PTHR43398;,,DPM1
2,ENSG00000000457,742.0,MGSENSALKSYTLREPPFTLPSGLAVYPAVLQDGKFASVFVYKREN...,Q8IZE3,PACE1_HUMAN,,,SCYL3 PACE1
3,ENSG00000000460,853.0,MFLPHMNHLTLEQTFFSQVLPKTVKLFDDMMYELTSQARGLSSQNL...,Q9NSG2,CA112_HUMAN,PTHR16071;,,C1orf112
4,ENSG00000001036,467.0,MRPQELPRLAFPLLLLLLLLLPPPPCPAHSATRFDPTWESLDARQL...,Q9BTY2,FUCO2_HUMAN,PTHR10030;,,FUCA2 PSEC0151 UNQ227/PRO260


In [980]:
# insert: save the ENSG shit
enn = huri_ensg.copy()
enn = enn.rename(columns=dict(Id='ensg', Length='len', Entry='uniprotkb', Sequence='seq'))
enn.len = enn.len.astype(int)
enn['crc64'] = enn.seq.apply(get_seq_hash)
enn = enn[['ensg', 'len', 'uniprotkb', 'crc64']].drop_duplicates()
print(len(enn))
enn.head()

8888


Unnamed: 0,ensg,len,uniprotkb,crc64
0,ENSG00000000005,317,Q9H2S6,CRC-D17507D7E2EA0C68
1,ENSG00000000419,260,O60762,CRC-9792145BFC8F0514
2,ENSG00000000457,742,Q8IZE3,CRC-2F7D03681A4F641B
3,ENSG00000000460,853,Q9NSG2,CRC-E7FCA875C87052B1
4,ENSG00000001036,467,Q9BTY2,CRC-A42AA6B369A3AC39


In [981]:
enn.to_csv('ENSGs.tsv', sep='\t', header=True, index=False)

In [916]:
ensg, af2_seqs['ensg'] = proc(
    huri_ensg[['Entry', 'Sequence']], 'ensg')
ensg.head()

Unnamed: 0,uniprotkb,crc64,len,dataset
0,Q9H2S6,CRC-D17507D7E2EA0C68,317,ensg
1,O60762,CRC-9792145BFC8F0514,260,ensg
2,Q8IZE3,CRC-2F7D03681A4F641B,742,ensg
3,Q9NSG2,CRC-E7FCA875C87052B1,853,ensg
4,Q9BTY2,CRC-A42AA6B369A3AC39,467,ensg


In [917]:
af3 = pd.concat((humap, huri, ensg))
assert len(af3) == len(af3.drop_duplicates())
print(len(af3))
af3.head()

27646


Unnamed: 0,uniprotkb,crc64,len,dataset
0,A0A075B759,CRC-FD7DF57F1A6FDAA9,164,hu.MAP
1,A0A0B4J2A2,CRC-8FE074FFE5657EF8,164,hu.MAP
2,A0AV02,CRC-CE7669E9758EF00D,714,hu.MAP
3,A0AV96,CRC-AEA061F89A68010B,593,hu.MAP
4,A0AVF1,CRC-188062024DB89B97,554,hu.MAP


In [918]:
# first, merge all the entries that have pdbfiles with ENSGs only

In [919]:
[[li[:-4].split('-') for li in l] for l in af2.pdbfile.str.split(';')]

[[['ENSG00000185359', 'ENSG00000276234']],
 [['ENSG00000221829', 'ENSG00000276234']],
 [['ENSG00000135930', 'ENSG00000276234']],
 [['ENSG00000128654', 'ENSG00000276234']],
 [['ENSG00000171148', 'ENSG00000276234']],
 [['ENSG00000145982', 'ENSG00000276234']],
 [['ENSG00000168309', 'ENSG00000276234']],
 [['ENSG00000229809', 'ENSG00000276234']],
 [['ENSG00000166407', 'ENSG00000276234']],
 [['ENSG00000172053', 'ENSG00000276234']],
 [['ENSG00000182481', 'ENSG00000276234']],
 [['ENSG00000140259', 'ENSG00000276234']],
 [['ENSG00000162385', 'ENSG00000276234']],
 [['ENSG00000183431', 'ENSG00000276234']],
 [['ENSG00000179455', 'ENSG00000276234']],
 [['ENSG00000212747', 'ENSG00000276234']],
 [['ENSG00000175946', 'ENSG00000276234']],
 [['ENSG00000103852', 'ENSG00000276234']],
 [['ENSG00000174460', 'ENSG00000276234']],
 [['ENSG00000116903', 'ENSG00000276234']],
 [['ENSG00000161180', 'ENSG00000276234']],
 [['ENSG00000162222', 'ENSG00000276234']],
 [['ENSG00000173480', 'ENSG00000276234']],
 [['ENSG000

In [920]:
ensg_only_pairs = [all([all([_id.startswith('ENSG') for _id in li[:-4].split('-')]) for li in l]) for l in af2.pdbfile.str.split(';')]

In [921]:
# the ones that don't match but have
af2.loc[[not i for i in ensg_only_pairs] & af2.pdbfile.str.contains(';')]

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,lena,lenb,capri


In [922]:
len(af2.loc[ensg_only_pairs])

55619

In [923]:
af2.loc[ensg_only_pairs].head()

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,lena,lenb,capri
0,A0A024R0Y4,O14964,0.077304,HuRI,ENSG00000185359-ENSG00000276234.pdb,1.0,0,0,incorrect
1,A0A024R0Y4,O15287,0.038752,HuRI,ENSG00000221829-ENSG00000276234.pdb,1.0,0,0,incorrect
2,A0A024R0Y4,O60573,0.034888,HuRI,ENSG00000135930-ENSG00000276234.pdb,1.0,0,0,incorrect
3,A0A024R0Y4,O75431,0.075548,HuRI,ENSG00000128654-ENSG00000276234.pdb,1.0,0,0,incorrect
4,A0A024R0Y4,O75528,0.730883,HuRI,ENSG00000171148-ENSG00000276234.pdb,1.0,0,0,medium


In [924]:
# whut
af2.loc[ensg_only_pairs, 'dataset'].value_counts()

HuRI      55472
hu.MAP      147
Name: dataset, dtype: int64

In [925]:
# but now each line is only a single file anyway xD
af2.loc[af2.pdbfile.str.contains(';')]

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,lena,lenb,capri


In [926]:
af2.dataset.value_counts()

HuRI           55638
hu.MAP         10361
1849 random     1849
Name: dataset, dtype: int64

In [927]:
# first, merge all the entries that have pdbfiles with ENSGs only
af4 = af2.loc[ensg_only_pairs].merge(af3.loc[af3.dataset == 'ensg'], how='left', left_on='ida', right_on='uniprotkb').drop(columns=['uniprotkb', 'dataset_y', 'lena']).rename(columns=dict(crc64='crca', len='lena', dataset_x='dataset')).drop_duplicates().merge(af3.loc[af3.dataset == 'ensg'], how='left', left_on='idb', right_on='uniprotkb').drop(columns=['uniprotkb', 'dataset_y', 'lenb']).rename(columns=dict(crc64='crcb', len='lenb', dataset_x='dataset')).drop_duplicates()
assert len(af4) == len(af2.loc[ensg_only_pairs])
print(len(af4))
af4.head()

55619


Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,capri,crca,lena,crcb,lenb
0,A0A024R0Y4,O14964,0.077304,HuRI,ENSG00000185359-ENSG00000276234.pdb,1.0,incorrect,CRC-27D3F4F4D5EFF3C8,443.0,CRC-DD64167A19DCF030,777.0
1,A0A024R0Y4,O15287,0.038752,HuRI,ENSG00000221829-ENSG00000276234.pdb,1.0,incorrect,CRC-27D3F4F4D5EFF3C8,443.0,CRC-4BC7475472AC3C84,622.0
2,A0A024R0Y4,O60573,0.034888,HuRI,ENSG00000135930-ENSG00000276234.pdb,1.0,incorrect,CRC-27D3F4F4D5EFF3C8,443.0,CRC-3D3075BFA48B3C12,245.0
3,A0A024R0Y4,O75431,0.075548,HuRI,ENSG00000128654-ENSG00000276234.pdb,1.0,incorrect,CRC-27D3F4F4D5EFF3C8,443.0,CRC-3A2EF476F1C78465,263.0
4,A0A024R0Y4,O75528,0.730883,HuRI,ENSG00000171148-ENSG00000276234.pdb,1.0,medium,CRC-27D3F4F4D5EFF3C8,443.0,CRC-C86153CFA83F9226,432.0


In [928]:
af4_done = (~af4.crca.isna()) & (~af4.crcb.isna())

In [929]:
not_ensg = [not i for i in ensg_only_pairs]
af5 = af2.loc[not_ensg].copy()
af5[['crca', 'crcb']] = pd.NA, pd.NA
af4, af4a = af4.loc[af4_done].copy(), af4.loc[
    [not i for i in af4_done]].copy()
af5 = pd.concat((af5[af4.columns], af4a))
print(len(af5))
af5.head()

12801


Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,capri,crca,lena,crcb,lenb
38,A0A075B759,P0DN37,0.12113,hu.MAP,P0DN37-A0A075B759.pdb,0.999979,incorrect,,0.0,,0.0
204,A0A0B4J2A2,P0DN37,0.098878,hu.MAP,P0DN37-A0A0B4J2A2.pdb,1.0,incorrect,,0.0,,0.0
587,A0AV96,Q9NQ94,0.033961,hu.MAP,Q9NQ94-A0AV96.pdb,0.547486,incorrect,,0.0,,0.0
589,A0AV96,Q9NY65,0.060896,hu.MAP,Q9NY65-A0AV96.pdb,0.664063,incorrect,,0.0,,0.0
592,A0AVF1,Q86WT1,0.048306,hu.MAP,A0AVF1-Q86WT1.pdb,0.688305,incorrect,,0.0,,0.0


In [930]:
af6, af7 = af5.loc[af5.crca.isna()].copy(), af5.loc[
    ~af5.crca.isna()].copy()
len(af6), len(af7)

(12598, 203)

In [931]:
af6 = af6.merge(af3.loc[af3.dataset != 'ensg'], how='left', left_on=['ida', 'dataset'], right_on=['uniprotkb', 'dataset']).drop(columns=['crca', 'lena', 'uniprotkb']).rename(columns=dict(crc64='crca', len='lena'))[af4.columns].drop_duplicates()
af6a, af6b = af6.loc[~af6.crca.isna()].copy(), af6.loc[af6.crca.isna()].copy()
af6b = af6b.merge(af3.loc[af3.dataset != 'ensg'], how='left', left_on=['ida'], right_on=['uniprotkb']).drop(columns=['crca', 'lena', 'uniprotkb', 'dataset_y']).rename(columns=dict(crc64='crca', len='lena', dataset_x='dataset'))[af4.columns].drop_duplicates()
af6 = pd.concat((af6a, af6b))
print(len(af6), len(af6.loc[af6.crca.isna()]))
af6.head()

12598 287


Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,capri,crca,lena,crcb,lenb
0,A0A075B759,P0DN37,0.12113,hu.MAP,P0DN37-A0A075B759.pdb,0.999979,incorrect,CRC-FD7DF57F1A6FDAA9,164.0,,0.0
1,A0A0B4J2A2,P0DN37,0.098878,hu.MAP,P0DN37-A0A0B4J2A2.pdb,1.0,incorrect,CRC-8FE074FFE5657EF8,164.0,,0.0
2,A0AV96,Q9NQ94,0.033961,hu.MAP,Q9NQ94-A0AV96.pdb,0.547486,incorrect,CRC-AEA061F89A68010B,593.0,,0.0
3,A0AV96,Q9NY65,0.060896,hu.MAP,Q9NY65-A0AV96.pdb,0.664063,incorrect,CRC-AEA061F89A68010B,593.0,,0.0
4,A0AVF1,Q86WT1,0.048306,hu.MAP,A0AVF1-Q86WT1.pdb,0.688305,incorrect,CRC-188062024DB89B97,554.0,,0.0


In [932]:
af8 = pd.concat((af6, af7))
af6, af7 = af8.loc[af8.crcb.isna()].copy(), af8.loc[
    ~af8.crcb.isna()].copy()
len(af6), len(af7)

(12437, 364)

In [933]:
af6 = af6.merge(af3.loc[af3.dataset != 'ensg'], how='left', left_on=['idb', 'dataset'], right_on=['uniprotkb', 'dataset']).drop(columns=['crcb', 'lenb', 'uniprotkb']).rename(columns=dict(crc64='crcb', len='lenb'))[af4.columns].drop_duplicates()
af6a, af6b = af6.loc[~af6.crcb.isna()].copy(), af6.loc[af6.crcb.isna()].copy()
af6b = af6b.merge(af3.loc[af3.dataset != 'ensg'], how='left', left_on=['idb'], right_on=['uniprotkb']).drop(columns=['crcb', 'lenb', 'uniprotkb', 'dataset_y']).rename(columns=dict(crc64='crcb', len='lenb', dataset_x='dataset'))[af4.columns].drop_duplicates()
af6 = pd.concat((af6a, af6b))
print(len(af6), len(af6.loc[af6.crca.isna()]))
af6.head()

12437 163


Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,capri,crca,lena,crcb,lenb
0,A0A075B759,P0DN37,0.12113,hu.MAP,P0DN37-A0A075B759.pdb,0.999979,incorrect,CRC-FD7DF57F1A6FDAA9,164.0,CRC-CEAFE5FE3D858213,164.0
1,A0A0B4J2A2,P0DN37,0.098878,hu.MAP,P0DN37-A0A0B4J2A2.pdb,1.0,incorrect,CRC-8FE074FFE5657EF8,164.0,CRC-CEAFE5FE3D858213,164.0
2,A0AV96,Q9NQ94,0.033961,hu.MAP,Q9NQ94-A0AV96.pdb,0.547486,incorrect,CRC-AEA061F89A68010B,593.0,CRC-AA5EF76BD8815807,594.0
3,A0AV96,Q9NY65,0.060896,hu.MAP,Q9NY65-A0AV96.pdb,0.664063,incorrect,CRC-AEA061F89A68010B,593.0,CRC-8D1AFB9D131529BD,449.0
4,A0AVF1,Q86WT1,0.048306,hu.MAP,A0AVF1-Q86WT1.pdb,0.688305,incorrect,CRC-188062024DB89B97,554.0,CRC-567FD2C26CC48435,665.0


In [934]:
af6a.loc[af6a.pdbfile.duplicated(keep=False)]

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,capri,crca,lena,crcb,lenb


In [935]:
af8 = pd.concat((af6, af7))
print(len(af8))
af8.head()

12801


Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,capri,crca,lena,crcb,lenb
0,A0A075B759,P0DN37,0.12113,hu.MAP,P0DN37-A0A075B759.pdb,0.999979,incorrect,CRC-FD7DF57F1A6FDAA9,164.0,CRC-CEAFE5FE3D858213,164.0
1,A0A0B4J2A2,P0DN37,0.098878,hu.MAP,P0DN37-A0A0B4J2A2.pdb,1.0,incorrect,CRC-8FE074FFE5657EF8,164.0,CRC-CEAFE5FE3D858213,164.0
2,A0AV96,Q9NQ94,0.033961,hu.MAP,Q9NQ94-A0AV96.pdb,0.547486,incorrect,CRC-AEA061F89A68010B,593.0,CRC-AA5EF76BD8815807,594.0
3,A0AV96,Q9NY65,0.060896,hu.MAP,Q9NY65-A0AV96.pdb,0.664063,incorrect,CRC-AEA061F89A68010B,593.0,CRC-8D1AFB9D131529BD,449.0
4,A0AVF1,Q86WT1,0.048306,hu.MAP,A0AVF1-Q86WT1.pdb,0.688305,incorrect,CRC-188062024DB89B97,554.0,CRC-567FD2C26CC48435,665.0


In [936]:
aff = pd.concat((af8, af4)).drop_duplicates()
assert len(aff) == len(pd.concat((mpp, r2k)).drop_duplicates())
# aff[['lena', 'leb']] = aff[['lena', 'lenb']].astype(int)
print(len(aff))
aff.head()

67848


Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,capri,crca,lena,crcb,lenb
0,A0A075B759,P0DN37,0.12113,hu.MAP,P0DN37-A0A075B759.pdb,0.999979,incorrect,CRC-FD7DF57F1A6FDAA9,164.0,CRC-CEAFE5FE3D858213,164.0
1,A0A0B4J2A2,P0DN37,0.098878,hu.MAP,P0DN37-A0A0B4J2A2.pdb,1.0,incorrect,CRC-8FE074FFE5657EF8,164.0,CRC-CEAFE5FE3D858213,164.0
2,A0AV96,Q9NQ94,0.033961,hu.MAP,Q9NQ94-A0AV96.pdb,0.547486,incorrect,CRC-AEA061F89A68010B,593.0,CRC-AA5EF76BD8815807,594.0
3,A0AV96,Q9NY65,0.060896,hu.MAP,Q9NY65-A0AV96.pdb,0.664063,incorrect,CRC-AEA061F89A68010B,593.0,CRC-8D1AFB9D131529BD,449.0
4,A0AVF1,Q86WT1,0.048306,hu.MAP,A0AVF1-Q86WT1.pdb,0.688305,incorrect,CRC-188062024DB89B97,554.0,CRC-567FD2C26CC48435,665.0


In [937]:
aff.loc[aff.pdbfile.duplicated(keep=False)]

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,capri,crca,lena,crcb,lenb


In [938]:
aff.loc[aff.crca.isna() | aff.crcb.isna()]

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,capri,crca,lena,crcb,lenb
10192,A6NNZ2,P05362,0.031345,hu.MAP,P05362-A6NNZ2.pdb,1.0,incorrect,,,CRC-550089365A733AFB,532.0
10193,A6NNZ2,P07437,0.553691,hu.MAP,P07437-A6NNZ2.pdb,1.0,medium,,,CRC-1E6CD0A36773A103,444.0
10194,A6NNZ2,P25445,0.103594,hu.MAP,P25445-A6NNZ2.pdb,1.0,incorrect,,,CRC-0139942535111410,335.0
10195,A6NNZ2,Q13291,0.038629,hu.MAP,Q13291-A6NNZ2.pdb,1.0,incorrect,,,CRC-BFB0F27EA31D8C04,335.0
10196,A6NNZ2,Q13509,0.380734,hu.MAP,Q13509-A6NNZ2.pdb,1.0,acceptable,,,CRC-4B9CDE7DBA102949,450.0
...,...,...,...,...,...,...,...,...,...,...,...
3113,I3L4Z7,Q99081,0.042905,HuRI,ENSG00000140262-ENSG00000214087.pdb,1.0,incorrect,,,CRC-9736113D9361D3F5,682.0
3114,Q0VD77,Q5TD97,0.047478,HuRI,ENSG00000112214-ENSG00000185761.pdb,1.0,incorrect,,,CRC-82C03B1EAA147C11,284.0
3115,Q0VD77,Q6UY14,0.080997,HuRI,ENSG00000143382-ENSG00000185761.pdb,1.0,incorrect,,,CRC-79AE0E5DF5488CA1,1074.0
3116,Q0VD77,Q7Z3S9,0.103304,HuRI,ENSG00000185761-ENSG00000264343.pdb,1.0,incorrect,,,CRC-2DF01880A22814C9,236.0


In [939]:
aff.dataset.value_counts()

HuRI           55638
hu.MAP         10361
1849 random     1849
Name: dataset, dtype: int64

In [940]:
# aff.to_csv('huintaf2_fixed.tsv', sep='\t',
#            index=False, header=True,
#            float_format='{:.5}'.format)

In [941]:
missing_ids = set(aff.loc[aff.crca.isna(), 'ida']) | set(
    aff.loc[aff.crcb.isna(), 'idb'])
len(missing_ids), list(missing_ids)[:4]

(72, ['D6R934', 'Q96QV6', 'P20231', 'P04908'])

### fetch from UniProt

In [687]:
missing_mapping = uniprot_api_fetch(
    missing_ids, out_file=Path('uniprot/huintaf_72.fasta'))

jobId: 0024d8bc880ccb1f492c3671a50dc2b760aeaaef
huintaf_72: query + tab ... 73:0
huintaf_72: fasta ...API FASTA: 5db3cf411a5a8e4a3d54053b6a043bb3d93bda1dfcd96e4385b133de3088c75c


hash FASTA: 71it [00:00, 4452.35it/s]
fetch UniParc: 100%|██████████| 2/2 [00:00<00:00,  2.50it/s]


In [942]:
with Path('uniprot/huintaf_72.json').open('r') as json_file:
    missing_mapping = json.load(json_file)
next(iter(missing_mapping.items()))

('A0A087WXY6', 'CRC-3984027ECA520C4D')

In [943]:
missing_fasta = {r.id: str(r.seq) for r in SeqIO.parse(
    'uniprot/huintaf_72.hash.fasta', 'fasta')}
next(iter(missing_fasta.items()))

('CRC-3984027ECA520C4D',
 'PESISSLPETTQNFLFFLGTQAFAVPLLLISRSQTFGYNGRACQEWLPFLSPSAS')

In [944]:
for c in 'ab':
    aff.loc[aff[f'crc{c}'].isna(), f'crc{c}'] = aff.loc[aff[f'crc{c}'].isna(), f'id{c}'].apply(missing_mapping.get)
    aff.loc[aff[f'len{c}'].isna(), f'len{c}'] = aff.loc[aff[f'len{c}'].isna(), f'id{c}'].apply(lambda s: len(missing_fasta[missing_mapping[s]]))
    assert len(aff.loc[aff[f'len{c}'].isna()]) + len(aff.loc[aff[f'crc{c}'].isna()]) == 0, c
    aff[f'len{c}'] = aff[f'len{c}'].astype(int)

In [945]:
aff.dataset.value_counts()

HuRI           55638
hu.MAP         10361
1849 random     1849
Name: dataset, dtype: int64

In [946]:
aff.to_csv('uniprot/huintaf2_fixed.tsv', sep='\t',
           index=False, header=True,
           float_format='{:.5}'.format)

In [947]:
af2_seqs['missing'] = missing_fasta

In [948]:
with Path('uniprot/huintaf2_fixed.json').open('w') as json_file:
    json.dump(af2_seqs, json_file)

In [949]:
flat = dict()
for v in af2_seqs.values():
    flat |= v
len(flat)

14136

In [950]:
with Path('uniprot/huintaf2_fixed.fasta').open('w') as fasta:
    for _id in sorted(flat.keys()):
        to_fasta(_id, flat[_id], fasta)

## run SETH

In [None]:
#%env HOME=/mnt/project/kaindl
!bash "huintaf_seth.sh"

In [951]:
with Path('huintaf2_seth_scores.json').open('r') as json_file:
    seth_scores = json.load(json_file)
_id = next(iter(seth_scores.keys()))
_id, len(seth_scores)

('CRC-02A53B8AFBF34A17', 14135)

In [952]:
seth_scores[_id].keys()

dict_keys(['seq', 'diso_pred', 'confidence', 'zscores'])

In [953]:
af = aff[['ida', 'idb', 'pdockq', 'dataset', 'pdbfile',
          'prob', 'crca', 'crcb', 'lena', 'lenb', 'capri']].copy()
print(len(af))
af.head()

67848


Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,crca,crcb,lena,lenb,capri
0,A0A075B759,P0DN37,0.12113,hu.MAP,P0DN37-A0A075B759.pdb,0.999979,CRC-FD7DF57F1A6FDAA9,CRC-CEAFE5FE3D858213,164,164,incorrect
1,A0A0B4J2A2,P0DN37,0.098878,hu.MAP,P0DN37-A0A0B4J2A2.pdb,1.0,CRC-8FE074FFE5657EF8,CRC-CEAFE5FE3D858213,164,164,incorrect
2,A0AV96,Q9NQ94,0.033961,hu.MAP,Q9NQ94-A0AV96.pdb,0.547486,CRC-AEA061F89A68010B,CRC-AA5EF76BD8815807,593,594,incorrect
3,A0AV96,Q9NY65,0.060896,hu.MAP,Q9NY65-A0AV96.pdb,0.664063,CRC-AEA061F89A68010B,CRC-8D1AFB9D131529BD,593,449,incorrect
4,A0AVF1,Q86WT1,0.048306,hu.MAP,A0AVF1-Q86WT1.pdb,0.688305,CRC-188062024DB89B97,CRC-567FD2C26CC48435,554,665,incorrect


In [954]:
# calculate the average per-protein ChiZod-score
for c in 'ab':
    af[f'z{c}'] = af[f'crc{c}'].apply(lambda ida: np.array(
        seth_scores[ida]['zscores']).mean())
# and the average binary "residue-is-disordered" label
for c in 'ab':
    af[f'd{c}'] = af[f'crc{c}'].apply(lambda ida: np.array(
        seth_scores[ida]['diso_pred']).mean())

af['min_chezod'] = af[['za', 'zb']].min(axis=1)
af['max_diso'] = af[['da', 'db']].max(axis=1)
af['max_chezod'] = af[['za', 'zb']].max(axis=1)
af['avg_chezod'] = af[['za', 'zb']].mean(axis=1)
af = af.sort_values(by=['ida', 'idb'])
af.head()

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,crca,crcb,lena,lenb,capri,za,zb,da,db,min_chezod,max_diso,max_chezod,avg_chezod
0,A0A024R0Y4,O14964,0.077304,HuRI,ENSG00000185359-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-DD64167A19DCF030,443,777,incorrect,11.839822,6.937123,0.158014,0.516088,6.937123,0.516088,11.839822,9.388472
1,A0A024R0Y4,O15287,0.038752,HuRI,ENSG00000221829-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-4BC7475472AC3C84,443,622,incorrect,11.839822,11.712423,0.158014,0.12701,11.712423,0.158014,11.839822,11.776122
2,A0A024R0Y4,O60573,0.034888,HuRI,ENSG00000135930-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-3D3075BFA48B3C12,443,245,incorrect,11.839822,9.707793,0.158014,0.334694,9.707793,0.334694,11.839822,10.773807
3,A0A024R0Y4,O75431,0.075548,HuRI,ENSG00000128654-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-3A2EF476F1C78465,443,263,incorrect,11.839822,12.701147,0.158014,0.087452,11.839822,0.158014,12.701147,12.270484
4,A0A024R0Y4,O75528,0.730883,HuRI,ENSG00000171148-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-C86153CFA83F9226,443,432,medium,11.839822,8.650904,0.158014,0.388889,8.650904,0.388889,11.839822,10.245363


In [955]:
ours = ours[af.columns]
ours.loc[ours.ida > ours.idb, ['ida', 'idb', 'crca', 'crcb',
                               'za', 'zb', 'da', 'db']] = ours.loc[
    ours.ida > ours.idb, ['idb', 'ida', 'crcb', 'crca',
                          'zb', 'za', 'db', 'da']].values
ours = ours.sort_values(by=['ida', 'idb'])
ours.head()

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,crca,crcb,lena,lenb,capri,za,zb,da,db,min_chezod,max_diso,max_chezod,avg_chezod
563,A0A024RAC6,P53672,0.295323,514 HuRI $-$,results_chunk_3/predictions/231_unrelaxed_rank...,0.0,CRC-0D2C93B1D54EB09A,CRC-F9C8E7CD48EB16C7,772,197,acceptable,4.261565,12.785627,0.765544,0.055838,4.261565,0.765544,12.785627,8.523596
561,A0A024RAC6,Q8N2H4-1,0.018689,514 HuRI $-$,results_chunk_3/predictions/227_unrelaxed_rank...,0.0,CRC-0D2C93B1D54EB09A,CRC-541FBB6326C9C0BB,772,156,incorrect,4.261565,11.430454,0.765544,0.102564,4.261565,0.765544,11.430454,7.846009
562,A0A024RAC6,Q8N4P3-2,0.184515,514 HuRI $-$,results_chunk_3/predictions/230_unrelaxed_rank...,0.0,CRC-0D2C93B1D54EB09A,CRC-E0946B0FA7BD8109,772,179,incorrect,4.261565,12.547508,0.765544,0.022346,4.261565,0.765544,12.547508,8.404536
636,A0A087WTU5,O95471-1,0.071203,514 HuRI $-$,results_chunk_2/predictions/62_unrelaxed_rank_...,0.0,CRC-227A4503F960A10F,CRC-7F3CC1B963D9006A,400,211,incorrect,4.020177,10.933795,0.7525,0.156398,4.020177,0.7525,10.933795,7.476986
106,A0A087WTU5,Q13363-2,0.090263,516 HuRI $+$,results_chunk_5/predictions/147_unrelaxed_rank...,1.0,CRC-227A4503F960A10F,CRC-F071DD30B385603F,400,440,incorrect,4.020177,10.136711,0.7525,0.245455,4.020177,0.7525,10.136711,7.078444


In [956]:
ours.loc[ours.ida > ours.idb]
# now ours is sorted horizontally

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,crca,crcb,lena,lenb,capri,za,zb,da,db,min_chezod,max_diso,max_chezod,avg_chezod


In [957]:
h4 = pd.concat((af, ours))
h4.head()

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,crca,crcb,lena,lenb,capri,za,zb,da,db,min_chezod,max_diso,max_chezod,avg_chezod
0,A0A024R0Y4,O14964,0.077304,HuRI,ENSG00000185359-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-DD64167A19DCF030,443,777,incorrect,11.839822,6.937123,0.158014,0.516088,6.937123,0.516088,11.839822,9.388472
1,A0A024R0Y4,O15287,0.038752,HuRI,ENSG00000221829-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-4BC7475472AC3C84,443,622,incorrect,11.839822,11.712423,0.158014,0.12701,11.712423,0.158014,11.839822,11.776122
2,A0A024R0Y4,O60573,0.034888,HuRI,ENSG00000135930-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-3D3075BFA48B3C12,443,245,incorrect,11.839822,9.707793,0.158014,0.334694,9.707793,0.334694,11.839822,10.773807
3,A0A024R0Y4,O75431,0.075548,HuRI,ENSG00000128654-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-3A2EF476F1C78465,443,263,incorrect,11.839822,12.701147,0.158014,0.087452,11.839822,0.158014,12.701147,12.270484
4,A0A024R0Y4,O75528,0.730883,HuRI,ENSG00000171148-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-C86153CFA83F9226,443,432,medium,11.839822,8.650904,0.158014,0.388889,8.650904,0.388889,11.839822,10.245363


In [958]:
h4.dataset.value_counts()

HuRI            55638
hu.MAP          10361
1849 random      1849
516 HuRI $+$      516
514 HuRI $-$      514
Name: dataset, dtype: int64

In [959]:
h4.to_csv('huintaf2_scores.tsv', sep='\t', header=True, index=False)

In [972]:
h4.loc[h4.lena == h4.lenb]

Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,crca,crcb,lena,lenb,capri,za,zb,da,db,min_chezod,max_diso,max_chezod,avg_chezod
0,A0A075B759,P0DN37,0.121130,hu.MAP,P0DN37-A0A075B759.pdb,0.999979,CRC-FD7DF57F1A6FDAA9,CRC-CEAFE5FE3D858213,164,164,incorrect,12.515006,12.575314,0.018293,0.006098,12.515006,0.018293,12.575314,12.545160
1,A0A0B4J2A2,P0DN37,0.098878,hu.MAP,P0DN37-A0A0B4J2A2.pdb,1.000000,CRC-8FE074FFE5657EF8,CRC-CEAFE5FE3D858213,164,164,incorrect,12.573576,12.575314,0.018293,0.006098,12.573576,0.018293,12.575314,12.574445
217,A0A140G945,A0A140G945,0.371798,HuRI,ENSG00000276076-ENSG00000276076.pdb,1.000000,CRC-81804A8439837D50,CRC-81804A8439837D50,173,173,acceptable,8.826032,8.826032,0.468208,0.468208,8.826032,0.468208,8.826032,8.826032
219,A0A140G945,P02489,0.487291,HuRI,ENSG00000160202-ENSG00000276076.pdb,1.000000,CRC-81804A8439837D50,CRC-81804A8439837D50,173,173,acceptable,8.826032,8.826032,0.468208,0.468208,8.826032,0.468208,8.826032,8.826032
272,A0A2R8Y4R9,A0A2R8Y4R9,0.047575,HuRI,ENSG00000105991-ENSG00000105991.pdb,1.000000,CRC-663A212239A41D26,CRC-663A212239A41D26,335,335,incorrect,3.241654,3.241654,0.770149,0.770149,3.241654,0.770149,3.241654,3.241654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10176,Q9Y5H5,Q9Y5H8,0.090635,hu.MAP,Q9Y5H8-Q9Y5H5.pdb,0.764448,CRC-AA1783B20D963957,CRC-0BF2CD4886D178B5,950,950,incorrect,8.988215,9.075795,0.309474,0.310526,8.988215,0.310526,9.075795,9.032005
10182,Q9Y5H8,Q9Y5I3,0.150597,hu.MAP,Q9Y5H8-Q9Y5I3.pdb,0.969076,CRC-0BF2CD4886D178B5,CRC-9FC170365565908A,950,950,incorrect,9.075795,9.151294,0.310526,0.301053,9.075795,0.310526,9.151294,9.113544
10183,Q9Y5H9,Q9Y5I2,0.106641,hu.MAP,Q9Y5I2-Q9Y5H9.pdb,1.000000,CRC-8D2E9644982AE59E,CRC-8F3C734B05EF4FB8,948,948,incorrect,8.942367,9.259448,0.325949,0.300633,8.942367,0.325949,9.259448,9.100907
55613,Q9Y5V3,Q9Y5V3,0.031996,HuRI,ENSG00000179222-ENSG00000179222.pdb,1.000000,CRC-D818690052D166CE,CRC-D818690052D166CE,778,778,incorrect,5.834958,5.834958,0.647815,0.647815,5.834958,0.647815,5.834958,5.834958


## compare with old DF

In [961]:
old = pd.read_csv('huintaf2_scores_bak.tsv', sep='\t')
old = old.rename(columns=dict(hash_A='crca', hash_B='crcb'))
print(len(old))
old.head()

68363


Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,crca,crcb,za,zb,da,db,min_chezod,max_diso,max_chezod,avg_chezod,capri
0,A0A024R0Y4,O14964,0.077304,HuRI,ENSG00000185359-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-DD64167A19DCF030,11.84117,6.937132,0.158014,0.516088,6.937132,0.516088,11.84117,9.389151,incorrect
1,A0A024R0Y4,O15287,0.038752,HuRI,ENSG00000221829-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-4BC7475472AC3C84,11.84117,11.711823,0.158014,0.12701,11.711823,0.158014,11.84117,11.776496,incorrect
2,A0A024R0Y4,O60573,0.034888,HuRI,ENSG00000135930-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-3D3075BFA48B3C12,11.84117,9.707791,0.158014,0.334694,9.707791,0.334694,11.84117,10.77448,incorrect
3,A0A024R0Y4,O75431,0.075548,HuRI,ENSG00000128654-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-3A2EF476F1C78465,11.84117,12.701179,0.158014,0.087452,11.84117,0.158014,12.701179,12.271174,incorrect
4,A0A024R0Y4,O75528,0.730883,HuRI,ENSG00000171148-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-C86153CFA83F9226,11.84117,8.64969,0.158014,0.388889,8.64969,0.388889,11.84117,10.24543,medium


In [962]:
print(len(h4))
h4.head()

68878


Unnamed: 0,ida,idb,pdockq,dataset,pdbfile,prob,crca,crcb,lena,lenb,capri,za,zb,da,db,min_chezod,max_diso,max_chezod,avg_chezod
0,A0A024R0Y4,O14964,0.077304,HuRI,ENSG00000185359-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-DD64167A19DCF030,443,777,incorrect,11.839822,6.937123,0.158014,0.516088,6.937123,0.516088,11.839822,9.388472
1,A0A024R0Y4,O15287,0.038752,HuRI,ENSG00000221829-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-4BC7475472AC3C84,443,622,incorrect,11.839822,11.712423,0.158014,0.12701,11.712423,0.158014,11.839822,11.776122
2,A0A024R0Y4,O60573,0.034888,HuRI,ENSG00000135930-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-3D3075BFA48B3C12,443,245,incorrect,11.839822,9.707793,0.158014,0.334694,9.707793,0.334694,11.839822,10.773807
3,A0A024R0Y4,O75431,0.075548,HuRI,ENSG00000128654-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-3A2EF476F1C78465,443,263,incorrect,11.839822,12.701147,0.158014,0.087452,11.839822,0.158014,12.701147,12.270484
4,A0A024R0Y4,O75528,0.730883,HuRI,ENSG00000171148-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-C86153CFA83F9226,443,432,medium,11.839822,8.650904,0.158014,0.388889,8.650904,0.388889,11.839822,10.245363


In [963]:
h4.ida = h4.ida.apply(lambda s: s.split('-')[0])
h4.idb = h4.idb.apply(lambda s: s.split('-')[0])
h4 = h4.drop_duplicates()
len(h4)

68878

In [964]:
ll = ['ida', 'idb', 'dataset']
mm = h4.merge(old, how='left',
         left_on=ll,
         right_on=ll,

         )
mm

Unnamed: 0,ida,idb,pdockq_x,dataset,pdbfile_x,prob_x,crca_x,crcb_x,lena,lenb,...,crcb_y,za_y,zb_y,da_y,db_y,min_chezod_y,max_diso_y,max_chezod_y,avg_chezod_y,capri_y
0,A0A024R0Y4,O14964,0.077304,HuRI,ENSG00000185359-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-DD64167A19DCF030,443,777,...,CRC-DD64167A19DCF030,11.841170,6.937132,0.158014,0.516088,6.937132,0.516088,11.841170,9.389151,incorrect
1,A0A024R0Y4,O15287,0.038752,HuRI,ENSG00000221829-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-4BC7475472AC3C84,443,622,...,CRC-4BC7475472AC3C84,11.841170,11.711823,0.158014,0.127010,11.711823,0.158014,11.841170,11.776496,incorrect
2,A0A024R0Y4,O60573,0.034888,HuRI,ENSG00000135930-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-3D3075BFA48B3C12,443,245,...,CRC-3D3075BFA48B3C12,11.841170,9.707791,0.158014,0.334694,9.707791,0.334694,11.841170,10.774480,incorrect
3,A0A024R0Y4,O75431,0.075548,HuRI,ENSG00000128654-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-3A2EF476F1C78465,443,263,...,CRC-3A2EF476F1C78465,11.841170,12.701179,0.158014,0.087452,11.841170,0.158014,12.701179,12.271174,incorrect
4,A0A024R0Y4,O75528,0.730883,HuRI,ENSG00000171148-ENSG00000276234.pdb,1.0,CRC-27D3F4F4D5EFF3C8,CRC-C86153CFA83F9226,443,432,...,CRC-C86153CFA83F9226,11.841170,8.649690,0.158014,0.388889,8.649690,0.388889,11.841170,10.245430,medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68873,Q9NX76,Q9Y316,0.217307,514 HuRI $-$,results_chunk_1/predictions/1_unrelaxed_rank_1...,0.0,CRC-43DAEB652A6B50B4,CRC-E315FD5587776211,183,297,...,CRC-E315FD5587776211,10.179374,13.413964,0.245902,0.016835,10.179374,0.245902,13.413964,11.796669,incorrect
68874,Q9UBR2,Q9Y2B5,0.045651,514 HuRI $-$,results_chunk_3/predictions/213_unrelaxed_rank...,0.0,CRC-6274FD1974D0EBDC,CRC-0C302E7BA496B27A,631,303,...,CRC-0C302E7BA496B27A,10.715342,8.735483,0.158416,0.375594,8.735483,0.375594,10.715342,9.725413,incorrect
68875,Q9UL45,Q9Y3C0,0.334684,516 HuRI $+$,results_chunk_4/predictions/38_unrelaxed_rank_...,1.0,CRC-37902FCFD4802294,CRC-B0CCF1AE76CDA6D2,172,194,...,CRC-B0CCF1AE76CDA6D2,9.385123,7.852589,0.302326,0.448454,7.852589,0.448454,9.385123,8.618856,acceptable
68876,Q9UNS2,Q9Y3C0,0.182672,514 HuRI $-$,results_chunk_2/predictions/40_unrelaxed_rank_...,0.0,CRC-1D371050C7D7BF8D,CRC-B0CCF1AE76CDA6D2,423,194,...,CRC-B0CCF1AE76CDA6D2,12.237804,7.852589,0.096927,0.448454,7.852589,0.448454,12.237804,10.045196,incorrect


In [965]:
mm.loc[(mm.crca_x != mm.crca_y) | (mm.crcb_x != mm.crcb_y), ['ida', 'idb', 'dataset', 'crca_x', 'crca_y', 'crcb_x', 'crcb_y']]

Unnamed: 0,ida,idb,dataset,crca_x,crca_y,crcb_x,crcb_y
77,A0A087WZ82,Q12983,HuRI,CRC-9122B7D23A61495E,CRC-9122B7D23A61495E,CRC-53967696EDA005E2,CRC-6D79E68F146D25EB
176,A0A0B4J1R5,O00526,HuRI,CRC-D1C2B66B8898F538,CRC-D1C4D06DE8FEF35E,CRC-83D963F06050A585,CRC-83D963F06050A585
177,A0A0B4J1R5,O95183,HuRI,CRC-D1C2B66B8898F538,CRC-D1C4D06DE8FEF35E,CRC-32A1F3B808A6016C,CRC-32A1F3B808A6016C
178,A0A0B4J1R5,P01031,HuRI,CRC-D1C2B66B8898F538,CRC-D1C4D06DE8FEF35E,CRC-A7589E352F74672A,CRC-A7589E352F74672A
179,A0A0B4J1R5,P19397,HuRI,CRC-D1C2B66B8898F538,CRC-D1C4D06DE8FEF35E,CRC-48D4633DBC9110B6,CRC-48D4633DBC9110B6
...,...,...,...,...,...,...,...
67717,Q9UPX8,Q9UQB8,HuRI,CRC-7322993CB9FE0929,CRC-45550DC5D5701399,CRC-3B9EDC6405DCC99D,CRC-3B9EDC6405DCC99D
67734,Q9Y232,Q9Y3U8,1849 random,CRC-A34E7221130626EC,,CRC-FEE8850A62080EB3,
67754,Q9Y297,Q9Y2T1,HuRI,CRC-4C67F3B7E400FD37,CRC-4C67F3B7E400FD37,CRC-F7B62BED6AB4664D,CRC-23E9C72187A9230C
67778,Q9Y2T1,Q9Y6W3,HuRI,CRC-F7B62BED6AB4664D,CRC-23E9C72187A9230C,CRC-814769611360D281,CRC-814769611360D281


In [966]:
pairs = mm.loc[(mm.crca_x != mm.crca_y) | (
        mm.crcb_x != mm.crcb_y), ['crca_x', 'crcb_x', 'dataset']].copy()
pairs = dedup_pairs(pairs)
print(len(pairs))
pairs.head()

2057


Unnamed: 0,crca_x,crcb_x,dataset
77,CRC-53967696EDA005E2,CRC-9122B7D23A61495E,HuRI
176,CRC-83D963F06050A585,CRC-D1C2B66B8898F538,HuRI
177,CRC-32A1F3B808A6016C,CRC-D1C2B66B8898F538,HuRI
178,CRC-A7589E352F74672A,CRC-D1C2B66B8898F538,HuRI
179,CRC-48D4633DBC9110B6,CRC-D1C2B66B8898F538,HuRI


In [967]:
pairs.dataset.value_counts()

HuRI           1228
1849 random     807
hu.MAP           22
Name: dataset, dtype: int64

In [968]:
pairs[['crca_x', 'crcb_x']].to_csv(
    'changed_pairs.tsv', sep='\t', header=False, index=False)