In [1]:
import pandas as pd
import pickle
import re
import random

In [2]:
e2u_file = '../ppi_ml/annotations/uniprot/EntrezToUniprot_clean.csv'
u2og_file = '../ppi_ml/data/og_proteomes/nog_mapping/human.euNOG.diamond.mapping.2759'
fmat_file = '../ppi_ml/data/apms/humap/humap2_feature_matrix_20200820.pkl'

In [3]:
u2og = pd.read_csv(u2og_file, sep='\t')
u2og['uniprot_id'] = u2og['ProteinID'].str.extract(r'(?<=\|)(.*)(?=\|)')    
u2og.head()

Unnamed: 0,ProteinID,ID,uniprot_id
0,tr|A0A024R1R8|A0A024R1R8_HUMAN,KOG4766,A0A024R1R8
1,sp|A0A024RBG1|NUD4B_HUMAN,KOG2839,A0A024RBG1
2,tr|A0A024RCN7|A0A024RCN7_HUMAN,ENOG502R0UR,A0A024RCN7
3,tr|A0A075B6H5|A0A075B6H5_HUMAN,ENOG502SA48,A0A075B6H5
4,sp|A0A075B6H7|KV37_HUMAN,ENOG502S3KF,A0A075B6H7


In [4]:
og_dict = dict()
og_issues = []
og_dupes = []
for i in range(len(u2og)):
    up = u2og['uniprot_id'][i]
    og = u2og['ID'][i]
    if up not in og_dict.keys():
        og_dict.update({up: og})
    else:
        if u2og['ID'][i] != og_dict[up]:
            og_issues.append(up)
            og_dict.pop(up)
        if up not in og_dupes:
            og_dupes.append(up)

In [5]:
print("# of total duplicated UniProt IDs = ", len(og_dupes))
print("# of UniProt IDs that map to multiple eggNOG IDs = ", len(set(og_issues)))

# of total duplicated UniProt IDs =  347
# of UniProt IDs that map to multiple eggNOG IDs =  347


In [6]:
sample_og_issues = random.sample(og_issues, 10)
u2og.apply(lambda row: row[u2og['uniprot_id'].isin(sample_og_issues)])

Unnamed: 0,ProteinID,ID,uniprot_id
2153,sp|O60229|KALRN_HUMAN,KOG0032,O60229
2154,sp|O60229|KALRN_HUMAN,KOG0689,O60229
2155,sp|O60229|KALRN_HUMAN,KOG4240,O60229
2813,sp|O94910|AGRL1_HUMAN,KOG3545,O94910
2814,sp|O94910|AGRL1_HUMAN,KOG4193,O94910
2815,sp|O94910|AGRL1_HUMAN,KOG4729,O94910
9562,sp|Q5QP82|DCA10_HUMAN,KOG0264,Q5QP82
9563,sp|Q5QP82|DCA10_HUMAN,KOG4155,Q5QP82
9803,sp|Q5TCS8|KAD9_HUMAN,KOG3078,Q5TCS8
9804,sp|Q5TCS8|KAD9_HUMAN,KOG3079,Q5TCS8


In [7]:
e2u = pd.read_csv(e2u_file)
e2u['entrez_id'] = e2u['entrez_id'].astype(str).replace('\.0', '', regex=True)
e2u.dropna(inplace=True)
e2u.reset_index(drop=True, inplace=True)
e2u.head()

Unnamed: 0,gene_stable_id,gene_stable_id_version,entrez_id,uniprot_id,uniprot_sp_id
0,ENSG00000198888,ENSG00000198888.2,4535,P03886,P03886
1,ENSG00000198888,ENSG00000198888.2,4535,U5Z754,P03886
2,ENSG00000198763,ENSG00000198763.3,4536,P03891,P03891
3,ENSG00000198763,ENSG00000198763.3,4536,Q7GXY9,P03891
4,ENSG00000198763,ENSG00000198763.3,4536,A0A1X7RBG6,P03891


In [8]:
ensembl_dict = dict()
ensembl_issues = []
ensembl_dupes = []
for i in range(len(e2u)):
    up = e2u['uniprot_sp_id'][i]
    ens = e2u['gene_stable_id'][i]
    if ens in ensembl_issues:
        continue
    elif ens not in ensembl_dict.keys():
        ensembl_dict.update({ens: up})
    else:
        if e2u['uniprot_sp_id'][i] != ensembl_dict[ens]:
            ensembl_issues.append(ens)
            ensembl_dict.pop(ens)
        if ens not in ensembl_dupes:
            ensembl_dupes.append(ens)

In [9]:
print("# of total Ensembl IDs uniquely mapped to UniProt IDs = ", len(ensembl_dict))
print("# of total duplicated Ensembl IDs = ", len(ensembl_dupes))
print("# of Ensembl IDs that map to multiple UniProt IDs = ", len(set(ensembl_issues)))

# of total Ensembl IDs uniquely mapped to UniProt IDs =  21236
# of total duplicated Ensembl IDs =  15456
# of Ensembl IDs that map to multiple UniProt IDs =  41


In [10]:
sample_ensembl_issues = random.sample(ensembl_issues, 3)
e2u.apply(lambda row: row[e2u['gene_stable_id'].isin(sample_ensembl_issues)])

Unnamed: 0,gene_stable_id,gene_stable_id_version,entrez_id,uniprot_id,uniprot_sp_id
12553,ENSG00000212710,ENSG00000212710.5,64693.0,Q9HC47,Q9HC47
12554,ENSG00000212710,ENSG00000212710.5,64693.0,Q96RT6,Q9HC47
12558,ENSG00000212710,ENSG00000212710.5,64693.0,Q9HC47,Q96RT6
12559,ENSG00000212710,ENSG00000212710.5,64693.0,Q96RT6,Q96RT6
16789,ENSG00000270550,ENSG00000270550.1,,P01768,P01768
16790,ENSG00000270550,ENSG00000270550.1,,P0DP03,P01768
16791,ENSG00000270550,ENSG00000270550.1,,P01768,P0DP03
16792,ENSG00000270550,ENSG00000270550.1,,P0DP03,P0DP03
41013,ENSG00000283063,ENSG00000283063.1,,A0A0J9YXY3,A0A0J9YXY3
41014,ENSG00000283063,ENSG00000283063.1,,P0DPF7,A0A0J9YXY3


In [11]:
entrez_dict = dict()
entrez_issues = []
entrez_dupes = []
for i in range(len(e2u)):
    up = e2u['uniprot_sp_id'][i]
    entrez = e2u['entrez_id'][i]
    if entrez == 'nan':
        continue
    elif entrez in entrez_issues:
        continue
    elif entrez not in entrez_dict.keys():
        entrez_dict.update({entrez: up})
    else:
        if e2u['uniprot_sp_id'][i] != entrez_dict[entrez]:
            entrez_issues.append(entrez)
            entrez_dict.pop(entrez)
        if entrez not in entrez_dupes:
            entrez_dupes.append(entrez)

In [12]:
print("# of total Entrez IDs uniquely mapped to UniProt IDs = ", len(entrez_dict))
print("# of total duplicated Entrez IDs = ", len(entrez_dupes))
print("# of Entrez IDs that map to multiple UniProt IDs = ", len(set(entrez_issues)))

# of total Entrez IDs uniquely mapped to UniProt IDs =  19105
# of total duplicated Entrez IDs =  14188
# of Entrez IDs that map to multiple UniProt IDs =  109


In [13]:
sample_entrez_issues = random.sample(entrez_issues, 3)
e2u.apply(lambda row: row[e2u['entrez_id'].isin(sample_entrez_issues)])

Unnamed: 0,gene_stable_id,gene_stable_id_version,entrez_id,uniprot_id,uniprot_sp_id
33441,ENSG00000215269,ENSG00000215269.6,645073,O76087,P0CL81
33442,ENSG00000215269,ENSG00000215269.6,645073,P0CL81,P0CL81
33443,ENSG00000215269,ENSG00000215269.6,645073,P0CL82,P0CL81
33456,ENSG00000215269,ENSG00000215269.6,645073,O76087,O76087
33457,ENSG00000215269,ENSG00000215269.6,645073,P0CL81,O76087
33458,ENSG00000215269,ENSG00000215269.6,645073,P0CL82,O76087
33471,ENSG00000215269,ENSG00000215269.6,645073,O76087,P0CL82
33472,ENSG00000215269,ENSG00000215269.6,645073,P0CL81,P0CL82
33473,ENSG00000215269,ENSG00000215269.6,645073,P0CL82,P0CL82
54472,ENSG00000236125,ENSG00000236125.3,392188,A6NCW7,A6NCW7


In [14]:
# PSMA4
entrez_dict['5685']

'P25789'

In [15]:
with open(fmat_file, 'rb') as handle:
    apms_fmat = pickle.load(handle)

In [16]:
apms_fmat['id1'] = apms_fmat['id1'].astype(str).replace('\.0', '', regex=True)
apms_fmat['id2'] = apms_fmat['id2'].astype(str).replace('\.0', '', regex=True)

In [17]:
apms_fmat.head()

Unnamed: 0,id1,id2,Ce_1111_poisson,Ce_1111_wcc,Ce_1111_apex,Ce_1111_pq_euc,Ce_6mg_1203_poisson,Ce_6mg_1203_wcc,Ce_6mg_1203_apex,Ce_6mg_1203_pq_euc,...,FoldChange,BFDR_youn,neg_ln_pval_youn_hygeo,pair_count_youn_hygeo,neg_ln_pval_youn_hygeo_gt2,pair_count_youn_hygeo_gt2,neg_ln_pval_youn_hygeo_gt4,pair_count_youn_hygeo_gt4,neg_ln_pval_treiber_hygeo_gt2,pair_count_treiber_hygeo_gt2
0,1,1,,,,,,,,,...,,,,,,,,,,
1,1,10005,,,,,,,,,...,,,,,,,,,,
2,1,10015,,,,,,,,,...,,,,,,,,,,
3,1,10043,,,,,,,,,...,,,,,,,,,,
4,1,10048,,,,,,,,,...,,,,,,,,,,


## Convert Ensembl/Entrez IDs to UniProt IDs:

In [19]:
old_id1_list = []
old_id2_list = []
new_id1_list = []
new_id2_list = []
for i in range(len(apms_fmat)):
    id1 = apms_fmat['id1'][i]
    id2 = apms_fmat['id2'][i]
    if id1.startswith('ENSG'):
        id1_new = (ensembl_dict.get(id1) if id1 in ensembl_dict else id1)
    else:
        id1_new = (entrez_dict.get(id1) if id1 in entrez_dict else id1)
    if id2.startswith('ENSG'):
        id2_new = (ensembl_dict.get(id2) if id2 in ensembl_dict else id2)
    else:
        id2_new = (entrez_dict.get(id2) if id2 in entrez_dict else id2)
    old_id1_list.append(id1)
    old_id2_list.append(id2)
    new_id1_list.append(id1_new)
    new_id2_list.append(id2_new)

In [20]:
ensembl_dict['ENSG00000146963']

'Q9Y383'

In [21]:
id1_replaced = []
for index, (old, new) in enumerate(zip(old_id1_list, new_id1_list)):
    if old != new:
        id1_replaced.append([index, old, new])
len(id1_replaced)

17278210

In [22]:
id2_replaced = []
for index, (old, new) in enumerate(zip(old_id2_list, new_id2_list)):
    if old != new:
        id2_replaced.append([index, old, new])
len(id2_replaced)

17270974

In [23]:
replaced_df1 = pd.DataFrame(id1_replaced, columns=['index_id1','old_id1','new_id1'])
replaced_df2 = pd.DataFrame(id2_replaced, columns=['index_id2','old_id2','new_id2'])
replaced_df1.to_csv('../ppi_ml/data/apms/humap/humap2_20200820_id1s_replaced.csv', index=False)
replaced_df2.to_csv('../ppi_ml/data/apms/humap/humap2_20200820_id2s_replaced.csv', index=False)

In [24]:
scores = apms_fmat.drop(['id1', 'id2'], axis=1)
uniprot_df = pd.DataFrame()
uniprot_df['ID1'] = new_id1_list
uniprot_df['ID2'] = new_id2_list
uniprot_fmat = uniprot_df.join(scores)

In [25]:
## free up some RAM
del apms_fmat
del scores
## write out intermediate results
uniprot_fmat.to_csv('../ppi_ml/data/apms/humap/humap2_featmat_20200820.upids.csv', index=False, na_rep='')
uniprot_fmat.to_pickle('../ppi_ml/data/apms/humap/humap2_featmat_20200820.upids.pkl')

## Convert UniProt IDs to euNOG IDs:

In [26]:
new_id1_list = []
new_id2_list = []
for i in range(len(uniprot_fmat)):
    id1 = uniprot_fmat['ID1'][i]
    id2 = uniprot_fmat['ID2'][i]
    id1_new = (og_dict.get(id1) if id1 in og_dict else id1)
    id2_new = (og_dict.get(id2) if id2 in og_dict else id2)
    new_id1_list.append(id1_new)
    new_id2_list.append(id2_new)

In [27]:
scores = uniprot_fmat.drop(['ID1', 'ID2'], axis=1)
og_df = pd.DataFrame()
og_df['ID1'] = new_id1_list
og_df['ID2'] = new_id2_list
og_fmat = og_df.join(scores)htop

In [28]:
del uniprot_fmat
del scores

In [29]:
len(og_fmat)

17564755

In [30]:
ogs_only = og_fmat[og_fmat['ID1'].str.startswith(('ENOG','KOG')) & og_fmat['ID2'].str.startswith(('ENOG','KOG'))]

In [31]:
len(ogs_only)

16234591

In [32]:
ogs_only.to_csv('../ppi_ml/data/apms/humap/humap2_featmat_20200820.euNOGs.csv', index=False, na_rep='')
ogs_only.to_pickle('../ppi_ml/data/apms/humap/humap2_featmat_20200820.euNOGs.pkl')

In [34]:
#del og_fmat
del ogs_only

## Get target score columns:

In [48]:
fmat_file = '../ppi_ml/data/apms/humap/orig9k_featmat.euNOGs.pkl'
feat_string = 'ext_Dm_guru,ext_Hs_malo,entropy_orig9k,zscore_orig9k,nwdscore_orig9k,plate_zscore_orig9k,uPeps_orig9k,neg_ln_pval,pair_count,prey.bait.correlation,valid.values,hein_neg_ln_pval,hein_pair_count,ave_apsm,nwdscore_bioplex2,zscore_bioplex2,plate_zscore_bioplex2,entropy_bioplex2,uPeps_bioplex2,neg_ln_pval_bioplex2_Z4,pair_count_bioplex2_Z4,neg_ln_pval_bioplex2_Z2,pair_count_bioplex2_Z2,neg_ln_pval_cilium_hygeo,pair_count_cilium_hygeo,neg_ln_pval_cilium_hygeo_avgspec2,pair_count_cilium_hygeo_avgspec2,neg_ln_pval_cilium_hygeo_avgspec4,pair_count_cilium_hygeo_avgspec4,SAij,Sij,Sji,Mij,neg_ln_pval_boldt_apms_hygeo,pair_count_boldt_apms_hygeo,neg_ln_pval_boldt_apms_hygeo_gt4,pair_count_boldt_apms_hygeo_gt4,neg_ln_pval_treiber_hygeo_gt4,pair_count_treiber_hygeo_gt4,neg_ln_pval_youn_hygeo,pair_count_youn_hygeo,neg_ln_pval_youn_hygeo_gt2,pair_count_youn_hygeo_gt2,neg_ln_pval_youn_hygeo_gt4,pair_count_youn_hygeo_gt4,neg_ln_pval_treiber_hygeo_gt2,pair_count_treiber_hygeo_gt2'

In [49]:
id_cols = ['ID1','ID2']
target_feats = feat_string.split(',')
len(feats)

47

In [50]:
with open(fmat_file, 'rb') as handle:
    fmat = pickle.load(handle)

In [51]:
fmat = fmat[id_cols+target_feats]
fmat.head()

Unnamed: 0,ID1,ID2,ext_Dm_guru,ext_Hs_malo,entropy_orig9k,zscore_orig9k,nwdscore_orig9k,plate_zscore_orig9k,uPeps_orig9k,neg_ln_pval,...,neg_ln_pval_treiber_hygeo_gt4,pair_count_treiber_hygeo_gt4,neg_ln_pval_youn_hygeo,pair_count_youn_hygeo,neg_ln_pval_youn_hygeo_gt2,pair_count_youn_hygeo_gt2,neg_ln_pval_youn_hygeo_gt4,pair_count_youn_hygeo_gt4,neg_ln_pval_treiber_hygeo_gt2,pair_count_treiber_hygeo_gt2
0,ENOG502RYEX,ENOG502RYEX,,,,,,,,,...,,,,,,,,,,
1,KOG1548,KOG4095,,,,,,,,,...,,,,,,,,,,
2,ENOG502QREN,KOG1548,,,,,,,,,...,,,2.914611,2.0,1.585333,1.0,,,,
3,KOG4269,KOG1548,,,,,,,,,...,,,0.064687,2.0,,,,,,
4,KOG1548,KOG0999,,,,,,,,,...,,,,,,,,,,


In [52]:
fmat.to_csv('../ppi_ml/data/apms/humap/orig9k_featmat.euNOGs.target_scores.csv', index=False, na_rep='')
fmat.to_pickle('../ppi_ml/data/apms/humap/orig9k_featmat.euNOGs.target_scores.pkl')

In [53]:
del fmat