## Obtaining coordinates from a FI network using UMAP

In [1]:
# Importing necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap

# Importing Node2Vec embeddings
emb = pd.read_csv('n2v-out.emb', skiprows = 1, sep = ' ', header = None, index_col=0)
emb_data = emb.to_numpy()
emb_labels = emb.index.to_numpy()

# UMAP
umap_emb = umap.UMAP(
    n_epochs = 400,
    random_state = 42,
    metric = 'dice',
    output_metric = 'mahalanobis'
).fit_transform(emb_data)

In [9]:
umap_emb_x, umap_emb_y = zip(*umap_emb)
umap_coord = pd.DataFrame([emb_labels, umap_emb_x, umap_emb_y]).transpose().rename(columns = {0: 'node_id', 1: 'x', 2: 'y'})

In [10]:
umap_coord

Unnamed: 0,node_id,x,y
0,4923.0,12.825298,3.014132
1,1059.0,13.220130,2.583034
2,145.0,12.807132,3.151362
3,872.0,12.943302,0.677321
4,2304.0,13.129017,1.030829
...,...,...,...
10951,8980.0,9.994845,2.994419
10952,10114.0,11.805233,2.998760
10953,8022.0,10.399275,4.330402
10954,10670.0,8.666698,6.282478


## Obtaining the template for UniProt/ENSEMBL/node id conversions

In [44]:
network = pd.read_csv('FIs_043009.txt', header=None, sep = ' ').rename(columns = {0: 'from', 1: 'to'})
biomart = pd.read_csv('mart_export.tsv', sep='\t')
used_uprots = pd.read_csv('node_id_name.tsv', sep='\t').rename(columns = {'name': 'UniProtKB Gene Name ID'})

human_genes = biomart.drop_duplicates(subset = 'Gene stable ID', keep = 'first', ignore_index = True).sort_values(by = 'Gene stable ID', ignore_index = True).dropna(subset = ['UniProtKB Gene Name ID', 'UniProtKB/Swiss-Prot ID', 'UniProtKB/TrEMBL ID'], how = 'all').fillna(axis = 1, method = 'bfill').drop(columns = ['UniProtKB/Swiss-Prot ID', 'UniProtKB/TrEMBL ID'])

In [45]:
human_genes

Unnamed: 0,Gene stable ID,UniProtKB Gene Name ID
0,ENSG00000000003,O43657
1,ENSG00000000005,Q9H2S6
2,ENSG00000000419,O60762
3,ENSG00000000457,Q8IZE3
4,ENSG00000000460,Q9NSG2
...,...,...
22788,ENSG00000288513,Q96R69
22789,ENSG00000288516,P17612
22791,ENSG00000288520,G9CGD6
22793,ENSG00000288534,H3BSU7


In [52]:
used_uprots

Unnamed: 0,UniProtKB Gene Name ID,node_id
0,Q494X3,1
1,P26006,2
2,O43296,3
3,P83876,4
4,O95622,5
...,...,...
10951,Q8WU10,10952
10952,P12277,10953
10953,Q96T51,10954
10954,Q8ND90,10955


In [55]:
template = pd.merge(human_genes, used_uprots, how = 'inner', on = 'UniProtKB Gene Name ID')

In [56]:
template

Unnamed: 0,Gene stable ID,UniProtKB Gene Name ID,node_id
0,ENSG00000000419,O60762,6479
1,ENSG00000000938,P09769,2261
2,ENSG00000001084,P48506,5576
3,ENSG00000001167,P23511,5652
4,ENSG00000001617,Q13275,5484
...,...,...,...
10223,ENSG00000285053,Q15813,2744
10224,ENSG00000284194,O43819,6667
10225,ENSG00000284723,Q8NH09,10415
10226,ENSG00000286169,A9YTQ3,4206
