In [11]:
# Housekeeping
import os, time, requests

# Math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io

# Visualization
import plotly.express as px
import seaborn as sns
from ipysigma import Sigma
from pelote import edges_table_to_graph

# Processing
import multiprocessing
from concurrent.futures import ThreadPoolExecutor

In [12]:
# Path to the MATLAB file, relative to the notebook's working directory.
# NOTE(review): `dir` shadows the Python builtin and holds a file path, not a
# directory; it is kept as-is because other cells may still reference it.
dir = os.path.join('..', 'data', 'cancer types.mat')

# Parse the .mat file; the result is indexed by variable name in later cells.
mat = scipy.io.loadmat(file_name=dir)

In [13]:
# Gene identifiers are parsed first so the column split below is derived from
# their count instead of a hard-coded width.
genes = [gene_id[0] for gene_id in mat['geneIds'][0]]
n_genes = len(genes)

# Each entry is unwrapped with [0][0] to reach the stored value
# (presumably loadmat's nested cell-array layout; verify against the file).
cancerTypes = [entry[0][0] for entry in mat['cancerTypes']]

# The column immediately after the gene columns indicates the cancer type.
encodedTypes = mat['data'][:, n_genes]
data = mat['data'][:, :n_genes]  # removed column indicating cancer type

In [14]:
# One column per entry in `genes`; rows keep the row order of `data`.
df = pd.DataFrame(data, columns=genes)
df

Unnamed: 0,AARS,ABCB6,ABCC5,ABCF1,ABCF3,ABHD4,ABHD6,ABL1,ACAA1,ACAT2,...,ZMIZ1,ZMYM2,ZNF131,ZNF274,ZNF318,ZNF395,ZNF451,ZNF586,ZNF589,ZW10
0,44.023542,9.216286,11.319078,33.215176,16.901427,9.031338,1.109961,20.017821,16.724363,10.494192,...,29.275809,21.706486,16.315579,4.224009,8.602081,23.762341,8.302416,1.408731,4.295620,8.768768
1,29.746157,9.765600,40.540128,30.169134,20.047393,32.237287,2.460624,17.029112,28.346167,17.017284,...,65.896789,12.815215,10.150965,8.914809,6.797915,15.379187,11.420690,6.599729,3.819019,5.758501
2,35.799315,9.884781,3.886043,29.984211,17.135946,21.273727,1.501203,20.598204,25.855152,12.275738,...,44.571276,14.344729,11.224647,7.870991,7.724003,25.762396,8.628786,4.104879,4.382387,5.306177
3,26.490401,7.085828,10.804003,23.482255,17.044085,14.880104,1.299056,14.978582,31.214294,10.015235,...,51.223656,13.660995,9.730124,7.804760,5.030966,8.964868,7.990036,4.251886,3.702483,7.500498
4,27.632466,7.642971,3.670265,16.584843,20.375321,22.174600,1.553541,14.909150,54.435490,13.392213,...,55.927277,16.650019,8.584938,7.485410,5.945771,9.205302,8.761025,4.656969,3.827945,7.939863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2081,23.758989,15.394219,10.883932,21.691852,20.209391,30.078659,0.978559,13.002383,28.629486,6.387968,...,19.716977,10.900725,4.188131,2.499783,2.422833,10.061337,3.966998,2.323246,42.449235,2.661792
2082,27.990210,28.998590,8.701462,27.579071,29.770012,15.744797,3.759037,13.468529,20.548527,5.557939,...,38.833607,13.409906,8.146828,3.390904,3.621281,18.706152,4.272848,1.199481,3.990134,1.665184
2083,14.665414,20.195646,6.703477,19.648529,12.530305,24.321260,2.263398,9.642926,30.248579,22.157856,...,29.583893,4.310888,7.920039,2.731255,4.168946,4.470333,3.064729,1.341491,1.830216,2.355292
2084,31.116022,14.562066,9.121585,13.831678,15.535040,41.278765,1.044817,8.012867,8.701291,4.777847,...,15.115717,5.303415,3.716170,1.802522,1.466401,4.209371,2.633839,0.908784,1.176131,1.891297


In [15]:
def get_gene_network(gene, timeout=30):
    """Fetch the STRING interaction partners of one gene as an edge list.

    Parameters
    ----------
    gene : str
        Identifier sent to STRING's ``interaction_partners`` endpoint. It is
        passed as a query parameter, so ``requests`` percent-encodes it.
    timeout : float, optional
        Seconds to wait on the server before ``requests`` gives up. Without a
        timeout a stalled connection would block its worker thread forever.

    Returns
    -------
    dict
        ``{"sourceGene": [...], "targetGene": [...]}``: two parallel lists of
        upper-cased names, one pair per interaction entry in the response.
        A missing name field becomes an empty string.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or a non-2xx HTTP status.
    ValueError
        If the decoded payload is not a list of interaction records.
    """
    response = requests.get(
        "https://string-db.org/api/json/interaction_partners",
        params={"identifiers": gene},
        timeout=timeout,
    )
    # Surface HTTP errors here instead of failing later on an error payload.
    response.raise_for_status()

    interactions = response.json()
    if not isinstance(interactions, list):
        raise ValueError(
            f"Unexpected STRING response for {gene!r}: {interactions!r}"
        )

    sourceGenes = []
    targetGenes = []
    # Single pass keeps the two lists aligned entry by entry.
    for entry in interactions:
        sourceGenes.append(entry.get("preferredName_A", "").upper())
        targetGenes.append(entry.get("preferredName_B", "").upper())

    return {"sourceGene": sourceGenes, "targetGene": targetGenes}

In [16]:
# Number of worker threads issuing STRING requests concurrently.
num_threads = 5

# executor.map yields results in input order, so results[i] belongs to genes[i].
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    results = list(executor.map(get_gene_network, genes))

# Flatten the per-gene edge lists into one frame in a single pass.
# Growing a DataFrame with pd.concat inside a loop copies the accumulated
# frame on every iteration, and concatenating onto an empty frame is
# deprecated in recent pandas. Building the columns directly avoids both and
# still yields an empty two-column frame when there are no results.
gene_df = pd.DataFrame(
    {
        column: [name for result in results for name in result[column]]
        for column in ("sourceGene", "targetGene")
    }
)

print(f"{len(genes)} genes processed...")

971 genes processed...


In [17]:
# Inspect the combined edge table built from the get_gene_network results.
gene_df

Unnamed: 0,sourceGene,targetGene
0,AARS,YARS
1,AARS,EPRS
2,AARS,GARS
3,AARS,IARS
4,AARS,KARS
...,...,...
9667,ZW10,ZWINT
9668,ZW10,USE1
9669,ZW10,BNIP1
9670,ZW10,ZNF787


In [18]:
# Independent copy, so reassigning or editing `test` leaves `gene_df` intact.
test = gene_df.copy(deep=True)
test

Unnamed: 0,sourceGene,targetGene
0,AARS,YARS
1,AARS,EPRS
2,AARS,GARS
3,AARS,IARS
4,AARS,KARS
...,...,...
9667,ZW10,ZWINT
9668,ZW10,USE1
9669,ZW10,BNIP1
9670,ZW10,ZNF787


In [19]:
# Keep only edges whose target also appears in `genes`, then renumber the
# surviving rows from 0.
test = (
    test.loc[test['targetGene'].isin(genes)]
    .reset_index(drop=True)
)
test

Unnamed: 0,sourceGene,targetGene
0,AARS,EPRS
1,AARS,IARS2
2,ABCC5,ABCF3
3,ABHD4,ABHD6
4,ABHD6,ABHD4
...,...,...
1481,ZMYM2,HDAC2
1482,ZNF131,FHL2
1483,ZNF131,KDM5A
1484,ZNF274,GATA2


In [20]:
# Turn the filtered edge table into a directed graph.
# NOTE(review): count_rows_as_weight presumably stores the number of duplicate
# source/target rows as the edge "weight"; confirm against the pelote docs.
graph = edges_table_to_graph(
    test,
    directed=True,
    count_rows_as_weight=True,
    edge_target_col="targetGene",
    edge_source_col="sourceGene",
)

# Interactive network widget with edge thickness driven by the "weight" attribute.
# NOTE(review): start_layout=10 looks like "run the layout for 10 seconds";
# confirm against the ipysigma docs.
sigma = Sigma(
    graph,
    start_layout=10,
    default_edge_type="curve",
    edge_size="weight",
)

sigma

Sigma(nx.DiGraph with 761 nodes and 1,486 edges)