In [1]:
# loading libraries
import pandas as pd
import networkx as nx
import os
from tqdm import tqdm

In [2]:
# Build the gene_id <-> gene_name lookup tables from the GTF annotation file.
# NOTE(review): `test` keeps the last tab-separated field of every line of the
# file in memory and is not read anywhere in the visible notebook - confirm it
# is still needed.
test = []
geneId_geneName = {}
with open('Homo_sapiens.GRCh37.74.gtf', 'r') as file:
    for line in file:
        line = line.strip()
        # The attribute list is the last tab-separated field of a record.
        data = line.split('\t')[-1]
        test.append(data)
        if 'gene_name' not in data:
            continue
        attributes = data.split(';')
        # The gene ID is read from the first attribute: second space-separated
        # token, with the surrounding double quotes removed.
        geneId = attributes[0].split(' ')[1].strip('"')
        for attr in attributes:
            if 'gene_name' not in attr:
                continue
            # Later attributes start with a space, so the value is token 2.
            geneName = attr.split(' ')[2].strip('"')
            # Keep the first name recorded for a gene ID.
            geneId_geneName.setdefault(geneId, geneName)
# Reverse lookup. When several gene IDs share a name, the ID inserted last wins.
geneName_geneId = dict(zip(geneId_geneName.values(), geneId_geneName.keys()))

In [3]:
def createRegulatory(regulatory_filepath, geneNameToId=None, geneIdToName=None):
    """Load a regulatory matrix and turn it into a gene-ID-indexed distance matrix.

    The CSV is read with its first column as the row index. Row labels are
    looked up in ``geneNameToId`` and replaced by the mapped gene ID; rows
    without a mapping are removed. Column labels that are not keys of
    ``geneIdToName`` are removed. Every remaining cell ``x`` is then replaced
    by ``abs(1 / x)``, so a larger absolute value in the file gives a smaller
    value in the result. A cell equal to zero becomes ``inf``.

    Parameters
    ----------
    regulatory_filepath : str
        Path of the CSV file to load.
    geneNameToId : dict, optional
        Mapping gene name -> gene ID. Defaults to the notebook-level
        ``geneName_geneId``.
    geneIdToName : dict, optional
        Mapping gene ID -> gene name. Defaults to the notebook-level
        ``geneId_geneName``.

    Returns
    -------
    pandas.DataFrame
        The filtered, relabelled and transformed matrix.
    """
    if geneNameToId is None:
        geneNameToId = geneName_geneId
    if geneIdToName is None:
        geneIdToName = geneId_geneName

    print(f'loading: {regulatory_filepath}')
    regulatory = pd.read_csv(regulatory_filepath, index_col=0)

    # Rows: keep only labels that can be translated, then relabel them in a
    # single pass instead of renaming/dropping one row at a time.
    keepRows = [name in geneNameToId for name in regulatory.index]
    print(f'Row exception count: {len(keepRows) - sum(keepRows)}')
    regulatory = regulatory.loc[keepRows].rename(index=geneNameToId)

    # Columns: unknown IDs must be dropped along the column axis; a plain
    # drop(label) targets the row index and raises KeyError here.
    columnExceptions = [
        geneId for geneId in regulatory.columns if geneId not in geneIdToName
    ]
    print(f'Column exception count: {len(columnExceptions)}')
    regulatory = regulatory.drop(columns=columnExceptions)

    # Vectorised |1 / x| over the whole frame.
    return (1 / regulatory).abs()

def createEndpoints(filepath, geneNameToId=None):
    """Read a tab-separated gene list and return the mapped gene IDs.

    The file is read with its first column as the index. Each index label is
    upper-cased and looked up in ``geneNameToId``; labels that are not strings
    or have no mapping are skipped and only counted in the printed summary.
    The order of the file is preserved and repeated labels yield repeated IDs.

    Parameters
    ----------
    filepath : str
        Path of the TSV file to load.
    geneNameToId : dict, optional
        Mapping gene name -> gene ID. Defaults to the notebook-level
        ``geneName_geneId``.

    Returns
    -------
    list
        Gene IDs of every index label that could be mapped.
    """
    if geneNameToId is None:
        geneNameToId = geneName_geneId

    endpoints = pd.read_csv(filepath, sep='\t', index_col=0)

    geneIds = []
    exceptions = []
    for name in endpoints.index:
        # Explicit checks replace the former bare `except`, so unrelated
        # errors are no longer silently swallowed.
        if isinstance(name, str) and name.upper() in geneNameToId:
            geneIds.append(geneNameToId[name.upper()])
        else:
            exceptions.append(name)

    print(f'Endpoints exception count: {len(exceptions)}')

    return geneIds

def createRandomEndpoints(regulatory, num, seed=None):
    """Draw ``num`` row labels of ``regulatory`` uniformly at random.

    Labels are drawn with replacement, so the same label can appear more than
    once in the result.

    Parameters
    ----------
    regulatory : pandas.DataFrame
        Frame whose index supplies the candidate labels.
    num : int
        Number of labels to draw; values <= 0 give an empty list.
    seed : int, optional
        Passed to ``DataFrame.sample`` as ``random_state`` so the draw can be
        reproduced. ``None`` (the default) leaves the draw unseeded.

    Returns
    -------
    list
        The drawn row labels.
    """
    if num <= 0:
        return []
    # One draw with replacement replaces `num` separate single-row samples.
    drawn = regulatory.sample(n=num, replace=True, random_state=seed)
    return list(drawn.index)

def createGraph(regulatory, endpoints):
    """Build an undirected weighted graph from a labelled matrix.

    Every row label, column label and entry of ``endpoints`` becomes a node.
    One edge is added per matrix cell, joining the cell's row label and column
    label with the cell value stored as the ``weight`` attribute. Because the
    graph is undirected, a later cell for the same pair of labels overwrites
    the weight written by an earlier one. The number of cells processed is
    printed before the graph is returned.
    """
    rowLabels = list(regulatory.index)
    columnLabels = regulatory.columns.tolist()

    graph = nx.Graph()
    graph.add_nodes_from(set(rowLabels + columnLabels + endpoints))

    def weightedCells():
        # Yield (row label, column label, value) in row-major order.
        for rowLabel, rowValues in zip(rowLabels, regulatory.to_numpy().tolist()):
            for columnLabel, value in zip(columnLabels, rowValues):
                yield rowLabel, columnLabel, value

    graph.add_weighted_edges_from(weightedCells())

    # One edge insertion was attempted per cell of the matrix.
    print(len(rowLabels) * len(columnLabels))

    return graph

def connectionEnrichment(origins, endpoints, dataDir='data'):
    """Count how often each node lies on a shortest origin -> endpoint path.

    For every regular, non-hidden file in ``dataDir`` (processed in sorted
    order) the file is loaded with ``createRegulatory``, converted to a graph
    with ``createGraph``, and the weighted shortest path between every
    (origin, endpoint) pair is computed. Each node on a path, including the
    origin and the endpoint, has its counter increased by one. Pairs with no
    connecting path, or whose origin/endpoint is not in the graph, are skipped.

    Parameters
    ----------
    origins : iterable
        Node labels used as path sources.
    endpoints : list
        Node labels used as path targets.
    dataDir : str, optional
        Directory holding the regulatory datasets. Defaults to ``'data'``.

    Returns
    -------
    dict
        Node label -> number of paths, summed over all datasets, on which the
        node appears.
    """
    # Skip hidden entries and sub-directories, and sort so the processing
    # order does not depend on the file system.
    datasets = sorted(
        name for name in os.listdir(dataDir)
        if not name.startswith('.') and os.path.isfile(os.path.join(dataDir, name))
    )
    count = {}
    for dataset in datasets:
        regulatory = createRegulatory(f'{dataDir}/{dataset}')
        G = createGraph(regulatory, endpoints)
        for origin in origins:
            for endpoint in tqdm(endpoints):
                try:
                    path = nx.shortest_path(G, origin, endpoint, weight="weight")
                except (nx.NetworkXNoPath, nx.NodeNotFound):
                    # Only "no route" outcomes are ignored; anything else
                    # (including KeyboardInterrupt) now propagates.
                    continue
                for node in path:
                    count[node] = count.get(node, 0) + 1
    return count

In [4]:
# Origins of the shortest-path searches: the three OSK genes, as gene IDs.
# NOTE(review): confirm each ID matches the labels used in the regulatory
# datasets - TODO verify (OCT4/POU5F1 in particular may have more than one
# gene ID in this annotation).
OSKgenes = [
    'ENSG00000229094',  # OCT4
    'ENSG00000181449',  # SOX2
    'ENSG00000136826',  # KLF4
]
OCT4, SOX2, KLF4 = OSKgenes

In [None]:
# multi-dataset aging gene enrichment
# Map the gene names listed in global_aging_genes.tsv to gene IDs, then count,
# over every dataset in data/, how often each node appears on a weighted
# shortest path from an OSK gene to one of those aging genes.
globalAgingGenes = createEndpoints('global_aging_genes.tsv')
count = connectionEnrichment(OSKgenes, globalAgingGenes)

loading: data/Testis.csv
Row exception count 0
Column exception count 0
19476492


 37%|██████████████████████████████████████████████████▊                                                                                      | 115/310 [01:06<01:35,  2.05it/s]

In [None]:
# Rank nodes from most to least frequent. The ascending sort is reversed
# (instead of sorting in descending order) so tied nodes keep the same
# relative order as before.
sorted_dict = dict(sorted(count.items(), key=lambda pair: pair[1])[::-1])
print(len(sorted_dict))
print(sorted_dict)

In [None]:
# control experiment
# Repeat the enrichment with randomly drawn row labels (as many as there are
# aging genes) taken from the first dataset returned by os.listdir.
# NOTE(review): os.listdir returns entries in arbitrary order, so datasets[0]
# is not guaranteed to be the same file on every machine, and the random draw
# is made without a seed, so each run gives a different control set.
# NOTE(review): `count` is reassigned here, replacing the aging-gene result.
datasets = os.listdir('data')
regulatory = createRegulatory(f'data/{datasets[0]}')
randomGenes = createRandomEndpoints(regulatory, len(globalAgingGenes))
count = connectionEnrichment(OSKgenes, randomGenes)

loading: data/Testis.csv
Row exception count: 0
Column exception count: 0
loading: data/Testis.csv
Row exception count: 0
Column exception count: 0
19476492


  7%|██████████▏                                                                                                                               | 23/310 [00:09<02:04,  2.30it/s]

In [None]:
# Rank nodes from most to least frequent. The ascending sort is reversed
# (instead of sorting in descending order) so tied nodes keep the same
# relative order as before.
sorted_dict = dict(sorted(count.items(), key=lambda pair: pair[1])[::-1])
print(len(sorted_dict))
print(sorted_dict)