In [1]:
# loading libraries
import pandas as pd
import networkx as nx
import os
from tqdm import tqdm
import statistics
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
# Build the gene-ID <-> gene-name lookup tables from the GTF annotation file.
# Only the last tab-separated field of each record (the attribute column) is read.
geneId_geneName = {}
with open('Homo_sapiens.GRCh37.74.gtf', 'r') as file:
    for line in file:
        data = line.strip().split('\t')[-1]
        if 'gene_name' not in data:
            # Header/comment lines and records that carry no gene name.
            continue
        # Parse the `key "value"` pairs by key instead of by position so the
        # result does not depend on attribute order or on leading whitespace.
        attributes = {}
        for attr in data.split(';'):
            parts = attr.split(None, 1)
            if len(parts) == 2 and parts[0] not in attributes:
                attributes[parts[0]] = parts[1].strip().strip('"')
        geneId = attributes.get('gene_id')
        geneName = attributes.get('gene_name')
        if geneId is None or geneName is None:
            continue
        # The first name seen for a given ID wins; later records do not overwrite it.
        geneId_geneName.setdefault(geneId, geneName)

# Inverse lookup. When several IDs share one gene name, the ID inserted last
# is the one that remains in this dictionary.
geneName_geneId = {v: k for k, v in geneId_geneName.items()}

In [3]:
def createRegulatory(regulatory_filepath, columnsAreGeneNames=False):
    """Load a regulatory matrix and convert it into edge weights.

    The CSV is read with its first column as the row index. Row labels are
    translated through the module-level ``geneName_geneId`` mapping; rows whose
    label has no entry are removed. Columns are kept only when their label is a
    key of ``geneId_geneName`` (default), or are translated through
    ``geneName_geneId`` when ``columnsAreGeneNames`` is true. Every remaining
    cell ``x`` is then replaced by ``abs(1 / x)``.

    Parameters
    ----------
    regulatory_filepath : str
        Path of the CSV file to load.
    columnsAreGeneNames : bool, default False
        Set to True when the column labels are gene names rather than IDs;
        they are then renamed to IDs the same way the rows are.

    Returns
    -------
    pandas.DataFrame
        The filtered matrix, labelled by gene ID on both axes, holding
        ``abs(1 / x)`` for every original cell (a zero cell becomes ``inf``).
    """
    print(f'Loading: {regulatory_filepath}')
    regulatory = pd.read_csv(regulatory_filepath, index_col=0)

    # Rows: gene name -> gene ID, None marks a label that cannot be translated.
    rowIds = [geneName_geneId.get(name) for name in regulatory.index]
    rowKnown = [geneId is not None for geneId in rowIds]
    print(f'Row exception count: {rowKnown.count(False)}')

    # Columns: either already IDs (validate only) or gene names (translate).
    if columnsAreGeneNames:
        columnIds = [geneName_geneId.get(name) for name in regulatory.columns]
    else:
        columnIds = [ID if ID in geneId_geneName else None for ID in regulatory.columns]
    columnKnown = [geneId is not None for geneId in columnIds]
    print(f'Column exception count: {columnKnown.count(False)}')

    # Filter both axes in one step. Unknown columns must be removed along the
    # column axis; dropping them as row labels raises KeyError.
    rowName, columnName = regulatory.index.name, regulatory.columns.name
    regulatory = regulatory.loc[rowKnown, columnKnown]
    regulatory.index = pd.Index(
        [geneId for geneId in rowIds if geneId is not None], name=rowName)
    regulatory.columns = pd.Index(
        [geneId for geneId in columnIds if geneId is not None], name=columnName)

    # Vectorised equivalent of applying 1/x and then abs() to every cell.
    return (1 / regulatory).abs()

def createEndpoints(filepath):
    """Read a tab-separated gene list and return the matching gene IDs.

    The file's first column is used as the index. Each index label is
    upper-cased and looked up in the module-level ``geneName_geneId`` mapping.
    Labels that are not strings or have no entry are counted as exceptions and
    left out of the result.

    Parameters
    ----------
    filepath : str
        Path of the TSV file to load.

    Returns
    -------
    list
        Gene IDs in file order, one per translatable row.
    """
    endpoints = pd.read_csv(filepath, sep='\t', index_col=0)

    geneIds = []
    exceptionCount = 0
    for name in endpoints.index:
        # Non-string labels (e.g. NaN from a blank cell) cannot be upper-cased.
        geneId = geneName_geneId.get(name.upper()) if isinstance(name, str) else None
        if geneId is None:
            exceptionCount += 1
        else:
            geneIds.append(geneId)

    print(f'Endpoints exception count: {exceptionCount}')

    return geneIds

def createRandomEndpoints(regulatory, num, seed):
    """Return the row labels of `num` rows sampled from `regulatory`.

    `seed` is passed to pandas as `random_state`, so the same seed on the same
    frame yields the same labels.
    """
    sampledRows = regulatory.sample(random_state=seed, n=num)
    return [label for label in sampledRows.index]

def createGraph(regulatory, endpoints):
    """Build an undirected weighted graph from a regulatory matrix.

    Nodes are the union of the matrix row labels, its column labels and
    `endpoints` (a list). One edge is added per matrix cell, from the row label
    to the column label, with the cell value stored as the `weight` attribute.

    Because the graph is undirected, a (row, column) cell and its mirrored
    (column, row) cell address the same edge; whichever is added last sets the
    weight. The printed `EdgeCount` is the number of cells processed, not the
    number of distinct edges in the graph.
    """
    rowLabels = list(regulatory.index)
    columnLabels = regulatory.columns.tolist()

    G = nx.Graph()
    G.add_nodes_from(list(set(rowLabels + columnLabels + endpoints)))

    # Feed the matrix to networkx one row at a time, in row-major order.
    for rowLabel, weights in zip(rowLabels, regulatory.to_numpy().tolist()):
        G.add_weighted_edges_from(
            (rowLabel, columnLabel, cell)
            for columnLabel, cell in zip(columnLabels, weights)
        )

    edgeCount = len(rowLabels) * len(columnLabels)
    print(f'EdgeCount: {edgeCount}')

    return G

def connectionEnrichment(origins, endpoints):
    """Count how often each gene lies on a shortest path between two gene sets.

    For every file in the `data` directory (ignoring the `.DS_Store` entry if
    present) the regulatory matrix is loaded and turned into a graph. For every
    (origin, endpoint) pair the weighted shortest path is computed, and each
    node strictly between the two ends is tallied. Tallies accumulate across
    all datasets.

    Parameters
    ----------
    origins : iterable
        Start nodes of the paths.
    endpoints : list
        End nodes of the paths; also passed to `createGraph` so they exist as
        nodes even when absent from the matrix.

    Returns
    -------
    dict
        Maps node label -> number of shortest paths it appeared on as an
        intermediate node.
    """
    # Filter instead of list.remove(), which raises ValueError when the
    # '.DS_Store' entry does not exist.
    datasets = [name for name in os.listdir('data') if name != '.DS_Store']
    count = {}
    for i, dataset in enumerate(datasets):
        print(i)
        regulatory = createRegulatory(f'data/{dataset}')
        G = createGraph(regulatory, endpoints)
        for origin in origins:
            for endpoint in tqdm(endpoints):
                try:
                    path = nx.shortest_path(G, origin, endpoint, weight="weight")
                except (nx.NetworkXNoPath, nx.NodeNotFound):
                    # Unreachable pair, or the origin is not in this graph:
                    # nothing to tally.
                    continue
                # Slicing drops both ends; it is empty for paths of length <= 2,
                # which covers origin == endpoint.
                for node in path[1:-1]:
                    count[node] = count.get(node, 0) + 1
    return count

def sortDic(dic):
    """Return a new dict with the items of `dic` ordered by value, largest first.

    Items are sorted ascending (stable) and then reversed, so entries with equal
    values come out in the reverse of their original insertion order.
    """
    ascending = sorted(dic.items(), key=lambda pair: pair[1])
    return dict(ascending[::-1])

def printResultsWithStats(dic):
    """Print the entry count, the mean and median of the values, then `dic` itself."""
    values = list(dic.values())
    print(f'Size: {len(dic)}')
    average = statistics.mean(values)
    print(f'Average: {average}')
    middle = statistics.median(values)
    print(f'Median: {middle}')
    print(dic)

In [4]:
# multi-dataset aging gene enrichment
# The gene list from 'global_aging_genes.tsv' is converted to gene IDs and used
# as both the origin set and the endpoint set, so shortest paths are computed
# between every ordered pair of listed genes in every dataset under data/.
globalAgingGenes = createEndpoints('global_aging_genes.tsv')
agingCount = connectionEnrichment(globalAgingGenes, globalAgingGenes)

Endpoints exception count: 15
0
Loading: data/fibroblastCellLine.csv
Row exception count: 0
Column exception count: 0
EdgeCount: 19476492


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [02:37<00:00,  1.96it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [02:40<00:00,  1.93it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [00:00<00:00, 511701.79it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [02:36<00:00,  1.98it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [02:36<00:00,  1.99it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [5]:
# printing results: order the genes by path count (largest first), then print
# the entry count, mean, median and the full ordered dictionary
printResultsWithStats(sortDic(agingCount))

Size: 1599
Average: 62.79049405878674
Median: 6
{'ENSG00000161940': 568, 'ENSG00000143867': 478, 'ENSG00000006468': 464, 'ENSG00000100968': 454, 'ENSG00000117036': 454, 'ENSG00000122877': 438, 'ENSG00000157554': 428, 'ENSG00000101216': 414, 'ENSG00000179348': 390, 'ENSG00000128710': 390, 'ENSG00000188909': 378, 'ENSG00000165495': 376, 'ENSG00000170485': 372, 'ENSG00000126351': 370, 'ENSG00000163132': 366, 'ENSG00000114853': 362, 'ENSG00000102974': 358, 'ENSG00000126746': 358, 'ENSG00000171956': 350, 'ENSG00000074047': 348, 'ENSG00000187079': 342, 'ENSG00000124782': 338, 'ENSG00000125618': 334, 'ENSG00000005102': 332, 'ENSG00000170265': 326, 'ENSG00000116044': 326, 'ENSG00000143355': 322, 'ENSG00000146592': 322, 'ENSG00000152977': 320, 'ENSG00000106038': 318, 'ENSG00000139352': 318, 'ENSG00000185668': 318, 'ENSG00000137273': 312, 'ENSG00000136367': 312, 'ENSG00000114439': 312, 'ENSG00000162676': 312, 'ENSG00000261345': 310, 'ENSG00000163884': 310, 'ENSG00000120798': 308, 'ENSG0000012409

In [7]:
# control experiment 1
# Same procedure as the aging-gene run, but with a random gene set of equal
# size (seed 42) drawn from the row labels of the first dataset.
# Filter instead of list.remove(), which raises ValueError when '.DS_Store' is absent.
datasets = [name for name in os.listdir('data') if name != '.DS_Store']
# Loaded here only to supply the pool of row labels to sample from;
# connectionEnrichment reloads every dataset itself.
regulatory = createRegulatory(f'data/{datasets[0]}')
globalAgingGenes = createEndpoints('global_aging_genes.tsv')
randomGenes = createRandomEndpoints(regulatory, len(globalAgingGenes), 42)
controlCount1 = connectionEnrichment(randomGenes, randomGenes)

Loading: data/fibroblastCellLine.csv
Row exception count: 0
Column exception count: 0
Endpoints exception count: 15
0
Loading: data/fibroblastCellLine.csv
Row exception count: 0
Column exception count: 0
EdgeCount: 19476492


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [01:49<00:00,  2.83it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [01:50<00:00,  2.81it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [01:49<00:00,  2.83it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [01:48<00:00,  2.87it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [01:52<00:00,  2.75it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [8]:
# printing results: control 1 genes ordered by path count (largest first),
# preceded by entry count, mean and median
printResultsWithStats(sortDic(controlCount1))

Size: 14440
Average: 6.249445983379501
Median: 4.0
{'ENSG00000163558': 58, 'ENSG00000228115': 58, 'ENSG00000105388': 54, 'ENSG00000186063': 50, 'ENSG00000227206': 50, 'ENSG00000268892': 50, 'ENSG00000171097': 48, 'ENSG00000255298': 48, 'ENSG00000268149': 48, 'ENSG00000130433': 48, 'ENSG00000168172': 48, 'ENSG00000228915': 46, 'ENSG00000184384': 46, 'ENSG00000260482': 46, 'ENSG00000198121': 44, 'ENSG00000123989': 44, 'ENSG00000234902': 44, 'ENSG00000110786': 44, 'ENSG00000259478': 44, 'ENSG00000198670': 44, 'ENSG00000173473': 44, 'ENSG00000259644': 42, 'ENSG00000137168': 42, 'ENSG00000107798': 42, 'ENSG00000123091': 42, 'ENSG00000101463': 40, 'ENSG00000167657': 40, 'ENSG00000254872': 40, 'ENSG00000133812': 40, 'ENSG00000143179': 40, 'ENSG00000170653': 40, 'ENSG00000272138': 40, 'ENSG00000235024': 38, 'ENSG00000260831': 38, 'ENSG00000215022': 38, 'ENSG00000226051': 38, 'ENSG00000272498': 36, 'ENSG00000186281': 36, 'ENSG00000179761': 36, 'ENSG00000029993': 36, 'ENSG00000144843': 36, 'ENSG

In [None]:
# control experiment 2
# Same procedure as the aging-gene run, but with a random gene set of equal
# size (seed 43) drawn from the row labels of the first dataset.
# Filter instead of list.remove(), which raises ValueError when '.DS_Store' is absent.
datasets = [name for name in os.listdir('data') if name != '.DS_Store']
# Loaded here only to supply the pool of row labels to sample from;
# connectionEnrichment reloads every dataset itself.
regulatory = createRegulatory(f'data/{datasets[0]}')
globalAgingGenes = createEndpoints('global_aging_genes.tsv')
randomGenes = createRandomEndpoints(regulatory, len(globalAgingGenes), 43)
controlCount2 = connectionEnrichment(randomGenes, randomGenes)

Loading: data/fibroblastCellLine.csv
Row exception count: 0
Column exception count: 0
Endpoints exception count: 15
0
Loading: data/fibroblastCellLine.csv
Row exception count: 0
Column exception count: 0
EdgeCount: 19476492


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [02:14<00:00,  2.31it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [02:07<00:00,  2.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [02:06<00:00,  2.44it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [02:20<00:00,  2.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 310/310 [02:11<00:00,  2.36it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [None]:
# printing results: control 2 genes ordered by path count (largest first),
# preceded by entry count, mean and median
printResultsWithStats(sortDic(controlCount2))

In [None]:
# control experiment 3
# Same procedure as the aging-gene run, but with a random gene set of equal
# size (seed 44) drawn from the row labels of the first dataset.
# Filter instead of list.remove(), which raises ValueError when '.DS_Store' is absent.
datasets = [name for name in os.listdir('data') if name != '.DS_Store']
# Loaded here only to supply the pool of row labels to sample from;
# connectionEnrichment reloads every dataset itself.
regulatory = createRegulatory(f'data/{datasets[0]}')
globalAgingGenes = createEndpoints('global_aging_genes.tsv')
randomGenes = createRandomEndpoints(regulatory, len(globalAgingGenes), 44)
controlCount3 = connectionEnrichment(randomGenes, randomGenes)

In [None]:
# printing results: control 3 genes ordered by path count (largest first),
# preceded by entry count, mean and median
printResultsWithStats(sortDic(controlCount3))

In [None]:
# Histogram of per-gene path counts: aging run vs. the three random controls.
agingVal = list(agingCount.values())
controlVal1 = list(controlCount1.values())
controlVal2 = list(controlCount2.values())
controlVal3 = list(controlCount3.values())

# One group label per count so the values can be stacked into a long-format frame.
agingName = ['aging'] * len(agingVal)
controlName1 = ['control1'] * len(controlVal1)
controlName2 = ['control2'] * len(controlVal2)
controlName3 = ['control3'] * len(controlVal3)

counts = agingVal + controlVal1 + controlVal2 + controlVal3
names = agingName + controlName1 + controlName2 + controlName3

df = pd.DataFrame({'counts': counts, 'names': names})

sns.set_theme(style="ticks")

f, ax = plt.subplots(figsize=(7, 5))
sns.despine(f)

# Side-by-side bars per group, drawn on the axes created above.
sns.histplot(
    data=df,
    x="counts",
    hue="names",
    multiple="dodge",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
    bins=16,
    ax=ax,
)

# Log-scaled y axis for the bar heights.
ax.set_yscale('log')
ax.set_xlabel("Number of overlaps")

In [None]:
# Pairwise overlap between the result sets. For every (origin, endpoint) pair
# of experiments, print: number of genes present in both // sum of the
# endpoint experiment's counts over those genes // that sum divided by the
# number of shared genes (truncated to int).
experiments = [
    ('aging', agingCount),
    ('control1', controlCount1),
    ('control2', controlCount2),
    ('control3', controlCount3),
]
for name1, dataset1 in experiments:
    print(f'Origin: {name1}')
    for name2, dataset2 in experiments:
        print(f'Endpoint: {name2}')
        intersection = set(dataset1).intersection(set(dataset2))
        sideTotal2 = sum([dataset2[i] for i in intersection])
        # Two experiments may share no gene at all; avoid dividing by zero.
        average = int(sideTotal2 / len(intersection)) if intersection else 'n/a'
        print(f'{len(intersection)} // {sideTotal2} // {average}')