In [1]:
%%writefile my_functions.py

import pandas as pd
import os
from tqdm import tqdm
import statistics
import seaborn as sns
import matplotlib.pyplot as plt
import multiprocessing
from graph_tool.all import *

# loading and cleaning ENSG converter
def createGeneConverter(gtf_path='Homo_sapiens.GRCh37.74.gtf'):
    """Build gene-name <-> gene-ID lookup tables from a GTF annotation file.

    Only the last tab-separated field of each line (the attribute column) is
    inspected. Lines whose attribute column lacks a ``gene_id`` or a
    ``gene_name`` attribute are skipped.

    Args:
        gtf_path: Path of the GTF file to read. Defaults to the file name the
            original implementation had hard-coded.

    Returns:
        A tuple ``(geneName_geneId, geneId_geneName)``.
        ``geneId_geneName`` maps each gene ID to the first gene name seen for
        it; ``geneName_geneId`` is its inversion, so when several IDs share a
        name the ID encountered last wins.
    """
    geneId_geneName = {}
    with open(gtf_path, 'r') as file:
        for line in file:
            data = line.strip().split('\t')[-1]
            if 'gene_name' not in data:
                continue

            geneId = None
            geneName = None
            # Parse attributes as `key "value"` pairs instead of relying on
            # their position or on exactly one space following each ';'.
            for attr in data.split(';'):
                parts = attr.strip().split(' ', 1)
                if len(parts) != 2:
                    continue
                key, value = parts
                value = value.strip().strip('"')
                if key == 'gene_id':
                    geneId = value
                elif key == 'gene_name':
                    geneName = value

            if geneId is None or geneName is None:
                continue
            # Keep the first name recorded for a gene ID.
            geneId_geneName.setdefault(geneId, geneName)

    geneName_geneId = {v: k for k, v in geneId_geneName.items()}
    return geneName_geneId, geneId_geneName

def createRegulatory(regulatory_filepath):
    """Load a regulatory matrix and turn it into a matrix of path weights.

    The CSV is read with its first column as the index. Row labels are
    treated as gene names and translated to gene IDs through the converter
    from ``createGeneConverter``; rows whose name is unknown are removed.
    Column labels are expected to already be gene IDs; columns whose ID is
    unknown to the converter are removed. (If the columns held gene names
    instead, they would have to be mapped through ``geneName_geneId`` the
    same way the rows are.)

    Every remaining cell ``x`` is replaced by ``abs(1 / x)``.

    Args:
        regulatory_filepath: Path of the CSV file to load.

    Returns:
        The cleaned ``pandas.DataFrame`` of weights.
    """
    geneName_geneId, geneId_geneName = createGeneConverter()
    print(f'Loading: {regulatory_filepath}')
    regulatory = pd.read_csv(regulatory_filepath, index_col=0)

    # Rows: drop unknown gene names, then translate all labels in one pass
    # (renaming label by label copies the whole frame once per row).
    exceptions = [name for name in regulatory.index if name not in geneName_geneId]
    print(f'Row exception count: {len(exceptions)}')
    regulatory = regulatory.loc[~regulatory.index.isin(exceptions)]
    regulatory = regulatory.rename(index=geneName_geneId)

    # Columns: drop unknown gene IDs. The mask is applied on the column axis;
    # a plain `drop(label)` would look the label up among the rows instead.
    exceptions = [ID for ID in regulatory.columns.tolist() if ID not in geneId_geneName]
    print(f'Column exception count: {len(exceptions)}')
    regulatory = regulatory.loc[:, ~regulatory.columns.isin(exceptions)]

    # |1/x| computed column-wise instead of cell by cell.
    # NOTE(review): a cell equal to 0 becomes inf here rather than raising
    # ZeroDivisionError -- confirm that is acceptable for the graph weights.
    regulatory = (1 / regulatory).abs()

    return regulatory

def createEndpoints(filepath):
    """Read a tab-separated gene list and return the matching gene IDs.

    The first column of the file is used as the index and is treated as gene
    names. Each name is upper-cased and looked up in the converter from
    ``createGeneConverter``. Names that cannot be mapped (including labels
    that are not strings) are counted and left out.

    Args:
        filepath: Path of the TSV file to load.

    Returns:
        A list of gene IDs, in the order the mapped names appear in the file.
    """
    geneName_geneId, _ = createGeneConverter()
    endpoints = pd.read_csv(filepath, sep='\t', index_col=0)

    geneIds = []
    exceptions = []
    for name in endpoints.index:
        # Non-string labels (e.g. NaN from an empty cell) cannot be upper-cased.
        geneId = geneName_geneId.get(name.upper()) if isinstance(name, str) else None
        if geneId is None:
            exceptions.append(name)
        else:
            geneIds.append(geneId)

    print(f'Endpoints exception count: {len(exceptions)}')

    return geneIds

def createRandomEndpoints(regulatory, num, seed):
    """Pick ``num`` random row labels from ``regulatory``.

    Args:
        regulatory: DataFrame whose index labels are sampled.
        num: Number of rows to sample.
        seed: Value passed as ``random_state`` so the draw is repeatable.

    Returns:
        The index labels of the sampled rows, as a list.
    """
    sampledIndex = regulatory.sample(random_state=seed, n=num).index
    return [label for label in sampledIndex]

def createGraph(regulatory, endpoints):
    """Build a weighted graph from a regulatory matrix.

    One vertex is created for every distinct label found in the row index,
    the columns, or ``endpoints``. For every cell of the matrix an edge is
    added with ``g.add_edge(row vertex, column vertex)`` and the cell value is
    stored in the ``weight`` edge property.

    Args:
        regulatory: DataFrame of edge weights (rows and columns are labels).
        endpoints: List of extra labels that must exist as vertices.

    Returns:
        A tuple ``(g, nameVertexDic, vertexNameDic)``: the graph, a dict from
        label to vertex, and a dict from vertex to label.
    """
    weightRows = regulatory.to_numpy().tolist()
    columnNames = regulatory.columns.tolist()
    allNames = list(set(list(regulatory.index) + columnNames + endpoints))

    g = Graph()
    labels = g.new_vertex_property('string')
    vertexNameDic = {}
    nameVertexDic = {}
    for geneName in allNames:
        vertex = g.add_vertex()
        labels[vertex] = geneName
        # Both lookup directions are filled while the vertex is at hand.
        vertexNameDic[vertex] = geneName
        nameVertexDic[geneName] = vertex
    g.vertex_properties['name'] = labels

    g.ep.weight = g.new_edge_property("double")
    edgeCount = 0
    for rowName, weights in zip(regulatory.index, weightRows):
        source = nameVertexDic[rowName]
        for columnName, weight in zip(columnNames, weights):
            newEdge = g.add_edge(source, nameVertexDic[columnName])
            g.ep.weight[newEdge] = weight
            edgeCount += 1

    print(f'EdgeCount: {edgeCount}')

    return g, nameVertexDic, vertexNameDic

def processData(data):
    """Count how often each node lies strictly inside a shortest path.

    For one dataset, the regulatory graph is built and a weighted shortest
    path is computed for every (origin, endpoint) pair. Every vertex on a
    path other than its first and last one increments that vertex's counter.

    Args:
        data: Dict with the keys ``'dataset'`` (file name inside ``data/``),
            ``'origins'`` and ``'endpoints'`` (lists of vertex labels).

    Returns:
        Dict mapping a vertex label to the number of paths it was interior to.
    """
    dataset = data['dataset']
    origins = data['origins']
    endpoints = data['endpoints']
    regulatory = createRegulatory(f'data/{dataset}')
    g, nameVertexDic, vertexNameDic = createGraph(regulatory, endpoints)

    count = {}
    for i, origin in enumerate(origins):
        print(i)
        # createGraph only guarantees vertices for the endpoints, so an
        # origin may be missing from this dataset's graph.
        source = nameVertexDic.get(origin)
        if source is None:
            continue
        for endpoint in endpoints:
            target = nameVertexDic.get(endpoint)
            if target is None:
                continue
            # shortest_path returns (vertex_list, edge_list); calling .pop()
            # on that tuple raised AttributeError, which the former bare
            # `except: pass` hid, leaving `count` empty for every dataset.
            vertices, _edges = shortest_path(g, source, target, weights=g.ep.weight)
            # Slicing drops both path ends and is safe for short/empty paths.
            for vertex in vertices[1:-1]:
                name = vertexNameDic[vertex]
                count[name] = count.get(name, 0) + 1

    return count

def connectionEnrichment(origins, endpoints, processes=12):
    """Run ``processData`` on every dataset in ``data/`` and merge the results.

    Args:
        origins: Labels used as path start points.
        endpoints: Labels used as path end points.
        processes: Size of the worker pool (defaults to the previously
            hard-coded 12).

    Returns:
        Dict mapping a node label to the number of datasets whose
        ``processData`` result contains that label. The per-dataset path
        counts themselves are not summed.
    """
    # Filter instead of list.remove(): remove() raises ValueError when the
    # directory holds no '.DS_Store' entry.
    datasets = [name for name in os.listdir('data') if name != '.DS_Store']

    data = [{'dataset': dataset, 'origins': origins, 'endpoints': endpoints} for dataset in datasets]
    with multiprocessing.Pool(processes=processes) as pool:
        results = pool.map(processData, data)

    count = {}
    for dic in results:
        for key in dic:
            count[key] = count.get(key, 0) + 1
    return count
    
def sortDic(dic):
    """Return a new dict with the items of ``dic`` ordered by value, largest first.

    Items are sorted ascending (stable) and then reversed, so items with equal
    values end up in the reverse of their original relative order.
    """
    ordered = sorted(dic.items(), key=lambda pair: pair[1])
    ordered.reverse()
    return dict(ordered)

def printResultsWithStats(dic):
    """Print the size, mean and median of the values of ``dic``, then ``dic``.

    An empty dict is reported with ``n/a`` for both statistics instead of
    letting ``statistics.mean`` raise ``StatisticsError``.

    Args:
        dic: Dict whose values are numbers.
    """
    values = list(dic.values())
    print(f'Size: {len(dic)}')
    if values:
        print(f'Average: {statistics.mean(values)}')
        print(f'Median: {statistics.median(values)}')
    else:
        print('Average: n/a')
        print('Median: n/a')
    print(dic)

Overwriting my_functions.py


In [2]:
# loading libraries
from my_functions import createRegulatory, createEndpoints, createRandomEndpoints, createGraph, processData, connectionEnrichment, sortDic, printResultsWithStats

In [3]:
import pandas as pd
import os
from tqdm import tqdm
import statistics
import seaborn as sns
import matplotlib.pyplot as plt
import multiprocessing

In [4]:
# Load one regulatory network and the aging gene list.
regulatory = createRegulatory(regulatory_filepath='data/Testis.csv')
globalAgingGenes = createEndpoints(filepath='global_aging_genes.tsv')

Loading: data/Testis.csv
Row exception count: 0
Column exception count: 0
Endpoints exception count: 15


In [5]:
from graph_tool.all import *

endpoints = globalAgingGenes

# Matrix as nested lists; the edge-building cell iterates over it.
regMatrix = regulatory.to_numpy().tolist()
nodes = list(set(list(regulatory.index) + regulatory.columns.tolist() + endpoints))

# One vertex per distinct label, with lookups in both directions.
g = Graph()
name_prop = g.new_vertex_property('string')
vertexNameDic = {}
nameVertexDic = {}
for node in nodes:
    v = g.add_vertex()
    name_prop[v] = node
    vertexNameDic[v] = node
    nameVertexDic[node] = v
g.vertex_properties['name'] = name_prop

In [6]:
# One edge per matrix cell (row vertex -> column vertex), weighted by the cell value.
g.ep.weight = g.new_edge_property("double")
columnNames = regulatory.columns.tolist()
edgeCount = 0
for rowName, row in zip(regulatory.index, regMatrix):
    rowVertex = nameVertexDic[rowName]
    for columnName, cell in zip(columnNames, row):
        columnVertex = nameVertexDic[columnName]
        edge = g.add_edge(rowVertex, columnVertex)
        g.ep.weight[edge] = cell
        edgeCount += 1

print(f'EdgeCount: {edgeCount}')

EdgeCount: 19476492


In [12]:
# Weighted shortest path between two example genes; `path` holds the full return value.
sourceVertex = nameVertexDic['ENSG00000141510']
targetVertex = nameVertexDic['ENSG00000100412']
path = shortest_path(g, sourceVertex, targetVertex, weights=g.ep.weight)

In [15]:
# First element of the shortest_path result: the vertices along the path.
pathVertices = path[0]
pathVertices

[<Vertex object with index '5952' at 0x39ea5eac0>,
 <Vertex object with index '557' at 0x39ea5e9c0>,
 <Vertex object with index '11009' at 0x39ea5e940>,
 <Vertex object with index '13490' at 0x39c92e540>]

In [19]:
# Translate the vertices on the path back to their labels.
pathGeneIds = []
for pathVertex in path[0]:
    pathGeneIds.append(vertexNameDic[pathVertex])
pathGeneIds

['ENSG00000141510', 'ENSG00000162924', 'ENSG00000135625', 'ENSG00000100412']

In [None]:
# Aging-gene enrichment across all datasets: the aging genes are both origins and endpoints.
globalAgingGenes = createEndpoints(filepath='global_aging_genes.tsv')
agingCount = connectionEnrichment(origins=globalAgingGenes, endpoints=globalAgingGenes)

Endpoints exception count: 15


In [None]:
# Aging results, largest count first, preceded by summary statistics.
agingSorted = sortDic(agingCount)
printResultsWithStats(agingSorted)

In [None]:
# control experiment 1
# Filter rather than list.remove(): remove() raises ValueError when 'data' has no '.DS_Store'.
# NOTE(review): os.listdir order is arbitrary, so datasets[0] may differ between machines.
datasets = [name for name in os.listdir('data') if name != '.DS_Store']
regulatory = createRegulatory(f'data/{datasets[0]}')
globalAgingGenes = createEndpoints('global_aging_genes.tsv')
# Random gene set of the same size as the aging gene set (seed 42).
randomGenes = createRandomEndpoints(regulatory, len(globalAgingGenes), 42)
controlCount1 = connectionEnrichment(randomGenes, randomGenes)

In [None]:
# Control 1 results, largest count first, preceded by summary statistics.
controlSorted1 = sortDic(controlCount1)
printResultsWithStats(controlSorted1)

In [None]:
# control experiment 2
# Filter rather than list.remove(): remove() raises ValueError when 'data' has no '.DS_Store'.
# NOTE(review): os.listdir order is arbitrary, so datasets[0] may differ between machines.
datasets = [name for name in os.listdir('data') if name != '.DS_Store']
regulatory = createRegulatory(f'data/{datasets[0]}')
globalAgingGenes = createEndpoints('global_aging_genes.tsv')
# Random gene set of the same size as the aging gene set (seed 43).
randomGenes = createRandomEndpoints(regulatory, len(globalAgingGenes), 43)
controlCount2 = connectionEnrichment(randomGenes, randomGenes)

In [None]:
# Control 2 results, largest count first, preceded by summary statistics.
controlSorted2 = sortDic(controlCount2)
printResultsWithStats(controlSorted2)

In [None]:
# control experiment 3
# Filter rather than list.remove(): remove() raises ValueError when 'data' has no '.DS_Store'.
# NOTE(review): os.listdir order is arbitrary, so datasets[0] may differ between machines.
datasets = [name for name in os.listdir('data') if name != '.DS_Store']
regulatory = createRegulatory(f'data/{datasets[0]}')
globalAgingGenes = createEndpoints('global_aging_genes.tsv')
# Random gene set of the same size as the aging gene set (seed 44).
randomGenes = createRandomEndpoints(regulatory, len(globalAgingGenes), 44)
controlCount3 = connectionEnrichment(randomGenes, randomGenes)

In [None]:
# Control 3 results, largest count first, preceded by summary statistics.
controlSorted3 = sortDic(controlCount3)
printResultsWithStats(controlSorted3)

In [None]:
# Histogram of per-node counts for the aging run and the three control runs.
agingVal = list(agingCount.values())
controlVal1 = list(controlCount1.values())
controlVal2 = list(controlCount2.values())
controlVal3 = list(controlCount3.values())

# One group label per count, so both columns of the frame line up.
agingName = ['aging'] * len(agingVal)
controlName1 = ['control1'] * len(controlVal1)
controlName2 = ['control2'] * len(controlVal2)
controlName3 = ['control3'] * len(controlVal3)

counts = [*agingVal, *controlVal1, *controlVal2, *controlVal3]
names = [*agingName, *controlName1, *controlName2, *controlName3]

df = pd.DataFrame({'counts': counts, 'names': names})

sns.set_theme(style="ticks")

f, ax = plt.subplots(figsize=(7, 5))
sns.despine(f)

# Drawn on the axes created above; bars of the four groups sit side by side.
sns.histplot(
    data=df,
    x="counts",
    hue="names",
    multiple="dodge",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
    bins=16,
    ax=ax,
)

ax.set_yscale('log')
ax.set_xlabel("Number of overlaps")

In [None]:
# Pairwise overlap between the node sets of the four experiments.
# For each (origin, endpoint) pair this prints:
#   shared node count // sum of the endpoint's counts over the shared nodes // integer mean
experimentCounts = {
    'aging': agingCount,
    'control1': controlCount1,
    'control2': controlCount2,
    'control3': controlCount3,
}

for name1, dataset1 in experimentCounts.items():
    print(f'Origin: {name1}')
    for name2, dataset2 in experimentCounts.items():
        print(f'Endpoint: {name2}')
        intersection = set(dataset1).intersection(dataset2)
        sideTotal2 = sum(dataset2[i] for i in intersection)
        if intersection:
            print(f'{len(intersection)} // {sideTotal2} // {int(sideTotal2 / len(intersection))}')
        else:
            # No shared nodes: the mean is undefined, so avoid dividing by zero.
            print(f'{len(intersection)} // {sideTotal2} // n/a')

In [None]:
# GenAge gene symbols translated to gene IDs.
# The converter is built here because `geneName_geneId` is not defined by any
# earlier cell of this notebook (it only exists inside my_functions).
from my_functions import createGeneConverter

genageDF = pd.read_csv("genage_human.csv")
genageList = list(genageDF['symbol'])
geneName_geneId, geneId_geneName = createGeneConverter()

# Symbols missing from the converter are reported and skipped instead of raising KeyError.
genageUnmapped = [name for name in genageList if name not in geneName_geneId]
print(f'GenAge exception count: {len(genageUnmapped)}')
genageENSGList = [geneName_geneId[name] for name in genageList if name in geneName_geneId]

In [None]:
# experiment: keep only the aging-run nodes that are GenAge genes
filteredAgingCount = {
    geneId: hits
    for geneId, hits in agingCount.items()
    if geneId in genageENSGList
}

In [None]:
# GenAge-filtered aging results, largest count first, preceded by summary statistics.
filteredAgingSorted = sortDic(filteredAgingCount)
printResultsWithStats(filteredAgingSorted)

In [None]:
# control 1: keep only the nodes that are GenAge genes
filteredControlCount1 = {
    geneId: hits
    for geneId, hits in controlCount1.items()
    if geneId in genageENSGList
}

In [None]:
# GenAge-filtered control 1 results, largest count first, preceded by summary statistics.
filteredControlSorted1 = sortDic(filteredControlCount1)
printResultsWithStats(filteredControlSorted1)

In [None]:
# control 2: keep only the nodes that are GenAge genes
filteredControlCount2 = {
    geneId: hits
    for geneId, hits in controlCount2.items()
    if geneId in genageENSGList
}

In [None]:
# GenAge-filtered control 2 results, largest count first, preceded by summary statistics.
filteredControlSorted2 = sortDic(filteredControlCount2)
printResultsWithStats(filteredControlSorted2)

In [None]:
# control 3: keep only the nodes that are GenAge genes
filteredControlCount3 = {
    geneId: hits
    for geneId, hits in controlCount3.items()
    if geneId in genageENSGList
}

In [None]:
# GenAge-filtered control 3 results, largest count first, preceded by summary statistics.
filteredControlSorted3 = sortDic(filteredControlCount3)
printResultsWithStats(filteredControlSorted3)

In [None]:
# Histogram of per-node counts, restricted to GenAge genes, for the aging run and the three control runs.
agingVal = list(filteredAgingCount.values())
controlVal1 = list(filteredControlCount1.values())
controlVal2 = list(filteredControlCount2.values())
controlVal3 = list(filteredControlCount3.values())

# One group label per count, so both columns of the frame line up.
agingName = ['aging'] * len(agingVal)
controlName1 = ['control1'] * len(controlVal1)
controlName2 = ['control2'] * len(controlVal2)
controlName3 = ['control3'] * len(controlVal3)

counts = [*agingVal, *controlVal1, *controlVal2, *controlVal3]
names = [*agingName, *controlName1, *controlName2, *controlName3]

df = pd.DataFrame({'counts': counts, 'names': names})

sns.set_theme(style="ticks")

f, ax = plt.subplots(figsize=(7, 5))
sns.despine(f)

# Drawn on the axes created above; bars of the four groups sit side by side.
sns.histplot(
    data=df,
    x="counts",
    hue="names",
    multiple="dodge",
    palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
    bins=16,
    ax=ax,
)

ax.set_yscale('log')
ax.set_xlabel("Number of overlaps between central genes and GenAge")