In [1]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch

import networkx as nx
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.utils import to_undirected

from ogb.io import DatasetSaver
from ogb.linkproppred import LinkPropPredDataset

## README 
https://www.eqtlgen.org/phase1.html

### cis-eQTL
This README accompanies the files with cis-eQTL results from eQTLGen.


#### File
- File with full cis-eQTL results: 2019-12-11-cis-eQTLsFDR-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz
- File with significant (FDR<0.05) cis-eQTL results: 2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded.txt.gz 

#### Column Names
- Pvalue - P-value
- SNP - SNP rs ID
- SNPChr - SNP chromosome
- SNPPos - SNP position
- AssessedAllele - Assessed allele, the Z-score refers to this allele
- OtherAllele - Not assessed allele
- Zscore - Z-score
- Gene - ENSG name (Ensembl v71) of the eQTL gene
- GeneSymbol - HGNC name of the gene
- GeneChr - Gene chromosome
- GenePos - Centre of gene position
- NrCohorts - Total number of cohorts where this SNP-gene combination was tested
- NrSamples - Total number of samples where this SNP-gene combination was tested
- FDR - False discovery rate estimated based on permutations
- BonferroniP - P-value after Bonferroni correction

#### Additional information
- These files contain all cis-eQTL results from eQTLGen, accompanying the article.
- 19,250 genes that showed expression in blood were tested.
- Every SNP-gene combination with a distance <1Mb from the center of the gene and tested in at least 2 cohorts was included.
- Associations where SNP/proxy positioned in Illumina probe were not removed from combined analysis.


### trans-eQTL

This README accompanies the file with trans-eQTL results from eQTLGen 

- File with full trans-eQTL results: 2018-09-04-trans-eQTLsFDR-CohortInfoRemoved-BonferroniAdded.txt.gz
- File with significant (FDR<0.05) trans-eQTL results: 2018-09-04-trans-eQTLsFDR0.05-CohortInfoRemoved-BonferroniAdded.txt.gz

#### Column Names
- Pvalue - P-value
- SNP - SNP rs ID
- SNPChr - SNP chromosome
- SNPPos - SNP position
- AssessedAllele - Assessed allele, the Z-score refers to this allele
- OtherAllele - Not assessed allele
- Zscore - Z-score
- Gene - ENSG name (Ensembl v71) of the eQTL gene
- GeneSymbol - HGNC name of the gene
- GeneChr - Gene chromosome
- GenePos - Centre of gene position
- NrCohorts - Total number of cohorts where this SNP-gene combination was tested
- NrSamples - Total number of samples where this SNP-gene combination was tested
- FDR - False discovery rate estimated based on permutations
- BonferroniP - P-value after Bonferroni correction

#### Additional information
- These files contain all trans-eQTL results from eQTLGen, accompanying the article.
- 19,960 genes that showed expression in blood were tested.
- 10,317 trait-associated SNPs (based on GWAS Catalog, Immunobase and Astle et al. study, see Online Methods) were tested.
- Every SNP-gene combination with a distance >5Mb and tested in at least 2 cohorts was included.

#### FDR calculation
---------------
To determine nominal P-value threshold corresponding to FDR=0.05, we used a pruned set of SNPs for trans-eQTL mapping and permutation-based FDR calculation (See Methods).

#### Crossmapping filter
-------------------
Some trans-eQTL are artefacts resulting from genes that map (partially) to a location nearby the eQTL SNP, effectively representing a cis-eQTL effect.
These potential artefacts were identified by mapping trans-eQTL gene sequences to the immediate surroundings of the SNP (Supplementary Note).
If there was strong evidence that the trans-eQTL was a cross-mapping artefact, it was removed from the list of significant trans-eQTLs.
After filtering, the FDR was re-calculated on the remaining effects.

PLEASE NOTE: the full results file has not been filtered for cross-mapping effects.
The file may include artefacts resulting from genes that map (partially) to a location nearby the eQTL SNP, effectively representing a cis-eQTL effect.

In [2]:
# Load the cis- and trans-eQTL tables; both files are tab-separated.
cis, trans = (pd.read_csv(path, sep='\t') for path in ("sig-cis.txt", "sig-trans.txt"))

In [3]:
# Preview the first five cis-eQTL rows
cis.head(n=5)

Unnamed: 0,Pvalue,SNP,SNPChr,SNPPos,AssessedAllele,OtherAllele,Zscore,Gene,GeneSymbol,GeneChr,GenePos,NrCohorts,NrSamples,FDR,BonferroniP
0,3.2717e-310,rs12230244,12,10117369,T,A,200.7534,ENSG00000172322,CLEC12A,12,10126104,34,30596,0.0,4.1662e-302
1,3.2717e-310,rs12229020,12,10117683,G,C,200.6568,ENSG00000172322,CLEC12A,12,10126104,34,30596,0.0,4.1662e-302
2,3.2717e-310,rs61913527,12,10116198,T,C,200.2654,ENSG00000172322,CLEC12A,12,10126104,34,30598,0.0,4.1662e-302
3,3.2717e-310,rs2594103,12,10115428,T,C,200.042,ENSG00000172322,CLEC12A,12,10126104,34,30598,0.0,4.1662e-302
4,3.2717e-310,rs12231833,12,10118428,A,G,199.9508,ENSG00000172322,CLEC12A,12,10126104,34,30592,0.0,4.1662e-302


In [4]:
# Preview the first five trans-eQTL rows
trans.head(n=5)

Unnamed: 0,Pvalue,SNP,SNPChr,SNPPos,AssessedAllele,OtherAllele,Zscore,Gene,GeneSymbol,GeneChr,GenePos,NrCohorts,NrSamples,FDR,BonferroniP
0,1.128895e-308,rs3811444,1,248039451,T,C,-72.1091,ENSG00000166086,JAM3,11,133980358,37,31684,0.0,6.6595e-302
1,1.128895e-308,rs35340377,1,248038210,A,G,-65.4239,ENSG00000166086,JAM3,11,133980358,37,31684,0.0,6.6595e-302
2,1.128895e-308,rs705705,12,56435504,C,G,60.9665,ENSG00000265688,MAFG-AS1,17,79887167,9,10740,0.0,6.6595e-302
3,1.128895e-308,rs1131017,12,56435929,C,G,59.0847,ENSG00000265688,MAFG-AS1,17,79887167,20,15015,0.0,6.6595e-302
4,1.128895e-308,rs10876864,12,56401085,G,A,58.8148,ENSG00000265688,MAFG-AS1,17,79887167,19,14930,0.0,6.6595e-302


In [5]:
# Record the library versions this notebook was executed with.
torch_version = torch.__version__
pyg_version = torch_geometric.__version__
print(f"PyTorch version: {torch_version}")
print(f"PyTorch Geometric version: {pyg_version}")

PyTorch version: 2.0.0+cu118
PyTorch Geometric version: 2.3.1


In [6]:
# Combine the cis and trans dataframes; each row (one SNP-gene association)
# becomes one edge of the graph built below.
data = pd.concat([cis, trans], ignore_index=True)

# Create mappings for genes and SNPs to integer indices.
# Genes occupy ids [0, len(genes)); SNP ids are offset by len(genes) so the
# two node kinds share a single contiguous id space.
genes = data['Gene'].unique()
snps = data['SNP'].unique()
gene_to_idx = {gene: idx for idx, gene in enumerate(genes)}
snp_to_idx = {snp: idx + len(genes) for idx, snp in enumerate(snps)}

# Create node type labels (0 = gene, 1 = SNP), ordered like the node ids above
node_types = torch.tensor([0] * len(genes) + [1] * len(snps), dtype=torch.long)

# Create edges.
# Look the ids up column-wise instead of with a row-wise `apply(axis=1)`:
# the result is the same [2, num_rows] tensor (row 0 = gene id, row 1 = SNP id)
# without invoking a Python lambda once per table row.
gene_ids = data['Gene'].map(gene_to_idx).to_numpy()
snp_ids = data['SNP'].map(snp_to_idx).to_numpy()
edges = torch.tensor(np.stack([gene_ids, snp_ids]), dtype=torch.long)

# Convert edges to undirected
edges = to_undirected(edges)

# Create the PyTorch Geometric graph; the only node feature is the type label
graph = Data(x=node_types.view(-1, 1), edge_index=edges)

print(graph)

Data(x=[3681495, 1], edge_index=[2, 21134900])


In [7]:
# Mirror the PyG edge list into a NetworkX graph to compute summary statistics.
G = nx.Graph()
G.add_edges_from((src, dst) for src, dst in graph.edge_index.t().numpy())

# Basic size
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

# Connectivity
num_connected_components = nx.number_connected_components(G)

# Degree-based statistics
node_degrees = [deg for _node, deg in G.degree()]
average_degree = np.mean(node_degrees)
density = nx.density(G)
assortativity = nx.degree_assortativity_coefficient(G)

# Report
print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")
print(f"Number of connected components: {num_connected_components}")
print(f"Average degree: {average_degree:.2f}")
print(f"Density: {density:.10f}")
print(f"Assortativity: {assortativity:.10f}")

Number of nodes: 3681495
Number of edges: 10567450
Number of connected components: 424
Average degree: 5.74
Density: 0.0000015594
Assortativity: -0.2267915607


## OGB Dataset

In [8]:
# Instantiate the OGB DatasetSaver for a homogeneous (non-hetero) dataset:
dataset_name = 'ogbl-eQTL'
saver = DatasetSaver(
    dataset_name=dataset_name,
    is_hetero=False,
    version=1,
)

In [9]:
# Map every distinct gene and SNP to an integer node id.
# Genes take ids 0..len(genes)-1; SNP ids continue from len(genes).
genes = data['Gene'].unique()
snps = data['SNP'].unique()
gene_to_idx = dict(zip(genes, range(len(genes))))
snp_to_idx = {snp: node_id for node_id, snp in enumerate(snps, start=len(genes))}

In [10]:
# Node type labels in node-id order: 0 for each gene, then 1 for each SNP.
node_types = torch.cat([
    torch.zeros(len(genes), dtype=torch.long),
    torch.ones(len(snps), dtype=torch.long),
])

In [11]:
# Create edges and convert them to undirected.
# The ids are looked up column-wise rather than with a row-wise
# `apply(axis=1)`: this yields the same [2, num_rows] tensor
# (row 0 = gene id, row 1 = SNP id) without a Python call per table row.
gene_ids = data['Gene'].map(gene_to_idx).to_numpy()
snp_ids = data['SNP'].map(snp_to_idx).to_numpy()
edges = torch.tensor(np.stack([gene_ids, snp_ids]), dtype=torch.long)
edges = to_undirected(edges)

In [12]:
# Build the PyTorch Geometric graph; node features are the type labels as a column.
graph = Data(x=node_types.view(-1, 1), edge_index=edges)

# Re-express the graph as the plain dictionary of numpy arrays handed to the saver.
graph_dict = {}
graph_dict['edge_index'] = graph.edge_index.numpy()
graph_dict['num_nodes'] = graph.num_nodes
graph_dict['node_feat'] = graph.x.numpy()

# The saver takes a list of graphs; this dataset consists of exactly one.
graph_list = [graph_dict]
saver.save_graph_list(graph_list)

dict_keys(['edge_index', 'num_nodes', 'node_feat'])
Saving edge_index
Saving all the files!
Validating...
Reading saved files
Loading necessary files...
This might take a while.
Processing graphs...


100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


Checking read graphs and given graphs are the same


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.62it/s]


In [13]:
# Calculate the number of data points and create a reproducible permutation of indices.
# The seed is fixed so that re-running the notebook regenerates the same split.
SPLIT_SEED = 42
# edge_index holds both directions of every undirected edge, hence the halving.
num_data = graph.edge_index.shape[1] // 2
perm = np.random.default_rng(SPLIT_SEED).permutation(num_data)
# NOTE(review): `perm` contains positions in [0, num_data), not (source, target)
# pairs — confirm that consumers of the saved split expect indices of this kind.

In [14]:
# Cut the shuffled indices into 80% train, 10% validation and the remainder as test.
train_size = int(0.8 * num_data)
valid_size = int(0.1 * num_data)

train_part, valid_part, test_part = np.split(perm, [train_size, train_size + valid_size])
split_idx = {'train': train_part, 'valid': valid_part, 'test': test_part}

In [15]:
# Persist the train/valid/test indices through the saver under the split name 'random':
split_name = 'random'
saver.save_split(split_idx, split_name=split_name)

In [16]:
# Prepare the mapping directory (mapping files plus README.md) and hand it
# to the saver via saver.copy_mapping_dir(mapping_path).

mapping_path = 'mapping/'

# The directory may already exist from a previous run; that is not an error.
os.makedirs(mapping_path, exist_ok=True)

# Create (or truncate) an empty placeholder README.md inside it.
readme_path = os.path.join(mapping_path, 'README.md')
open(readme_path, 'w').close()

saver.copy_mapping_dir(mapping_path)

In [17]:
# Register the task type and the evaluation metric with the saver
saver.save_task_info(
    task_type='link_prediction',
    eval_metric='rocauc',
)

link_prediction
None


In [18]:
# Get the meta information dictionary from the saver; it is passed to
# LinkPropPredDataset below to load the dataset that was just written.
meta_dict = saver.get_meta_dict()

In [19]:
# Sanity check: load the saved dataset through the OGB loader and print its first graph.
dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
first_graph = dataset[0]
print(first_graph)

Loading necessary files...
This might take a while.
Processing graphs...


100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]

Saving...





{'edge_index': array([[      0,       0,       0, ..., 3681492, 3681493, 3681494],
       [  17470,   17471,   17472, ...,   11916,    6531,    6488]],
      dtype=int64), 'edge_feat': None, 'node_feat': array([[0],
       [0],
       [0],
       ...,
       [1],
       [1],
       [1]], dtype=int64), 'num_nodes': 3681495}


In [20]:
# Zip and clean up: archive the saved dataset first, then let the saver tidy up.
# NOTE(review): presumably cleanup() removes the saver's working files once the
# archive exists — confirm against the OGB DatasetSaver documentation.
saver.zip()
saver.cleanup()