# Grapher

## Data description

### UKBB_94traits_release1.{tsv|bed}.gz

This file contains genetic variant data from a study of 94 complex diseases and traits in the UK Biobank. Each row represents one variant–trait fine-mapping result, with columns describing the variant's genomic location, its alleles, its marginal association statistics, and its fine-mapping results. It also includes indicators for linkage disequilibrium with variants that fail Hardy–Weinberg equilibrium or with common structural variants. This file is particularly valuable for those interested in the genetic association results and the fine-mapping of these traits and diseases.

Columns:

1. Chromosome: hg19 autosomes only
2. Start: 0-indexed hg19 start position
3. End: 0-indexed hg19 end position
4. Variant: unique variant identifier (chr:pos:ref:alt)
5. rsid: rsid identifier
6. Allele1: reference allele in hg19
7. Allele2: alternative allele in hg19
8. Minor allele: minor allele in cohort
9. Cohort: GWAS cohort
10. Model_marginal: type of regression model used
11. Method: fine-mapping method used
12. Trait: abbreviation for phenotype in genetic association tests
13. Region: fine-mapping region in hg19
14. MAF: minor allele frequency in cohort
15. Beta_marginal: marginal association effect size (effect allele: alternative)
16. SE_marginal: standard error on marginal association effect size
17. Chisq_marginal: test statistic for marginal association
18. PIP: posterior probability of association from fine-mapping
19. CS_ID: ID of 95% credible set (-1 if variant not in 95% CS)
20. Beta_posterior: posterior expectation of true effect size (effect allele: alternative)
21. SD_posterior: posterior standard deviation of true effect size
22. LD_HWE: indicator for LD (R^2 > 0.6) with a variant that failed HWE (p < 10^-12) in UK10K LD
23. LD_SV: indicator for LD (R^2 > 0.8) with a common structural variant in gnomAD European samples

## Load libraries

In [None]:
import os
import pandoc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.utils import to_undirected, negative_sampling

import networkx as nx
from ogb.io import DatasetSaver
from ogb.linkproppred import LinkPropPredDataset

from scipy.spatial import cKDTree

## Perform checks

In [None]:
# Report the installed versions of the two deep-learning libraries used below.
print(
    f"PyTorch version: {torch.__version__}",
    f"PyTorch Geometric version: {torch_geometric.__version__}",
    sep="\n",
)

In [None]:
# Pick the compute device and report what was found.
# `device` is bound on both branches so that later cells can rely on it
# even when no CUDA device is present.
if torch.cuda.is_available():
    device = torch.device("cuda")          # Current CUDA device
    print(f"Using {torch.cuda.get_device_name()} ({device})")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Number of CUDA devices: {torch.cuda.device_count()}")
else:
    device = torch.device("cpu")
    print("CUDA is not available on this device.")

## Load data

In [None]:
# Load the fine-mapping results and the gene position table from CSV files.
# NOTE(review): the data description above names a .tsv/.bed.gz release, while
# this cell reads a .csv — presumably a converted copy; confirm the conversion
# kept the column names used below (variant, chromosome, start, ...).
data = pd.read_csv('~/Desktop/geometric-omics/UKBB-fine-mapping/data/UKBB_94traits_release1.csv')
# Columns read from this table later in the notebook: chrom, txStart, geneSymbol.
hg19_gene_positions = pd.read_csv("~/Desktop/geometric-omics/UKBB-fine-mapping/data/hg19-gene-positions.csv")

In [None]:
# The variant identifier has the form chr:pos:ref:alt; take the second field.
# Convert it to a number so that it is treated as a genomic coordinate later on
# (a string column would be label-encoded like a category). Malformed
# identifiers become NaN instead of raising.
data['position'] = pd.to_numeric(data['variant'].str.split(':').str[1], errors='coerce')

## Get geneSymbol

In [None]:
%%time

# Annotate every variant with the symbol of the gene whose txStart is nearest
# to the variant's start, provided that gene lies within `leniency` base pairs.
# Variants without such a gene get the placeholder string 'N/A'.

# Sort both tables by coordinate and store chromosome labels as strings so the
# per-chromosome comparisons below use one consistent type.
# NOTE(review): this assumes both tables spell chromosomes the same way
# (e.g. "1" vs "chr1"); if they differ, every variant ends up as 'N/A'.
data = data.sort_values(by=['chromosome', 'start'])
data['chromosome'] = data['chromosome'].astype(str)

hg19_gene_positions = hg19_gene_positions.sort_values(by=['chrom', 'txStart'])
hg19_gene_positions['chrom'] = hg19_gene_positions['chrom'].astype(str)

# Maximum allowed distance between a variant start and a gene txStart
leniency = 100000

# Convert the chromosome columns to category type for efficient memory usage
data['chromosome'] = data['chromosome'].astype('category')
hg19_gene_positions['chrom'] = hg19_gene_positions['chrom'].astype('category')

# Maps a row label of `data` to the gene symbol chosen for that row
gene_symbols_dict = {}

# Handle one chromosome at a time so distances are never compared across chromosomes
for chromosome in data['chromosome'].cat.categories:
    data_chromosome = data[data['chromosome'] == chromosome]
    hg19_gene_positions_chromosome = hg19_gene_positions[hg19_gene_positions['chrom'] == chromosome]

    # Start with the placeholder everywhere and overwrite the matched rows
    gene_symbols = np.full(len(data_chromosome), 'N/A', dtype=object)

    # Skip the tree entirely when the gene table has no rows for this chromosome
    if len(hg19_gene_positions_chromosome) > 0:
        # 1-D KDTree over the txStart coordinates for nearest-neighbour search
        tree = cKDTree(hg19_gene_positions_chromosome['txStart'].to_numpy()[:, None])
        distances, indices = tree.query(
            data_chromosome['start'].to_numpy()[:, None],
            distance_upper_bound=leniency,
        )

        # A query without a neighbour inside the bound is reported with an
        # infinite distance (and an out-of-range index), so mask those out
        # before indexing.
        matched = distances != np.inf
        chromosome_gene_names = hg19_gene_positions_chromosome['geneSymbol'].to_numpy()
        gene_symbols[matched] = chromosome_gene_names[indices[matched]]

    gene_symbols_dict.update(zip(data_chromosome.index, gene_symbols))

# Convert the dictionary to a Series (aligned on the row labels) and store it in 'data'
data['geneSymbol'] = pd.Series(gene_symbols_dict)

In [None]:
# Summary statistics for the gene annotation.
null_values = data['geneSymbol'].isnull().sum()
print("Number of null values in data['geneSymbol']: ", null_values)

# Variants without a nearby gene carry the string 'N/A', which isnull() does
# not count, so report them separately.
unmatched_values = (data['geneSymbol'] == 'N/A').sum()
print("Number of 'N/A' (unmatched) values in data['geneSymbol']: ", unmatched_values)

# Note: the 'N/A' placeholder, if present, is counted as one unique element here.
unique_elements1 = data['geneSymbol'].nunique()
print("Number of unique elements in data['geneSymbol']: ", unique_elements1)

unique_elements2 = hg19_gene_positions['geneSymbol'].nunique()
print("Number of unique elements in hg19_gene_positions['geneSymbol']: ", unique_elements2)

In [None]:
#data = data.sample(frac=0.2) 

## Proposed graph features

All node features listed below are taken from columns of the `data` dataframe:

Phenotype nodes features:
- `trait` column

Gene nodes features:
- `geneSymbol` column
- `chromosome` column
- `start` column
- `end` column

SNP node features:
- `rsid` column
- `chromosome` column
- `position` column
- `allele1` column
- `allele2` column

Edge features:
- undirected
- unweighted 
- positive if associations exist in graph
- negative if not (100 random negative edges for every positive edge)

## Create graph

In [None]:
%%time

import random

# Build one homogeneous graph whose nodes are phenotypes, genes and SNPs (in
# that index order) and whose edges are the observed SNP-gene and
# gene-phenotype associations.

# Number of random negative edges drawn per positive edge
# (see "Proposed graph features": 100 negatives for every positive).
NEGATIVES_PER_POSITIVE = 100


def _sample_negative_edges(positive_edges, src_offset, num_src, dst_offset, num_dst, num_samples):
    """Draw random (source, target) node pairs that are not positive edges.

    Pairs are sampled uniformly with replacement, so the result may contain
    repeated pairs but never a pair listed in `positive_edges`.

    Args:
        positive_edges: CPU long tensor of shape [2, E] holding global node
            indices (row 0 = source nodes, row 1 = target nodes).
        src_offset: global index of the first source-type node.
        num_src: number of source-type nodes.
        dst_offset: global index of the first target-type node.
        num_dst: number of target-type nodes.
        num_samples: number of negative pairs to return.

    Returns:
        Long tensor of shape [2, num_samples] holding global node indices.

    Raises:
        ValueError: if negatives are requested but every possible pair is
            already a positive edge.
    """
    total_pairs = num_src * num_dst
    positive = positive_edges.numpy()
    # Encode each (source, target) pair as one integer so membership tests are vectorised.
    positive_keys = np.unique((positive[0] - src_offset) * num_dst + (positive[1] - dst_offset))
    if num_samples > 0 and positive_keys.size >= total_pairs:
        raise ValueError("Every possible node pair is a positive edge; cannot sample negatives.")

    rng = np.random.default_rng()
    accepted = []
    remaining = num_samples
    while remaining > 0:
        candidates = rng.integers(0, total_pairs, size=remaining, dtype=np.int64)
        candidates = candidates[~np.isin(candidates, positive_keys)]
        accepted.append(candidates)
        remaining -= candidates.size

    keys = np.concatenate(accepted) if accepted else np.empty(0, dtype=np.int64)
    # Decode the keys and shift back into the global node index space.
    sampled = np.stack([keys // num_dst + src_offset, keys % num_dst + dst_offset])
    return torch.from_numpy(sampled).long().contiguous()


# Node lists. unique() returns values in order of first appearance, and that
# order defines the node indices. Missing identifiers are not turned into nodes.
phenotypes = data['trait'].dropna().unique()
genes = data['geneSymbol'].dropna().unique()
snps = data['rsid'].dropna().unique()

# Global node index layout: [phenotypes | genes | SNPs]
gene_offset = len(phenotypes)
snp_offset = gene_offset + len(genes)
phenotype_to_idx = {phenotype: idx for idx, phenotype in enumerate(phenotypes)}
gene_to_idx = {gene: idx + gene_offset for idx, gene in enumerate(genes)}
snp_to_idx = {snp: idx + snp_offset for idx, snp in enumerate(snps)}

# Node feature tables: exactly one row per node, in node index order.
# drop_duplicates(subset=...) keeps the first occurrence of each identifier,
# which is the same order unique() produced above.
# NOTE(review): for genes, chromosome/start/end are those of the first variant
# assigned to the gene, not the gene's own coordinates.
phenotype_features = pd.DataFrame({'trait': phenotypes})
gene_features = (
    data.dropna(subset=['geneSymbol'])
    .drop_duplicates(subset='geneSymbol')[['geneSymbol', 'chromosome', 'start', 'end']]
    .reset_index(drop=True)
)
snp_features = (
    data.dropna(subset=['rsid'])
    .drop_duplicates(subset='rsid')[['rsid', 'chromosome', 'position', 'allele1', 'allele2']]
    .reset_index(drop=True)
)

# Create node type labels (0 = phenotype, 1 = gene, 2 = SNP)
node_types = torch.tensor([0] * len(phenotypes) + [1] * len(genes) + [2] * len(snps), dtype=torch.long)

# Create positive edges between SNPs and genes
snp_gene_pairs = data[['rsid', 'geneSymbol']].dropna().drop_duplicates()
positive_edges_snp_gene = torch.tensor(
    np.stack([
        snp_gene_pairs['rsid'].map(snp_to_idx).to_numpy(),
        snp_gene_pairs['geneSymbol'].map(gene_to_idx).to_numpy(),
    ]),
    dtype=torch.long,
).contiguous()

# Create positive edges between genes and phenotypes
gene_phenotype_pairs = data[['geneSymbol', 'trait']].dropna().drop_duplicates()
positive_edges_gene_phenotype = torch.tensor(
    np.stack([
        gene_phenotype_pairs['geneSymbol'].map(gene_to_idx).to_numpy(),
        gene_phenotype_pairs['trait'].map(phenotype_to_idx).to_numpy(),
    ]),
    dtype=torch.long,
).contiguous()

# Create negative edges in the same global index space as the positives.
# They are kept as separate tensors and are NOT added to the graph: putting
# them into edge_index would turn them into real edges for message passing.
negative_edges_snp_gene = _sample_negative_edges(
    positive_edges_snp_gene, snp_offset, len(snps), gene_offset, len(genes),
    NEGATIVES_PER_POSITIVE * positive_edges_snp_gene.size(1),
)
negative_edges_gene_phenotype = _sample_negative_edges(
    positive_edges_gene_phenotype, gene_offset, len(genes), 0, len(phenotypes),
    NEGATIVES_PER_POSITIVE * positive_edges_gene_phenotype.size(1),
)

# The graph is undirected, so store every positive edge in both directions.
edges = torch.cat([positive_edges_snp_gene, positive_edges_gene_phenotype], dim=1)
edges = to_undirected(edges, num_nodes=node_types.size(0))

# Create edge attributes (unweighted graph: every edge has weight 1)
edge_attr = torch.ones(edges.size(1), dtype=torch.float)

# Combine the feature tables into one frame with a fixed column order.
# Columns a node type does not have are NaN at this point.
feature_columns = ['chromosome', 'start', 'end', 'position', 'allele1', 'allele2']
combined_features = pd.concat(
    [phenotype_features, gene_features, snp_features], ignore_index=True
).reindex(columns=feature_columns)

numerical_columns = ['start', 'end', 'position']
categorical_columns = ['chromosome', 'allele1', 'allele2']

# Numerical features: force numeric dtype, fill missing values with 0, standardise.
for col in numerical_columns:
    combined_features[col] = pd.to_numeric(combined_features[col], errors='coerce').fillna(0)
scaler = StandardScaler()
combined_features[numerical_columns] = scaler.fit_transform(combined_features[numerical_columns])

# Categorical features: fill missing values with 'N/A', then integer-encode.
for col in categorical_columns:
    as_text = combined_features[col].astype(object)
    as_text = as_text.where(as_text.notna(), 'N/A').astype(str)
    combined_features[col] = as_text.astype('category').cat.codes

features = torch.tensor(combined_features.to_numpy(dtype=np.float32), dtype=torch.float)

# Every node needs exactly one feature row; fail loudly if that ever breaks.
if features.size(0) != node_types.size(0):
    raise RuntimeError(
        f"Feature rows ({features.size(0)}) do not match node count ({node_types.size(0)})"
    )

# Create the PyTorch Geometric graph
graph = Data(x=features, edge_index=edges, edge_attr=edge_attr)
graph.node_types = node_types

print(f"Number of nodes: {graph.num_nodes}")
print(f"Number of positive edges between SNPs and genes: {positive_edges_snp_gene.size(1)}")
print(f"Number of positive edges between genes and phenotypes: {positive_edges_gene_phenotype.size(1)}")
print(f"Number of negative edges for SNPs and genes: {negative_edges_snp_gene.size(1)}")
print(f"Number of negative edges for genes and phenotypes: {negative_edges_gene_phenotype.size(1)}")
print(f"Number of edges: {graph.num_edges}")
print(f"Node feature dimension: {graph.num_node_features}")
print(f"Node types: {graph.node_types}")

## Graph stats

In [None]:
# Check for NaN values in features
# Sanity check: the node feature matrix should not contain any NaN entries.
nan_in_features = bool(torch.isnan(graph.x).any())
print(f"Are there any NaN values in features? {nan_in_features}")

In [None]:
from torch_geometric.utils import degree

# Compute the degree of each node, counted as the number of times the node
# appears as the source of an edge. Passing num_nodes keeps the result aligned
# with the node list, so nodes that never appear as a source get degree 0
# instead of being left out.
degrees = degree(graph.edge_index[0], num_nodes=graph.num_nodes)

# Print statistics about the degrees
print(f"Minimum degree: {degrees.min().item()}")
print(f"Maximum degree: {degrees.max().item()}")
print(f"Average degree: {degrees.float().mean().item()}")

# Count the number of nodes with degree 1
num_nodes_degree_1 = (degrees == 1).sum().item()
print(f"Number of nodes with degree 1: {num_nodes_degree_1}")

## Data splitting

In [None]:
from torch_geometric.transforms import RandomLinkSplit 

# Split the edges into training, validation (20%) and test (20%) graphs.
# NOTE(review): is_undirected=True presumably expects edge_index to hold both
# directions of every edge — confirm the graph is built that way.
transform = RandomLinkSplit(is_undirected=True, num_val=0.2, num_test=0.2)
graph_train, graph_val, graph_test = transform(graph)

# Show a one-line summary of each split.
print(graph_train, graph_val, graph_test, sep="\n")

## Create model

In [None]:
# Task: Link prediction: does an edge exist between two nodes?
# Node Types: 0 = phenotypes, 1 = gene, 2 = snps
# Node Feature Vector: 6-dimensional
#   (the six feature columns built in the graph cell:
#    chromosome, start, end, position, allele1, allele2)

# Ask PyTorch to release cached GPU memory before the model is built;
# presumably useful when this cell is re-run in the same session.
torch.cuda.empty_cache()

# Define the GCN model
class GCN(torch.nn.Module):
    """Two-layer graph convolutional encoder that produces node embeddings.

    Args:
        hidden_channels: width of the hidden layer and of the output embedding.
        in_channels: number of input features per node. Defaults to 6, the
            feature width used in this notebook.
        dropout: dropout probability applied between the two layers during
            training. Defaults to 0.5.
    """

    def __init__(self, hidden_channels, in_channels=6, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)

    def forward(self, x, edge_index):
        """Return one embedding per node.

        Args:
            x: node feature matrix with `in_channels` columns.
            edge_index: edges used for message passing.

        Returns:
            Tensor with one row of `hidden_channels` values per node.
        """
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active in training mode (self.training is True).
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# Train and evaluate the model
# Choose the device, then place the model and all three graph splits on it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = GCN(hidden_channels=16)
model = model.to(device)

graph_train, graph_val, graph_test = (
    split.to(device) for split in (graph_train, graph_val, graph_test)
)

# Adam with L2 regularisation (weight decay) on all model parameters.
optimizer = torch.optim.Adam(model.parameters(), weight_decay=5e-4, lr=0.01)

# Train function
from torch_geometric.utils import negative_sampling
def train():
    """Run one full-batch optimisation step on the training graph.

    Node embeddings are computed from `graph_train`. Every edge in
    `graph_train.edge_index` is scored as a positive example and an equal
    number of freshly sampled non-edges as negative examples; a pair's score
    is the dot product of its two node embeddings.

    Uses the module-level `model`, `optimizer` and `graph_train`.

    Returns:
        The binary cross-entropy loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()
    z = model(graph_train.x.float(), graph_train.edge_index)

    # Only consider positive edges for the positive score calculation
    pos_edge_index = graph_train.edge_index
    pos = (z[pos_edge_index[0]] * z[pos_edge_index[1]]).sum(dim=-1)

    # Use negative_sampling to generate negative edges
    neg_edge_index = negative_sampling(
        edge_index=pos_edge_index,
        num_nodes=z.size(0),
        num_neg_samples=pos_edge_index.size(1),
    )
    neg = (z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim=-1)

    logits = torch.cat([pos, neg], dim=0)
    # Build the targets directly on the scores' device instead of going
    # through a Python list and a host-to-device copy on every step.
    targets = torch.cat([torch.ones_like(pos), torch.zeros_like(neg)], dim=0)

    loss = F.binary_cross_entropy_with_logits(logits, targets)
    loss.backward()
    optimizer.step()
    return loss.item()


# Evaluation function
def evaluate(edge_index, graph):
    """Score positive edges against sampled negatives and compute metrics.

    Node embeddings are computed from `graph.x` and `graph.edge_index`. The
    pairs in `edge_index` are scored as positives, and an equal number of
    sampled node pairs as negatives. Negatives are sampled so that they
    coincide neither with `edge_index` nor with the edges of `graph`.

    Uses the module-level `model`.

    Args:
        edge_index: tensor of shape [2, E] with the positive node pairs to score.
        graph: graph object providing `x`, `edge_index` and `num_nodes`.

    Returns:
        Tuple `(roc_auc, mrr, hits_at_5)` computed from the sigmoid scores by
        `roc_auc_score`, `compute_mrr` and `compute_hits_at_k` (k=5).

    Raises:
        ValueError: if `edge_index` contains no edges.
    """
    if edge_index.size(1) == 0:
        raise ValueError("evaluate() needs at least one positive edge to score.")

    model.eval()
    with torch.no_grad():
        z = model(graph.x.float(), graph.edge_index)
        pos = torch.sigmoid((z[edge_index[0]] * z[edge_index[1]]).sum(dim=-1)).view(-1)

        # Exclude both the message-passing edges and the evaluated positives
        # from the pool that negatives are drawn from.
        known_edges = torch.cat([graph.edge_index, edge_index], dim=1)
        neg_edge_index = negative_sampling(
            known_edges,
            num_nodes=graph.num_nodes,
            num_neg_samples=edge_index.size(1),
        )
        neg = torch.sigmoid((z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim=-1)).view(-1)

    pos_scores = pos.cpu().numpy()
    neg_scores = neg.cpu().numpy()

    # Positives first, negatives second; the labels follow the same layout.
    preds = np.concatenate([pos_scores, neg_scores])
    true_labels = np.concatenate([np.ones_like(pos_scores), np.zeros_like(neg_scores)])

    roc_auc = roc_auc_score(true_labels, preds)
    mrr = compute_mrr(preds, true_labels)
    hits_at_5 = compute_hits_at_k(preds, true_labels, k=5)

    return roc_auc, mrr, hits_at_5

def compute_mrr(preds, true_labels):
    """Return the mean reciprocal rank of the positive examples.

    Each positive example is ranked against the pool of all negative examples.
    Its rank is 1 plus the number of negatives scored above it; negatives that
    tie with it count as half. The result is the mean of 1 / rank over all
    positives.

    Args:
        preds: 1-D array of predicted scores (higher means more likely positive).
        true_labels: 1-D array of the same length; entries equal to 1 mark
            positives, every other value marks a negative.

    Returns:
        The mean reciprocal rank as a float in (0, 1], or 0.0 when there are
        no positive examples.
    """
    scores = np.asarray(preds, dtype=float).ravel()
    labels = np.asarray(true_labels).ravel()

    pos_scores = scores[labels == 1]
    if pos_scores.size == 0:
        return 0.0

    # Sort the negatives once so each positive is ranked with a binary search.
    neg_sorted = np.sort(scores[labels != 1])
    num_neg = neg_sorted.size
    num_greater = num_neg - np.searchsorted(neg_sorted, pos_scores, side='right')
    num_greater_or_equal = num_neg - np.searchsorted(neg_sorted, pos_scores, side='left')

    # Average of the optimistic and pessimistic rank, so ties count as half.
    ranks = 0.5 * (num_greater + num_greater_or_equal) + 1.0
    return float(np.mean(1.0 / ranks))

def compute_hits_at_k(preds, true_labels, k=5):
    """Return the fraction of positives scored above the k-th best negative.

    Args:
        preds: 1-D array of predicted scores (higher means more likely positive).
        true_labels: 1-D array of the same length; entries equal to 1 mark
            positives, every other value marks a negative.
        k: rank cut-off; must be at least 1.

    Returns:
        A float in [0, 1]. Returns 0.0 when there are no positives and 1.0
        when there are fewer than `k` negatives (every positive is then
        inside the top k).

    Raises:
        ValueError: if `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    scores = np.asarray(preds, dtype=float).ravel()
    labels = np.asarray(true_labels).ravel()

    pos_scores = scores[labels == 1]
    neg_scores = scores[labels != 1]

    if pos_scores.size == 0:
        return 0.0
    if neg_scores.size < k:
        return 1.0

    # Score of the k-th highest negative; partition avoids a full sort.
    kth_neg_score = np.partition(neg_scores, -k)[-k]
    return float(np.mean(pos_scores > kth_neg_score))

# Best value seen so far for each metric (each metric is tracked independently).
max_val_roc_auc = -np.inf
max_val_mrr = -np.inf
max_val_hits5 = -np.inf

max_test_roc_auc = -np.inf
max_test_mrr = -np.inf
max_test_hits5 = -np.inf

# Validate on the held-out positive edges of the validation split.
# After RandomLinkSplit, graph_val.edge_index holds the message-passing
# (training) edges; the held-out edges are in edge_label_index, with
# edge_label == 1 marking the positives.
val_pos_edge_index = graph_val.edge_label_index[:, graph_val.edge_label == 1]

for epoch in range(150):
    loss = train()
    val_roc_auc, val_mrr, val_hits_at_5 = evaluate(val_pos_edge_index, graph_val)
    print(f"Epoch: {epoch + 1}, Loss: {loss:.4f}, Val ROC-AUC: {val_roc_auc:.10f}, Val MRR: {val_mrr:.10f}, Val Hits@5: {val_hits_at_5}")
    max_val_roc_auc = max(max_val_roc_auc, val_roc_auc)
    max_val_mrr = max(max_val_mrr, val_mrr)
    max_val_hits5 = max(max_val_hits5, val_hits_at_5)

## Evaluate model

In [None]:
# Score the trained model on the held-out positive edges of each split.
# After RandomLinkSplit, a split's edge_index holds only message-passing
# edges; the held-out edges are in edge_label_index, with edge_label == 1
# marking the positives.
val_eval_edge_index = graph_val.edge_label_index[:, graph_val.edge_label == 1]
test_eval_edge_index = graph_test.edge_label_index[:, graph_test.edge_label == 1]

val_roc_auc, val_mrr, val_hits5 = evaluate(val_eval_edge_index, graph_val)
test_roc_auc, test_mrr, test_hits5 = evaluate(test_eval_edge_index, graph_test)

# The test metrics are computed once, so each "maximum" is that single value.
max_test_roc_auc = max(max_test_roc_auc, test_roc_auc)
max_test_mrr = max(max_test_mrr, test_mrr)
max_test_hits5 = max(max_test_hits5, test_hits5)

print(f"Maximum Validation ROC-AUC: {max_val_roc_auc:.10f}")
print(f"Maximum Validation MRR: {max_val_mrr:.10f}")
print(f"Maximum Validation Hits@5: {max_val_hits5:.10f}")

print(f"Maximum Test ROC-AUC: {max_test_roc_auc:.10f}")
print(f"Maximum Test MRR: {max_test_mrr:.10f}")
print(f"Maximum Test Hits@5: {max_test_hits5:.10f}")