## Data Setup 

### Libraries

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyarrow.parquet as pq


from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init

import torch_sparse

import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.utils import to_undirected, negative_sampling, subgraph
from torch_geometric.transforms import RandomLinkSplit

import networkx as nx
from ogb.io import DatasetSaver
from ogb.linkproppred import LinkPropPredDataset

In [2]:
# Record the library versions this notebook was run with.
print(
    f"PyTorch version: {torch.__version__}",
    f"PyTorch Geometric version: {torch_geometric.__version__}",
    sep="\n",
)

PyTorch version: 2.0.0+cu118
PyTorch Geometric version: 2.3.1


In [3]:
# Choose the compute device once; `device` is bound in both branches so later
# cells can use it whether or not a GPU is present.
if torch.cuda.is_available():
    device = torch.device("cuda")          # Current CUDA device
    print(f"Using {torch.cuda.get_device_name()} ({device})")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Number of CUDA devices: {torch.cuda.device_count()}")
else:
    # Fall back to the CPU instead of leaving `device` undefined.
    device = torch.device("cpu")
    print("CUDA is not available on this device.")

Using NVIDIA GeForce RTX 3060 Ti (cuda)
CUDA version: 11.8
Number of CUDA devices: 1


### Load files

In [7]:
# Explicit dtype for every column of the gene annotation CSV.
# NOTE(review): NrCohorts/NrSamples use plain `int`; pandas raises while
# parsing if either column contains missing values - confirm they never do.
data_types = dict(
    Pvalue=float,
    SNP=str,
    SNPChr=str,
    SNPPos=float,
    AssessedAllele=str,
    OtherAllele=str,
    Zscore=float,
    Gene=str,
    GeneSymbol=str,
    GeneChr=str,
    GenePos=float,
    NrCohorts=int,
    NrSamples=int,
    FDR=float,
    BonferroniP=float,
    GeneStart=float,
    GeneEnd=float,
)

gene_data = pd.read_csv(
    "~/Desktop/geometric-omics/data/sig-combined-with-genes.csv",
    dtype=data_types,
)

In [8]:
%%time
# Read the Parquet files into Pandas DataFrames
trans = pd.read_parquet('~/Desktop/geometric-omics/data/big-trans.parquet')
cis = pd.read_parquet('~/Desktop/geometric-omics/data/big-cis.parquet')

sig_trans = pd.read_parquet('~/Desktop/geometric-omics/data/sig-trans.parquet')
sig_cis = pd.read_parquet('~/Desktop/geometric-omics/data/sig-cis.parquet')

CPU times: total: 4min 38s
Wall time: 5min 46s


In [10]:
# Report the number of rows in each loaded DataFrame
for label, frame in (
    ("trans length:", trans),
    ("cis length:", cis),
    ("sig_trans length:", sig_trans),
    ("sig_cis length:", sig_cis),
):
    print(label, len(frame))

trans length: 203547746
cis length: 127341798
sig_trans length: 59786
sig_cis length: 10507664


### Flag significant associations (`Sig` column)

In [None]:
%%time
# For trans_gene DataFrame
trans_gene['Sig'] = trans_gene.apply(lambda row: 1 if row.name in sig_trans.index and row.equals(sig_trans.loc[row.name]) else 0, axis=1)

# For cis_gene DataFrame
cis_gene['Sig'] = cis_gene.apply(lambda row: 1 if row.name in sig_cis.index and row.equals(sig_cis.loc[row.name]) else 0, axis=1)

### Map gene start/end coordinates onto the DataFrames

In [None]:
def add_gene_start_end(df, gene_data):
    """Attach each gene's start/end values to ``df`` and drop rows without them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Gene`` column; it is not modified.
    gene_data : pandas.DataFrame
        Table with ``Gene``, ``GeneStart`` and ``GeneEnd`` columns. A gene is
        used only if none of its rows has a missing ``GeneStart`` or
        ``GeneEnd``; the values of its first row are taken.

    Returns
    -------
    pandas.DataFrame
        New frame with ``GeneStart`` and ``GeneEnd`` set from ``gene_data``
        (replacing columns of the same name if present), restricted to the
        rows whose gene has both values. The original index labels are kept.
    """
    # Rows without a gene identifier cannot be looked up.
    coords = gene_data[['Gene', 'GeneStart', 'GeneEnd']].dropna(subset=['Gene'])

    # Skip a gene entirely if any of its rows lacks a start or an end.
    incomplete = coords[['GeneStart', 'GeneEnd']].isna().any(axis=1)
    genes_with_gaps = coords.loc[incomplete, 'Gene'].unique()
    coords = coords[~coords['Gene'].isin(genes_with_gaps)]

    # One lookup row per gene: the first one in gene_data order.
    lookup = coords.drop_duplicates(subset='Gene', keep='first').set_index('Gene')

    # Vectorised lookup; genes absent from the table map to NaN.
    annotated = df.assign(
        GeneStart=df['Gene'].map(lookup['GeneStart']),
        GeneEnd=df['Gene'].map(lookup['GeneEnd']),
    )

    # Drop rows with missing GeneStart or GeneEnd values
    return annotated.dropna(subset=['GeneStart', 'GeneEnd'])

In [None]:
%%time
# Add GeneStart/GeneEnd to both frames via add_gene_start_end and keep only
# the rows for which both values were found.
# NOTE(review): each frame is read before it is reassigned, so trans_gene and
# cis_gene must already exist when this cell runs.
trans_gene = add_gene_start_end(trans_gene, gene_data)
cis_gene = add_gene_start_end(cis_gene, gene_data)

In [None]:
# Show the column labels of each annotated frame
for frame in (cis_gene, trans_gene):
    print(frame.columns)

In [None]:
# Row counts of the annotated frames, followed by the frames loaded from Parquet
for frame in (cis_gene, trans_gene, cis, trans):
    print(len(frame))

### To CSV

In [None]:
%%time
trans_gene.to_csv('big-trans-genes.csv')
cis_gene.to_csv('big-cis-genes.csv')