# Data

## Load libraries

In [1]:
import sys
import os
import random
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score
from sklearn.preprocessing import RobustScaler, LabelEncoder, StandardScaler, OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.utils import to_undirected, negative_sampling
import networkx as nx
from scipy.spatial import cKDTree
from scipy.special import expit
from typing import List, Dict
import time
import cProfile
import pstats
import io
import category_encoders as ce
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import copy
from torch_geometric.transforms import RandomNodeSplit
from collections import Counter
from category_encoders import BinaryEncoder
import cProfile
import pstats
import io



# Print versions of imported libraries
print(f"Python version: {sys.version}")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Matplotlib version: {matplotlib.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}")
print(f"Torch version: {torch.__version__}")
print(f"Torch Geometric version: {torch_geometric.__version__}")
print(f"NetworkX version: {nx.__version__}")

if torch.cuda.is_available():
    device = torch.device("cuda")          # Current CUDA device
    print(f"Using {torch.cuda.get_device_name()} ({device})")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Number of CUDA devices: {torch.cuda.device_count()}")
else:
    print("CUDA is not available on this device.")

Python version: 3.11.4 (tags/v3.11.4:d2340ef, Jun  7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]
NumPy version: 1.24.1
Pandas version: 1.5.3
Matplotlib version: 3.7.1
Scikit-learn version: 1.3.0
Torch version: 2.0.1+cu117
Torch Geometric version: 2.3.1
NetworkX version: 3.0
Using NVIDIA RTX A6000 (cuda)
CUDA version: 11.7
Number of CUDA devices: 2


## Load data

In [2]:
dtypes = {
    'id': 'string',
    '#chrom': 'int64',
    'pos': 'int64',
    'ref': 'string',
    'alt': 'string',
    'rsids': 'string',
    'nearest_genes': 'string',
    'pval': 'float64',
    'mlogp': 'float64',
    'beta': 'float64',
    'sebeta': 'float64',
    'af_alt': 'float64',
    'af_alt_cases': 'float64',
    'af_alt_controls': 'float64',
    'finemapped': 'int64',
    'causal': 'int64',
    'trait': 'string'
}

data = pd.read_csv('gwas-fine-causal.csv', dtype=dtypes)

# Assert column names
expected_columns = ['#chrom', 'pos', 'ref', 'alt', 'rsids', 'nearest_genes', 'pval', 'mlogp', 'beta',
                    'sebeta', 'af_alt', 'af_alt_cases', 'af_alt_controls', 'finemapped',
                    'id', 'causal', 'trait']
assert set(data.columns) == set(expected_columns), "Unexpected columns in the data DataFrame."

# Assert data types
expected_dtypes = {
    'id': 'string',
    '#chrom': 'int64',
    'pos': 'int64',
    'ref': 'string',
    'alt': 'string',
    'rsids': 'string',
    'nearest_genes': 'string',
    'pval': 'float64',
    'mlogp': 'float64',
    'beta': 'float64',
    'sebeta': 'float64',
    'af_alt': 'float64',
    'af_alt_cases': 'float64',
    'af_alt_controls': 'float64',
    'finemapped': 'int64',
    'causal': 'int64',
    'trait': 'string'
}

for col, expected_dtype in expected_dtypes.items():
    assert data[col].dtype == expected_dtype, f"Unexpected data type for column {col}."

In [3]:
# Check for total number of null values in each column
null_counts = data.isnull().sum()

print("Total number of null values in each column:")
print(null_counts)

Total number of null values in each column:
#chrom                   0
pos                      0
ref                      0
alt                      0
rsids              1366396
nearest_genes       727855
pval                     0
mlogp                    0
beta                     0
sebeta                   0
af_alt                   0
af_alt_cases             0
af_alt_controls          0
id                       0
finemapped               0
causal                   0
trait                    0
dtype: int64


## Data manipulation

In [4]:
#data = data.sample(frac=0.05, random_state=42)
len(data)

20170006

### Find nearest gene

In [5]:
data['nearest_genes'] = data['nearest_genes'].astype(str)

# Assert column 'nearest_genes' is a string
assert data['nearest_genes'].dtype == 'object', "Column 'nearest_genes' is not of string type."

# Get the length of the data before transformation
original_length = len(data)

# Extract the first gene name from the 'nearest_genes' column
data['nearest_genes'] = data['nearest_genes'].str.split(',').str[0]

# Reset index to have a standard index
data = data.reset_index(drop=True)

# Assert the length of the data remains the same
assert len(data) == original_length, "Length of the data has changed after transformation."

## Spec

### Data

`data` Pandas DataFrame:

- `id`: This column represents the id of the variant in the following format: #chrom:pos:ref:alt (string).
- `#chrom`: This column represents the chromosome number where the genetic variant is located (int).
- `pos`: This is the position of the genetic variant on the chromosome (int: 1-200,000).
- `ref`: This column represents the reference allele (or variant) at the genomic position.
- `alt`: This is the alternate allele observed at this position.
- `rsids`: This stands for reference SNP cluster ID. It's a unique identifier for each variant used in the dbSNP database.
- `nearest_genes`: This column represents the gene which is nearest to the variant (string).
- `pval`: This represents the p-value, which is a statistical measure for the strength of evidence against the null hypothesis.
- `mlogp`: This represents the minus log of the p-value, commonly used in genomic studies.
- `beta`: The beta coefficient represents the effect size of the variant.
- `sebeta`: This is the standard error of the beta coefficient.
- `af_alt`: This is the allele frequency of the alternate variant in the general population (float: 0-1.
- `af_alt_cases`: This is the allele frequency of the alternate variant in the cases group (float: 0-1).
- `af_alt_controls`: This is the allele frequency of the alternate variant in the control group (float: 0-1).
- `finemapped`: This column represents whether the variant is included in the post-finemapped dataset (1) or not (0) (int).
- `trait`: This column represents the trait associated with the variant. In this dataset, it is the response to the drug paracetamol and NSAIDs.


### Nodes and Their Features

There is one type of node: SNP nodes.

- **SNP Nodes**: Each SNP Node is characterized by various features, including `id`, `nearest_genes`, `#chrom`, `pos`, `ref`, `alt`, `mlogp`, `beta`, `sebeta`,  `af_alt`, `af_alt_cases`, and `af_alt_controls` columns.

### Edges, Their Features, and Labels

Edges represent relationships between SNP nodes in the graph:

- For each pair of SNPs (row1 and row2) that exist on the same chromosome (`#chrom`), an edge is created if the absolute difference between their positions (`pos`) is less than or equal to 1,000,000 and greater than 1 (no loops). Create edges between all pairs of SNPs within the 1,000,000 base distance threshold. The edge weight is determined by the following formula:
     
```
weights = 1 * e^(-ln(2) / 100_000 * pos_diff_abs)
```

# pyg

## pyg creation

In [6]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import time
from sklearn.preprocessing import RobustScaler
import cProfile, pstats, io
from scipy.spatial import distance

## Graph stats

# nx 

## nx creation

In [7]:
import math
import networkx as nx
from scipy.spatial import distance
import numba
import cupy as cp
from numba import cuda

### GPU-enhanced

In [8]:
%%time

def create_graph(data):
    data.sort_values(['#chrom', 'pos'], inplace=True)
    
    G = nx.Graph()
    
    nodes = data.set_index('id')[['nearest_genes', 'mlogp', 'beta', 'sebeta', 'af_alt', 'af_alt_cases', 'af_alt_controls']].to_dict('index')
    G.add_nodes_from(nodes.items())

    def calculate_weights(pos_diffs):
        mask = (pos_diffs > 1) & (pos_diffs <= 300_000)
        indices = cp.argwhere(mask)  # indices where condition holds
        unique_indices = indices[indices[:, 0] < indices[:, 1]]  # indices where first index < second index
        if unique_indices.size > 0:  # check if there are unique_indices
            unique_pos_diffs = pos_diffs[unique_indices[:, 0], unique_indices[:, 1]]
            return unique_indices, cp.exp(-cp.log(2) / 100_000 * unique_pos_diffs)
        else:
            return cp.array([]), cp.array([])

    # Divide the data into 2 halves for multi-GPU computation
    halves = [data[data['#chrom'] <= data['#chrom'].median()], data[data['#chrom'] > data['#chrom'].median()]]
    
    for device, data_half in enumerate(halves):
        with cp.cuda.Device(device):  # Specify the device
            for chrom, group in data_half.groupby('#chrom'):
                ids = group['id'].values
                pos = cp.asarray(group['pos'].values, dtype=cp.float32)  # Use float32 for reduced memory
                
                chunk_size = 37_000
                overlap = 2_000 
                num_chunks = math.ceil(len(pos) / chunk_size)
                
                for chunk in range(num_chunks):
                    start_idx = max(0, chunk * chunk_size - overlap)
                    end_idx = min((chunk + 1) * chunk_size + overlap, len(pos))
                    
                    chunk_pos = pos[start_idx:end_idx]
                    chunk_pos_diffs = cp.empty((len(chunk_pos), len(chunk_pos)), dtype=cp.float32)
                    chunk_pos_diffs -= chunk_pos[:, None]  # Compute difference in-place
                    chunk_pos_diffs = cp.abs(chunk_pos_diffs)

                    unique_indices, unique_weights = calculate_weights(chunk_pos_diffs)
                    unique_weights = cp.asnumpy(unique_weights)
                    unique_indices = cp.asnumpy(unique_indices)  # conversion to NumPy array

                    if unique_indices.size > 0:
                        edges = [(ids[unique_indices[i, 0]], ids[unique_indices[i, 1]], unique_weights[i]) for i in range(unique_indices.shape[0])]
                        G.add_weighted_edges_from(edges)

                del ids, pos
                cp.cuda.Stream.null.synchronize()
                cp._default_memory_pool.free_all_blocks()
                group = None

    return G

nx_graph = create_graph(data)


CPU times: total: 35min 43s
Wall time: 36min 17s


### CPU-only

## Write nx 

## Read nx

## nx stats

In [9]:
%%time

# Print basic graph statistics
num_nodes = nx_graph.number_of_nodes()
num_edges = nx_graph.number_of_edges()

print("Number of nodes:", num_nodes)
print("Number of edges:", num_edges)

# Print number of connected components
num_components = nx.number_connected_components(nx_graph)
print("Number of connected components:", num_components)

from collections import Counter
import numpy as np

# Get degree distribution
degree_sequence = sorted([d for n, d in nx_graph.degree()], reverse=True)
degree_counts = Counter(degree_sequence)

# Print degree distribution statistics
degrees = np.array(list(degree_counts.keys()))
print("Degree distribution statistics:")
print("Minimum degree:", degrees.min())
print("Maximum degree:", degrees.max())
print("Average degree:", degrees.mean())

# Calculate edge weight statistics
edge_sum = 0
edge_count = 0
min_weight = float('inf')
max_weight = float('-inf')

for _, _, data in nx_graph.edges(data=True):
    if 'weight' in data:
        weight = float(data['weight'])
        edge_sum += weight
        edge_count += 1
        min_weight = min(min_weight, weight)
        max_weight = max(max_weight, weight)

average_weight = edge_sum / edge_count if edge_count > 0 else float('nan')

# Print edge weight statistics
print("Edge weight statistics:")
print("Minimum weight:", min_weight)
print("Maximum weight:", max_weight)
print("Average weight:", average_weight)

# Find the largest connected component
largest_component = max(nx.connected_components(nx_graph), key=len)

# Print the size of the largest connected component
print("Size of the largest connected component (nodes):", len(largest_component))
print("Size of the largest connected component (edges):", num_edges - (num_nodes - len(largest_component)))


Number of nodes: 20170006
Number of edges: 1065156421
Number of connected components: 19543996
Degree distribution statistics:
Minimum degree: 0
Maximum degree: 38999
Average degree: 3460.0434782608695
Edge weight statistics:
Minimum weight: 0.12500519871195762
Maximum weight: 0.9998267282181494
Average weight: 0.3296916121860768
Size of the largest connected component (nodes): 39000
Size of the largest connected component (edges): 1045025415
CPU times: total: 13min 58s
Wall time: 26min 21s


## Clustering

### Louvain Algorithm

#### Community Size Histogram

#### Degree Distribution per Community

#### Centrality measures per community

#### Community interconnectivity

#### Community contribution to overall network structure

#### Community chrom and min-max pos identification

#### Subgraph visualization

### k-clique communities

### Girvan-Newman

### Label Propagation Algorithm (LPA)

### Leading Eigenvector

### Walktrap

### Edge Betweenness

### Spectral Clustering

# nk 

In [13]:
import math
import networkit as nk
from scipy.spatial import distance
import numba
import cupy as cp
from numba import cuda

## nx -> nk

In [14]:
%%time

nk_graph = nk.nxadapter.nx2nk(nx_graph)

CPU times: total: 10min 28s
Wall time: 13min 40s


## Write nk

In [15]:
%%time

# Assuming nk_graph has been converted from a networkx graph
nk.writeGraph(nk_graph, 'nk_graph.networkit', nk.Format.NetworkitBinary)




CPU times: total: 2min 5s
Wall time: 2min 15s


## Read nk

In [None]:
%%time

# Assuming nk_graph has been written to a GraphML file
nk_graph = nk.readGraph('nk_graph.networkit', nk.Format.NetworkitBinary)

## Clustering

### PLM (Parallel Louvain)

In [16]:
%%time

# Choose and initialize algorithm
plmCommunities = nk.community.detectCommunities(nk_graph, algo=nk.community.PLM(nk_graph, True))

print("{0} elements assigned to {1} subsets".format(plmCommunities.numberOfElements(), plmCommunities.numberOfSubsets()))
print("the biggest subset has size {0}".format(max(plmCommunities.subsetSizes())))

  warn("networkit.Timer is deprecated, will be removed in future updates.")


Communities detected in 17.56860 [s]
solution properties:
-------------------  --------------
# communities            1.9544e+07
min community size       1
max community size   39000
avg. community size      1.03203
imbalance            19500
edge cut                 0
edge cut (portion)       0
modularity               0.926649
-------------------  --------------
20170006 elements assigned to 19543996 subsets
the biggest subset has size 39000
CPU times: total: 3min 52s
Wall time: 39 s


In [17]:
# Find index of largest subset
max_size = max(plmCommunities.subsetSizes())
max_index = plmCommunities.subsetSizes().index(max_size)

print(max_size)
print(max_index)

39000
0
