# Point Cloud Analysis

## Load libraries

In [1]:
import sys
import os
import random
import numpy as np
from numba import jit, prange
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score
from sklearn.preprocessing import RobustScaler, LabelEncoder, StandardScaler, OrdinalEncoder, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.utils import to_undirected, negative_sampling
import networkx as nx
from scipy.spatial import cKDTree
from scipy.special import expit
from typing import List, Dict
import time
import cProfile
import pstats
import io
import category_encoders as ce
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
import copy
from torch_geometric.transforms import RandomNodeSplit
from collections import Counter



# Print versions of imported libraries so the captured output records the
# environment the results were produced with.
print(f"Python version: {sys.version}")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Matplotlib version: {matplotlib.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}")
print(f"Torch version: {torch.__version__}")
print(f"Torch Geometric version: {torch_geometric.__version__}")
print(f"NetworkX version: {nx.__version__}")

# Select the compute device. `device` is assigned on BOTH branches so that
# later cells can reference it on CPU-only machines without a NameError.
if torch.cuda.is_available():
    device = torch.device("cuda")          # Current CUDA device
    print(f"Using {torch.cuda.get_device_name()} ({device})")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Number of CUDA devices: {torch.cuda.device_count()}")
else:
    device = torch.device("cpu")           # Fallback when no GPU is present
    print("CUDA is not available on this device.")

Python version: 3.11.4 (tags/v3.11.4:d2340ef, Jun  7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]
NumPy version: 1.24.1
Pandas version: 1.5.3
Matplotlib version: 3.7.1
Scikit-learn version: 1.3.0
Torch version: 2.0.1+cu117
Torch Geometric version: 2.3.1
NetworkX version: 3.0
Using NVIDIA RTX A6000 (cuda)
CUDA version: 11.7
Number of CUDA devices: 2


## Load data

In [2]:
# Column dtypes enforced while parsing the CSV. This is the single source of
# truth: the dtype assertions below are derived from it, so the two can never
# drift apart. ('trait' is not listed and is left to pandas' type inference.)
dtypes = {
    'id': 'string',
    '#chrom': 'int64',
    'pos': 'int64',
    'ref': 'string',
    'alt': 'string',
    'rsids': 'string',
    'nearest_genes': 'string',
    'pval': 'float64',
    'mlogp': 'float64',
    'beta': 'float64',
    'sebeta': 'float64',
    'af_alt': 'float64',
    'af_alt_cases': 'float64',
    'af_alt_controls': 'float64',
    'finemapped': 'int64'
}

# Location of the input file; change here to point at another copy.
DATA_PATH = '~/Desktop/GeoGWAS/FinnGen/data/gwas-finemap.csv'

data = pd.read_csv(DATA_PATH, dtype=dtypes)

# Assert column names (compared as sets, so column order does not matter)
expected_columns = ['#chrom', 'pos', 'ref', 'alt', 'rsids', 'nearest_genes', 'pval', 'mlogp', 'beta',
                    'sebeta', 'af_alt', 'af_alt_cases', 'af_alt_controls', 'finemapped',
                    'id', 'trait']
assert set(data.columns) == set(expected_columns), "Unexpected columns in the data DataFrame."

# Assert data types: every column with a declared dtype must have been parsed
# as exactly that dtype. A copy keeps `expected_dtypes` an independent object.
expected_dtypes = dict(dtypes)

for col, expected_dtype in expected_dtypes.items():
    assert data[col].dtype == expected_dtype, f"Unexpected data type for column {col}."

## Data manipulation

In [3]:
# Disabled: uncommenting the next line replaces `data` with a 5% random
# subsample (seed 42) — presumably for quicker experimentation; confirm before use.
#data = data.sample(frac=0.05, random_state=42)

In [4]:
# Distinct variant IDs on each chromosome, over all variants
snps_per_chrom = data.groupby('#chrom')['id'].nunique()

# Keep only the rows flagged as finemapped
finemapped_data = data[data['finemapped'].eq(1)]

# Distinct finemapped variant IDs on each chromosome
finemapped_counts = finemapped_data.groupby('#chrom')['id'].nunique()

# Share of finemapped variants per chromosome, in percent. The division aligns
# on the chromosome index; chromosomes absent from `finemapped_counts` come
# out as NaN and are reported as 0.
percentage_finemapped = finemapped_counts.div(snps_per_chrom).mul(100).fillna(0)

# Print the results: one heading followed by its Series, in order
for heading, series in (
    ("Number of SNPs per Chromosome:", snps_per_chrom),
    ("\nNumber of Finemapped SNPs per Chromosome:", finemapped_counts),
    ("\nPercentage of Finemapped SNPs per Chromosome:", percentage_finemapped),
):
    print(heading)
    print(series)

Number of SNPs per Chromosome:
#chrom
1     1527735
2     1668096
3     1395815
4     1395767
5     1264833
6     1255027
7     1135429
8     1083225
9      849406
10     971906
11     956229
12     925156
13     701351
14     637320
15     560161
16     624439
17     536114
18     550566
19     430361
20     441761
21     259164
22     272290
23     727855
Name: id, dtype: int64

Number of Finemapped SNPs per Chromosome:
#chrom
1     173381
2     182870
3     160831
4     203738
5      88348
6     243622
7     183372
8     120372
9     210862
10    165926
11    225123
12    197761
13    101524
14     77829
15     79464
16    107026
17    127416
18    122219
19     84577
20     90008
22     18619
23     92959
Name: id, dtype: int64

Percentage of Finemapped SNPs per Chromosome:
#chrom
1     11.348892
2     10.962798
3     11.522372
4     14.596849
5      6.984954
6     19.411694
7     16.150019
8     11.112373
9     24.824642
10    17.072227
11    23.542792
12    21.375963
13    14.475

### Find nearest gene

In [5]:
gene_col = 'nearest_genes'

# Convert the column to plain Python strings (object dtype).
# NOTE(review): astype(str) presumably also turns missing values into literal
# strings — confirm this is intended before relying on null checks downstream.
data[gene_col] = data[gene_col].astype(str)

# The column must now be an object-dtype (string) column
assert data[gene_col].dtype == 'object', "Column 'nearest_genes' is not of string type."

# Remember the row count so the transformation can be checked afterwards
original_length = len(data)

# The field is a comma-separated list; keep only its first entry
data[gene_col] = data[gene_col].str.split(',').str.get(0)

# Replace whatever index the frame carries with a fresh 0..n-1 index
data = data.reset_index(drop=True)

# The steps above must not add or drop rows
assert len(data) == original_length, "Length of the data has changed after transformation."

## Spec

### Data Description

The dataset is a Pandas DataFrame named `data`, which includes the following columns:

- `id`: Unique ID of the variant in the format #chrom:pos:ref:alt (string).
- `#chrom`: Chromosome number where the genetic variant is located.
- `pos`: Position of the genetic variant on the chromosome (positive integer; in this dataset values range from about 10 thousand to about 249 million).
- `ref`: Reference allele (or variant) at the genomic position.
- `alt`: Alternate allele observed at the genomic position.
- `rsids`: Reference SNP cluster ID, a unique identifier for each variant used in the dbSNP database.
- `nearest_genes`: Gene nearest to the variant (string).
- `pval`: P-value, a statistical measure for the strength of evidence against the null hypothesis.
- `mlogp`: Minus log of the p-value, commonly used in genomic studies.
- `beta`: Beta coefficient, representing the effect size of the variant.
- `sebeta`: Standard error of the beta coefficient.
- `af_alt`: Allele frequency of the alternate variant in the general population (float between 0 and 1).
- `af_alt_cases`: Allele frequency of the alternate variant in the cases group (float between 0 and 1).
- `af_alt_controls`: Allele frequency of the alternate variant in the control group (float between 0 and 1).
- `finemapped`: Indicator whether the variant is included in the post-finemapped dataset (1) or not (0) (integer).
- `trait`: Trait associated with the variant. In this dataset, it refers to the response to the drugs paracetamol and NSAIDs.

### Task Overview

The task is to predict whether a variant is included in the post-finemapped dataset (the binary `finemapped` label), using point-cloud dimensionality reduction and clustering techniques in combination with machine learning.

## Stats

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Unnamed: 0,#chrom,pos,pval,mlogp,beta,sebeta,af_alt,af_alt_cases,af_alt_controls,finemapped
count,20170010.0,20170010.0,20170010.0,20170010.0,20170010.0,20170010.0,20170010.0,20170010.0,20170010.0,20170010.0
mean,9.158641,78594380.0,0.4544099,0.5596207,9.619015e-05,0.1052461,0.117583,0.1175803,0.1175835,0.1516037
std,6.273152,55476480.0,0.3008589,0.7377075,0.2055397,0.1628265,0.2207685,0.2207494,0.2207725,0.3586363
min,1.0,10253.0,4.57088e-208,4.54967e-09,-28.4731,0.00549407,6.83689e-06,0.0,1.7938e-06,0.0
25%,4.0,33107010.0,0.182273,0.145785,-0.0276381,0.0114862,0.000644216,0.00064895,0.000643445,0.0
50%,8.0,69626110.0,0.438384,0.358145,-0.0001961835,0.0440978,0.00626732,0.00627537,0.00626507,0.0
75%,13.0,113934300.0,0.71485,0.739278,0.0262773,0.142107,0.114145,0.114154,0.114157,0.0
max,23.0,248945500.0,1.0,207.34,24.5392,12.247,0.999993,0.999999,0.999996,1.0


### Mann-Whitney U test

## TDA

## UMAP

### Mapping

### Classification + Mapping

#### Random Forest

In [7]:
%%time

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from umap import UMAP

# Filter numerical columns
numerical_cols = [
    'pos', 'mlogp', 'af_alt_cases', 'af_alt_controls'
]

# Filter data for chromosome 
data = data[data['#chrom'] == 2]

# Sort the data by 'pos' column
data.sort_values(by='pos', inplace=True)

X = data[numerical_cols]
y = data['finemapped']

# Split the data into training set and temp set (validation + test)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
) # leaving 40% for the validation and test sets

# Split the temp set into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
) # splitting the 40% evenly into validation and test sets, so they each are 20% of the original data

# Create a pipeline
pipeline = Pipeline([
    ('scaling', RobustScaler()), 
    ('umap', UMAP(n_components=4, random_state=42)), 
    ('rf', RandomForestClassifier(random_state=42))
])

# Train the pipeline on training set
pipeline.fit(X_train, y_train)

# Validate the model on validation set
y_val_pred = pipeline.predict(X_val)

# After model selection and tuning, finally test the model on the test set
y_test_pred = pipeline.predict(X_test)

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


CPU times: total: 10h 5min 45s
Wall time: 1h 16min 5s


In [8]:
%%time

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Make predictions on validation set
y_val_pred = pipeline.predict(X_val)
y_val_proba = pipeline.predict_proba(X_val)[:, 1]  # get probabilities for the positive class

# Print metrics for validation set
print("Validation Metrics:")
print(f"Accuracy: {accuracy_score(y_val, y_val_pred)}")
print(f"Precision: {precision_score(y_val, y_val_pred)}")
print(f"Recall: {recall_score(y_val, y_val_pred)}")
print(f"F1 Score: {f1_score(y_val, y_val_pred)}")
print(f"ROC AUC Score: {roc_auc_score(y_val, y_val_proba)}")

# Make predictions on test set
y_test_pred = pipeline.predict(X_test)
y_test_proba = pipeline.predict_proba(X_test)[:, 1]  # get probabilities for the positive class

# Print metrics for test set
print("\nTest Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Precision: {precision_score(y_test, y_test_pred)}")
print(f"Recall: {recall_score(y_test, y_test_pred)}")
print(f"F1 Score: {f1_score(y_test, y_test_pred)}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_test_proba)}")

# Masks for finemapped and not_finemapped
finemapped_mask_test = y_test == 1
not_finemapped_mask_test = y_test == 0

# Apply masks to y_test_pred
y_pred_finemapped_test = y_test_pred[finemapped_mask_test]
y_pred_not_finemapped_test = y_test_pred[not_finemapped_mask_test]

# Create masks for correctly predicted finemapped and not_finemapped
correct_finemapped_mask_test = y_pred_finemapped_test == 1
correct_not_finemapped_mask_test = y_pred_not_finemapped_test == 0

# Calculate accuracy for both groups on test set
accuracy_finemapped_test = sum(correct_finemapped_mask_test) / len(y_pred_finemapped_test)
accuracy_not_finemapped_test = sum(correct_not_finemapped_mask_test) / len(y_pred_not_finemapped_test)

print("\n")
print(f"Test Accuracy for finemapped nodes: {accuracy_finemapped_test}")
print(f"Test Accuracy for not finemapped nodes: {accuracy_not_finemapped_test}")
print("\n")

Validation Metrics:
Accuracy: 0.9568329509939492
Precision: 0.8491580229286524
Recall: 0.737185563251914
F1 Score: 0.7892199910231642
ROC AUC Score: 0.9288726994894057

Test Metrics:
Accuracy: 0.9570569310393062
Precision: 0.8497526620273329
Recall: 0.7389449892457439
F1 Score: 0.7904845471385396
ROC AUC Score: 0.9310159030253243


Test Accuracy for finemapped nodes: 0.7389449892457439
Test Accuracy for not finemapped nodes: 0.983912668773341


CPU times: total: 26min 46s
Wall time: 26min 42s


#### Plotting