In [1]:
import sys
import os
import random
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, average_precision_score
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv, SAGEConv
from torch_geometric.utils import to_undirected, negative_sampling
import networkx as nx
from scipy.spatial import cKDTree
from scipy.special import expit
from typing import List, Dict
import time
import cProfile
import pstats
import io
import category_encoders as ce
from itertools import combinations
from collections import Counter
from torch_geometric.transforms import RandomNodeSplit

# Print versions of imported libraries
print(f"Python version: {sys.version}")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Matplotlib version: {matplotlib.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}")
print(f"Torch version: {torch.__version__}")
print(f"Torch Geometric version: {torch_geometric.__version__}")
print(f"NetworkX version: {nx.__version__}")

if torch.cuda.is_available():
    device = torch.device("cuda")  # Current CUDA device
    print(f"Using {torch.cuda.get_device_name()} ({device})")
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Number of CUDA devices: {torch.cuda.device_count()}")
else:
    print("CUDA is not available on this device.")

Python version: 3.11.5 (tags/v3.11.5:cce6ba9, Aug 24 2023, 14:38:34) [MSC v.1936 64 bit (AMD64)]
NumPy version: 1.24.1
Pandas version: 2.1.0
Matplotlib version: 3.7.2
Scikit-learn version: 1.3.0
Torch version: 2.0.1+cu117
Torch Geometric version: 2.3.1
NetworkX version: 3.0
Using NVIDIA RTX A6000 (cuda)
CUDA version: 11.7
Number of CUDA devices: 2


## Spec

### Data

`data` Pandas DataFrame:

- `#chrom`: chromosome of SNP (int).
- `id`: the ID of the variant in the following format: `#chrom:pos:ref:alt` (string).
- `pos`: position of the genetic variant on the chromosome (int).
- `ref`: reference allele (or variant) at the genomic position (string).
- `alt`: alternate allele observed at this position (string).
- `gene_0` to `gene_21`: genes which are nearest to the variant (string).
- `mlogp`: minus log of the p-value, commonly used in genomic studies (float).
- `beta`: beta coefficient represents the effect size of the variant (float).
- `sebeta`: standard error of the beta coefficient (float).
- `af_alt`: allele frequency of the alternate variant in the general population (float).
- `af_alt_cases`: allele frequency of the alternate variant in the cases group (float).
- `af_alt_controls`: allele frequency of the alternate variant in the control group (float).
- `prob`: posterior probability of association (float).
- `lead_r2`: r2 value to a lead variant (the one with maximum PIP) in a credible set (float).
- `cs_99`: credible set to which the variant belongs to (int).
- `causal`: indicates causality of variant (1) or not (0) (int). 

### Task Overview

The objective is to design and implement a binary node classification GNN model to predict whether variants are causal (`causal=1`) or not (`causal=0`).

## Graph Creation

In [2]:
# Load and prepare the data
data = pd.read_parquet('gwas_fm_t2d.parquet')

# Process only chromosome 10 and 3
chroms = [3, 10, 12]

data = data[data['#chrom'].isin(chroms)]

## RF+Grid

In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import auc, roc_auc_score, roc_curve, precision_recall_curve
import category_encoders as ce

seed_value = 0
np.random.seed(seed_value)

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_value)
    
def get_unique_snps(data: pd.DataFrame) -> dict:
    return {snp: idx for idx, snp in enumerate(data['id'].unique())}

def preprocess_snp_features(data: pd.DataFrame, snp_to_idx: dict) -> pd.DataFrame:
    gene_cols = [f'gene_{i}' for i in range(22)]
    cols_to_extract = ['id', '#chrom', 'pos', 'ref', 'alt', 'mlogp', 'beta', 'sebeta', 
                       'af_alt', 'af_alt_cases', 'af_alt_controls', 'lead_r2', 'cs_99','prob'] + gene_cols
    
    snp_features = data.loc[data['id'].isin(snp_to_idx.keys()), cols_to_extract].set_index('id').sort_index()
    
    # Columns to be binary encoded
    categorical_cols = ['ref', 'alt'] + gene_cols
    
    # Applying Binary Encoder to the string columns
    encoder = ce.BinaryEncoder(cols=categorical_cols)
    snp_features = encoder.fit_transform(snp_features)
    
    snp_features = snp_features.fillna(0)
    return snp_features

# Assume data is your DataFrame
snp_to_idx = get_unique_snps(data)
snp_features = preprocess_snp_features(data, snp_to_idx)

# Split data into features and target variable
X = snp_features.values
y = data['causal'].values

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Create the Random Forest classifier
clf = RandomForestClassifier(random_state=42)

# Set up the parameter grid to seaerch
param_grid = {
    'n_estimators': [10, 50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8]
}

# Create Grid Search object and fit to data
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, scoring='roc_auc', verbose=2)
grid_search.fit(X_train, y_train)

# Get the best estimator
best_clf = grid_search.best_estimator_

# Make predictions on the testing data
y_pred = best_clf.predict(X_test)
y_pred_proba = best_clf.predict_proba(X_test)[:, 1]  # Get the probability of the positive class

# Evaluate the model using AUC-ROC and AUPRC
roc_auc = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
auprc = auc(recall, precision)

print(f'AUC-ROC: {roc_auc}')
print(f'AUPRC: {auprc}')


  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  

Fitting 5 folds for each of 320 candidates, totalling 1600 fits
AUC-ROC: 0.4177026001138736
AUPRC: 0.0011922561400336121


## Naive Bayes

In [4]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import auc, roc_auc_score, roc_curve, precision_recall_curve
import category_encoders as ce

seed_value = 0
np.random.seed(seed_value)

def get_unique_snps(data: pd.DataFrame) -> dict:
    return {snp: idx for idx, snp in enumerate(data['id'].unique())}

def preprocess_snp_features(data: pd.DataFrame, snp_to_idx: dict) -> pd.DataFrame:
    gene_cols = [f'gene_{i}' for i in range(22)]
    cols_to_extract = ['id', '#chrom', 'pos', 'ref', 'alt', 'mlogp', 'beta', 'sebeta', 
                       'af_alt', 'af_alt_cases', 'af_alt_controls', 'lead_r2', 'cs_99','prob'] + gene_cols
    
    snp_features = data.loc[data['id'].isin(snp_to_idx.keys()), cols_to_extract].set_index('id').sort_index()
    
    # Columns to be binary encoded
    categorical_cols = ['ref', 'alt'] + gene_cols
    
    # Applying Binary Encoder to the string columns
    encoder = ce.BinaryEncoder(cols=categorical_cols)
    snp_features = encoder.fit_transform(snp_features)
    
    snp_features = snp_features.fillna(0)
    return snp_features


# Assume data is your DataFrame
snp_to_idx = get_unique_snps(data)
snp_features = preprocess_snp_features(data, snp_to_idx)

X = snp_features.values
y = data['causal'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Create the Naive Bayes classifier
clf = GaussianNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
auprc = auc(recall, precision)

print(f'AUC-ROC: {roc_auc}')
print(f'AUPRC: {auprc}')


  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  

AUC-ROC: 0.4575464983867906
AUPRC: 0.022672090467823196


  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)


## SVM+Grid

In [6]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import auc, roc_auc_score, roc_curve, precision_recall_curve
import category_encoders as ce

# Set random seed
seed_value = 0
np.random.seed(seed_value)

def get_unique_snps(data: pd.DataFrame) -> dict:
    return {snp: idx for idx, snp in enumerate(data['id'].unique())}

# ... [Rest of the preprocessing functions remain the same]

# Assume data is your DataFrame
snp_to_idx = get_unique_snps(data)
snp_features = preprocess_snp_features(data, snp_to_idx)

X = snp_features.values
y = data['causal'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Create the SVM classifier
clf = SVC(probability=True, random_state=42)

# Set up the parameter grid to search
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Create Grid Search object and fit to data
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, scoring='roc_auc', verbose=2)
grid_search.fit(X_train, y_train)

best_clf = grid_search.best_estimator_

y_pred = best_clf.predict(X_test)
y_pred_proba = best_clf.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
auprc = auc(recall, precision)

print(f'AUC-ROC: {roc_auc}')
print(f'AUPRC: {auprc}')


  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  

Fitting 5 folds for each of 16 candidates, totalling 80 fits


KeyboardInterrupt: 

## KNN+Grid

In [7]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import auc, roc_auc_score, roc_curve, precision_recall_curve
import category_encoders as ce

# Set random seed
seed_value = 0
np.random.seed(seed_value)

def get_unique_snps(data: pd.DataFrame) -> dict:
    return {snp: idx for idx, snp in enumerate(data['id'].unique())}

# ... [Rest of the preprocessing functions remain the same]

# Assume data is your DataFrame
snp_to_idx = get_unique_snps(data)
snp_features = preprocess_snp_features(data, snp_to_idx)

X = snp_features.values
y = data['causal'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Create the KNN classifier
clf = KNeighborsClassifier()

# Set up the parameter grid to search
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Create Grid Search object and fit to data
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, scoring='roc_auc', verbose=2)
grid_search.fit(X_train, y_train)

best_clf = grid_search.best_estimator_

y_pred = best_clf.predict(X_test)
y_pred_proba = best_clf.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
auprc = auc(recall, precision)

print(f'AUC-ROC: {roc_auc}')
print(f'AUPRC: {auprc}')


  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  

Fitting 5 folds for each of 20 candidates, totalling 100 fits
AUC-ROC: 0.622295501992788
AUPRC: 0.1563136971399834


## XGBoost

In [8]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import auc, roc_auc_score, roc_curve, precision_recall_curve
import category_encoders as ce

# Set random seed
seed_value = 0
np.random.seed(seed_value)

def get_unique_snps(data: pd.DataFrame) -> dict:
    return {snp: idx for idx, snp in enumerate(data['id'].unique())}

# ... [Rest of the preprocessing functions remain the same]

# Assume data is your DataFrame
snp_to_idx = get_unique_snps(data)
snp_features = preprocess_snp_features(data, snp_to_idx)

X = snp_features.values
y = data['causal'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Create the XGBoost classifier
clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Set up the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1]
}

# Create Grid Search object and fit to data
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, scoring='roc_auc', verbose=2)
grid_search.fit(X_train, y_train)

best_clf = grid_search.best_estimator_

y_pred = best_clf.predict(X_test)
y_pred_proba = best_clf.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
auprc = auc(recall, precision)

print(f'AUC-ROC: {roc_auc}')
print(f'AUPRC: {auprc}')


  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  

Fitting 5 folds for each of 144 candidates, totalling 720 fits
AUC-ROC: 0.6217973049914595
AUPRC: 0.0020367770305682207


## Logistic Regression

In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import auc, roc_auc_score, roc_curve, precision_recall_curve
import category_encoders as ce

# Set random seed
seed_value = 0
np.random.seed(seed_value)

def get_unique_snps(data: pd.DataFrame) -> dict:
    return {snp: idx for idx, snp in enumerate(data['id'].unique())}

# ... [Rest of the preprocessing functions remain the same]

# Assume data is your DataFrame
snp_to_idx = get_unique_snps(data)
snp_features = preprocess_snp_features(data, snp_to_idx)

X = snp_features.values
y = data['causal'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Create the logistic regression classifier
clf = LogisticRegression(random_state=42, max_iter=1000)

# Set up the parameter grid to search
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l1', 'l2'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

# Create Grid Search object and fit to data
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, scoring='roc_auc', verbose=2)
grid_search.fit(X_train, y_train)

best_clf = grid_search.best_estimator_

y_pred = best_clf.predict(X_test)
y_pred_proba = best_clf.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
auprc = auc(recall, precision)

print(f'AUC-ROC: {roc_auc}')
print(f'AUPRC: {auprc}')


  elif pd.api.types.is_categorical_dtype(cols):
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  return pd.api.types.is_categorical_dtype(dtype)
  

Fitting 5 folds for each of 70 candidates, totalling 350 fits


105 fits failed out of a total of 350.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Windows\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Windows\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Windows\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1168, in fit
    solver = _check_so

AUC-ROC: 0.38437084835832225
AUPRC: 0.0012129098906465728




## TabTransformer

In [11]:
print(snp_features.columns)


Index(['#chrom', 'pos', 'ref_0', 'ref_1', 'ref_2', 'ref_3', 'ref_4', 'ref_5',
       'ref_6', 'ref_7', 'ref_8', 'alt_0', 'alt_1', 'alt_2', 'alt_3', 'alt_4',
       'alt_5', 'alt_6', 'alt_7', 'mlogp', 'beta', 'sebeta', 'af_alt',
       'af_alt_cases', 'af_alt_controls', 'lead_r2', 'cs_99', 'prob',
       'gene_0_0', 'gene_0_1', 'gene_0_2', 'gene_0_3', 'gene_0_4', 'gene_0_5',
       'gene_0_6', 'gene_0_7', 'gene_1_0', 'gene_1_1', 'gene_1_2', 'gene_1_3',
       'gene_1_4', 'gene_2_0', 'gene_3_0', 'gene_4_0', 'gene_5_0', 'gene_6_0',
       'gene_7_0', 'gene_8_0', 'gene_9_0', 'gene_10_0', 'gene_11_0',
       'gene_12_0', 'gene_13_0', 'gene_14_0', 'gene_15_0', 'gene_16_0',
       'gene_17_0', 'gene_18_0', 'gene_19_0', 'gene_20_0', 'gene_21_0'],
      dtype='object')


In [17]:
import pandas as pd
import numpy as np
import time
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, roc_curve, auc
import pytorch_tabular
from pytorch_tabular.models.tab_transformer import TabTransformerConfig
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
import category_encoders as ce

warnings.filterwarnings("ignore")

# Set random seed
seed_value = 0
np.random.seed(seed_value)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_value)

def get_unique_snps(data: pd.DataFrame) -> dict:
    return {snp: idx for idx, snp in enumerate(data['id'].unique())}

# Preprocessing
snp_to_idx = get_unique_snps(data)
snp_features = preprocess_snp_features(data, snp_to_idx)

# Splitting data 
X = snp_features
y = data['causal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# Encode labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# Convert datasets to DataFrames
train_df = pd.concat([X_train, pd.Series(y_train_encoded, name='causal', index=X_train.index)], axis=1)
test_df = pd.concat([X_test, pd.Series(y_test_encoded, name='causal', index=X_test.index)], axis=1)

# Extract categorical columns and continuous columns from the data
continuous_cols = list(snp_features.columns)

# Configurations
data_config = DataConfig(
    target=['causal'],
    continuous_cols=continuous_cols,
    categorical_cols=[]
)
trainer_config = TrainerConfig(
    auto_lr_find=True,
    batch_size=1024,
    max_epochs=100_000,
    gpus=1 if torch.cuda.is_available() else 0,
    gradient_clip_val=1,
    early_stopping_patience=100
)
optimizer_config = OptimizerConfig()

# TabTransformer Model Configuration
model_config = TabTransformerConfig(
    task="classification",
    num_heads=8,
    num_attn_blocks=8,
    transformer_head_dim=512,
    share_embedding=True,
    share_embedding_strategy='fraction',
    shared_embedding_fraction=0.5,
    attn_dropout=0.2,
    add_norm_dropout=0.2,
    ff_dropout=0.2,
    embedding_dropout=0.2,
    batch_norm_continuous_input=True
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config
)

# Training
tabular_model.fit(train=train_df, test=test_df)

# Getting predictions on the test dataset
test_predictions = tabular_model.predict(test_df)

# Extracting the predicted probabilities for the positive class
test_pred_proba = test_predictions['1_probability'].values

# ROC-AUC Score
roc_auc = roc_auc_score(y_test_encoded, test_pred_proba)
fpr, tpr, _ = roc_curve(y_test_encoded, test_pred_proba)
roc_auc_val = auc(fpr, tpr)

# AUPRC Score
precision, recall, _ = precision_recall_curve(y_test_encoded, test_pred_proba)
auprc = auc(recall, precision)

print(f"ROC-AUC Score on Test Data: {roc_auc_val:.8f}")
print(f"AUPRC Score on Test Data: {auprc:.8f}")


Global seed set to 42


Auto select gpus: [0]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.030199517204020192
Restoring states from the checkpoint path at C:\Users\Windows\Desktop\Research\PhD\GeoGWAS\FinnGen\notebooks\causal\.lr_find_85ac5d39-b803-4f21-9fe4-24ae310ed935.ckpt
Restored all states from the checkpoint file at C:\Users\Windows\Desktop\Research\PhD\GeoGWAS\FinnGen\notebooks\causal\.lr_find_85ac5d39-b803-4f21-9fe4-24ae310ed935.ckpt


You are using a CUDA device ('NVIDIA RTX A6000') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Output()

Output()

ROC-AUC Score on Test Data: 0.30356804
AUPRC Score on Test Data: 0.00114668
