In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import polars as pl
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, Ridge
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
from tqdm import tqdm

In [3]:
# Dataset to analyse; every path below is resolved relative to its results folder.
dataset = "omim_matched_9"
base_dir = f"../../results/dataset/{dataset}"

# One row per variant, read from the dataset's test.parquet file.
variants_path = f"{base_dir}/test.parquet"
V = pd.read_parquet(variants_path)
V

Unnamed: 0,chrom,pos,ref,alt,OMIM,consequence,label,tss_dist,match_group
0,1,1425822,C,G,,PLS,False,48,PLS_4
1,1,1615869,C,T,,PLS,False,35,PLS_0
2,1,1659060,G,A,,PLS,False,47,PLS_4
3,1,2050958,T,C,,5_prime_UTR_variant,False,149,5_prime_UTR_variant_7
4,1,2074688,G,A,,5_prime_UTR_variant,False,0,5_prime_UTR_variant_2
...,...,...,...,...,...,...,...,...,...
3865,X,155613005,C,T,,PLS,False,52,PLS_62
3866,X,155719093,C,A,,5_prime_UTR_variant,False,4,5_prime_UTR_variant_105
3867,X,155881342,A,C,,PLS,False,2,PLS_68
3868,X,155881414,C,T,,5_prime_UTR_variant,False,35,5_prime_UTR_variant_115


In [4]:
# Feature tables to attach to the variant table. Each name maps to
# {base_dir}/features/<name>.parquet.
feature_names = [
    "GPN-MSA_LLR",
    "GPN-MSA_InnerProducts",
]

# Remember the row count so we can detect an index misalignment after joining.
n_variants = len(V)

# Load every feature table first, prefixing its columns with the table name
# so columns coming from different tables cannot collide.
feature_frames = []
for features in feature_names:
    df = pd.read_parquet(f"{base_dir}/features/{features}.parquet")
    df = df.add_prefix(f"{features}_")
    assert len(df) == n_variants, (
        f"{features}: expected {n_variants} rows, got {len(df)}"
    )
    feature_frames.append(df)

# Flat list of all feature column names, in the order they are attached.
all_features = [col for frame in feature_frames for col in frame.columns]

# Re-running this cell used to append a second copy of every feature column,
# because V was reassigned from itself. Drop any copies left over from a
# previous run, then attach all tables with a single concat.
V = pd.concat(
    [V.drop(columns=all_features, errors="ignore"), *feature_frames],
    axis=1,
)

# concat(axis=1) outer-joins on the index; extra rows mean the feature tables
# were not aligned with the variant table.
assert len(V) == n_variants, "feature tables are not index-aligned with V"
V

Unnamed: 0,chrom,pos,ref,alt,OMIM,consequence,label,tss_dist,match_group,GPN-MSA_LLR_score,...,GPN-MSA_InnerProducts_embedding_758,GPN-MSA_InnerProducts_embedding_759,GPN-MSA_InnerProducts_embedding_760,GPN-MSA_InnerProducts_embedding_761,GPN-MSA_InnerProducts_embedding_762,GPN-MSA_InnerProducts_embedding_763,GPN-MSA_InnerProducts_embedding_764,GPN-MSA_InnerProducts_embedding_765,GPN-MSA_InnerProducts_embedding_766,GPN-MSA_InnerProducts_embedding_767
0,1,1425822,C,G,,PLS,False,48,PLS_4,3.439453,...,128.732819,159.598480,74.421951,73.623795,38.545086,73.916550,142.523422,145.709167,58.040916,158.960846
1,1,1615869,C,T,,PLS,False,35,PLS_0,0.529785,...,147.545822,171.706024,46.736801,43.170914,42.430355,54.364761,158.388382,71.366440,41.952412,157.829102
2,1,1659060,G,A,,PLS,False,47,PLS_4,-0.802246,...,88.503456,236.405533,35.817936,73.384224,48.623886,55.552979,62.625595,68.485321,53.198612,119.029610
3,1,2050958,T,C,,5_prime_UTR_variant,False,149,5_prime_UTR_variant_7,1.386719,...,92.255478,253.419617,60.395531,50.736336,29.582539,90.220230,90.312042,96.651627,31.374359,154.576050
4,1,2074688,G,A,,5_prime_UTR_variant,False,0,5_prime_UTR_variant_2,0.347168,...,62.511314,223.224915,85.186905,95.258102,47.534737,63.498165,70.662926,192.659454,33.813946,167.253983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3865,X,155613005,C,T,,PLS,False,52,PLS_62,1.066406,...,83.412796,218.358795,57.655621,71.613922,37.866199,70.055969,123.362350,97.474365,58.502895,160.284714
3866,X,155719093,C,A,,5_prime_UTR_variant,False,4,5_prime_UTR_variant_105,0.505859,...,107.956505,508.437225,47.225998,240.112335,107.949043,108.419098,195.913254,287.693695,270.265503,65.540878
3867,X,155881342,A,C,,PLS,False,2,PLS_68,-3.859375,...,81.035919,113.946785,69.781052,73.995590,53.434326,80.101067,151.518570,93.543678,81.852676,143.471146
3868,X,155881414,C,T,,5_prime_UTR_variant,False,35,5_prime_UTR_variant_115,-1.322266,...,59.554832,183.954376,78.128311,61.767647,42.667374,56.545998,114.332726,124.544868,47.406349,160.134476


In [35]:
# Chromosome-level split: odd-numbered autosomes (1, 3, ..., 21) plus X go to
# the training set; every other chromosome is held out for testing.
train_chroms = {str(number) for number in range(1, 23, 2)} | {"X"}
V["train_mask"] = V["chrom"].isin(train_chroms)

V_train = V.loc[V["train_mask"]]
V_test = V.loc[~V["train_mask"]]
len(V_train), len(V_test)

(2900, 970)

In [36]:
# Design matrices use only the attached feature columns; the target is the
# boolean "label" column.
X_train, y_train = V_train[all_features], V_train["label"]
X_test, y_test = V_test[all_features], V_test["label"]

In [33]:
# L1-penalised logistic regression on mean-imputed, standardised features.
# Other settings tried by hand: C=1e-3 and C=1, and
# penalty="elasticnet" with l1_ratio=0.5.
mean_imputer = SimpleImputer(
    missing_values=np.nan,
    strategy="mean",
    # Keep all-NaN columns so the coefficient vector stays aligned with
    # X_train.columns below.
    keep_empty_features=True,
)
l1_logreg = LogisticRegression(
    penalty="l1",
    solver="saga",
    C=2.7e-3,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)
clf = Pipeline(steps=[
    ("imputer", mean_imputer),
    ("scaler", StandardScaler()),
    ("linear", l1_logreg),
])
clf.fit(X_train, y_train)

# Pull out the fitted linear step and rank features by absolute coefficient.
linear = clf["linear"]
coef = (
    pd.DataFrame({"feature": X_train.columns, "coef": linear.coef_[0]})
    .sort_values("coef", ascending=False, key=abs)
)
print(coef.head(10))
# Number of features the L1 penalty left with a non-zero coefficient.
print(f"{(coef.coef != 0).sum()=}")

                                 feature      coef
6      GPN-MSA_InnerProducts_embedding_5  0.024365
0                      GPN-MSA_LLR_score  0.000000
517  GPN-MSA_InnerProducts_embedding_516  0.000000
507  GPN-MSA_InnerProducts_embedding_506  0.000000
508  GPN-MSA_InnerProducts_embedding_507  0.000000
509  GPN-MSA_InnerProducts_embedding_508  0.000000
510  GPN-MSA_InnerProducts_embedding_509  0.000000
511  GPN-MSA_InnerProducts_embedding_510  0.000000
512  GPN-MSA_InnerProducts_embedding_511  0.000000
513  GPN-MSA_InnerProducts_embedding_512  0.000000
(coef.coef != 0).sum()=1


In [34]:
# Results recorded by hand from earlier runs.
# Columns: C, n_selected (presumably the non-zero coefficient count), AUPRC.
#
# train on odd:
#   C=1e-3 -> n_selected=1,   AUPRC=0.541
#   C=1    -> n_selected=713, AUPRC=0.645
#
# train on even:
#   C=1    -> n_selected=561, AUPRC=0.53

# AUPRC of the fitted pipeline on the held-out chromosomes, scored with the
# predicted probability of the second class (column 1 of predict_proba).
test_scores = clf.predict_proba(X_test)[:, 1]
average_precision_score(y_test, test_scores)

0.39916359390050987

In [24]:
# Baseline: score held-out variants with the negated GPN-MSA LLR alone, so
# that lower (more negative) LLR ranks a variant higher.
llr_baseline_scores = -V_test["GPN-MSA_LLR_score"]
average_precision_score(y_test, llr_baseline_scores)

0.6917278715099429