In [None]:
import sys
# Extend the module search path so the `codes.*` imports further down can resolve
# (presumably the `codes` package lives under this directory — TODO confirm).
# NOTE(review): this is a hard-coded absolute path; the notebook only runs on a
# machine where this directory exists.
path = '/gpfs/commons/groups/gursoy_lab/mstoll/'
sys.path.append(path)

import pandas as pd
import numpy as np 

from codes.models.data_form.DataForm import DataTransfo_1SNP, PatientList
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import matplotlib.pyplot as plt
from codes.models.metrics import calculate_roc_auc
from sklearn.decomposition import PCA




In [None]:
import featurewiz as gwiz


In [None]:
### data constants
# Target identifiers
CHR = 1
SNP = 'rs673604'
pheno_method = 'Abby'  # accepted values: 'Paul', 'Abby'

# Boolean switches
load_data = True
save_data = False
remove_none = True
decorelate = False
compute_features = True
padding = False
equalize = True

# Numeric / categorical settings
rollup_depth = 4
Classes_nb = 2  # number of classes tied to one SNP (0 or 1 here)
vocab_size = None  # placeholder: to be defined once the data is known
padding_token = 0
prop_train_test = 0.8
threshold_corr = 0.9
threshold_rare = 50
remove_rare = 'all'  # accepted values: None, 'all', 'one_class'

### data format
batch_size = 20
data_share = 1 / 1000

In [None]:
# Build the single-SNP data loader from the configuration constants.
# Every option is forwarded from its named constant — including `load_data`
# and `remove_none` — so that editing the configuration cell actually changes
# what is loaded instead of being overridden by literals in this call.
dataT = DataTransfo_1SNP(
    SNP=SNP,
    CHR=CHR,
    method=pheno_method,
    padding=padding,
    pad_token=padding_token,
    load_data=load_data,
    save_data=save_data,
    compute_features=compute_features,
    data_share=data_share,
    prop_train_test=prop_train_test,
    remove_none=remove_none,
    rollup_depth=rollup_depth,
    equalize_label=equalize,
    decorelate=decorelate,
    threshold_corr=threshold_corr,
    threshold_rare=threshold_rare,
    remove_rare=remove_rare,
)
# Patient list exposed by the loader; kept for later cells.
patient_list = dataT.get_patientlist()


In [None]:
# Pull the model inputs out of the loader, explicitly requesting `with_env=False`.
# NOTE(review): only `data` and `labels` are used by the cells shown here;
# `indices_env` and `name_envs` are unpacked but not referenced afterwards.
data, labels, indices_env, name_envs = dataT.get_tree_data(with_env=False)

In [None]:
# Work on a fixed window of rows (positions 300 to 499) of the loaded arrays.
subset_rows = slice(300, 500)
pheno = np.array(data)[subset_rows]
label = np.array(labels)[subset_rows]

In [None]:
# Sanity check: dimensions of the `pheno` array.
pheno.shape

In [None]:
# NOTE(review): this displays the full `labels` object returned by the loader,
# not the sliced `label` array that the rest of the notebook uses.
labels

In [None]:
# Wrap the NumPy arrays in pandas containers; the label series is named 'SNP_label'.
pheno_df = pd.DataFrame(pheno)
label_df = pd.Series(label, name='SNP_label')

In [None]:
# Display the label series.
label_df

In [None]:
# Feature selection with featurewiz: fit the selector on the phenotype frame
# and keep the transformed features together with the returned target.
wiz = gwiz.FeatureWiz(verbose=1)

# `pheno_df` and `label_df` were already built from the same, unchanged
# `pheno` / `label` arrays in an earlier cell, so they are reused here instead
# of being constructed a second time.
X_train, y_train = wiz.fit_transform(pheno_df, label_df)


In [None]:
# NOTE(review): exact repeat of the `fit_transform` call in the previous cell —
# it refits the selector on the same inputs and overwrites `X_train` / `y_train`.
X_train, y_train = wiz.fit_transform(pheno_df, label_df)


In [None]:
# Apply the fitted selector to a frame.
# NOTE(review): the input is the same `pheno_df` the selector was fitted on,
# so despite its name `X_test` is not a held-out set.
X_test = wiz.transform(pheno_df)


In [None]:
# Standardise the features: fit the scaler, then apply it, overwriting `pheno`.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed further down, so test rows contribute to the scaling statistics.
scaler = StandardScaler().fit(pheno)
pheno = scaler.transform(pheno)


In [None]:
# Fit an L1-regularised linear model of `label` on the standardised features;
# its non-zero coefficients are used as a feature filter in the next cells.
# NOTE(review): fitted on all rows, i.e. before the train/test split below.
lasso_model = Lasso(alpha=0.005)  # alpha is the regularisation strength; tune as needed
lasso_model.fit(pheno, label)


In [None]:
# Column positions whose Lasso coefficient is non-zero.
selected_features = np.nonzero(lasso_model.coef_)[0]


In [None]:
# Number of features kept by the Lasso filter.
len(selected_features)

In [None]:
# Apply PCA to the standardised features.
# The number of components cannot exceed either dimension of the input, so the
# requested 100 is capped by the shape of `pheno`; without the cap the fit
# raises as soon as `pheno` has fewer than 100 rows or fewer than 100 columns.
n_pca_components = min(100, *pheno.shape)
pca = PCA(n_components=n_pca_components)
principal_components = pca.fit_transform(pheno)

# Total share of variance captured by the retained components.
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", np.sum(explained_variance_ratio))


In [None]:
# Keep only the columns retained by the Lasso filter.
# NOTE(review): this overwrites `pheno` in place, so the cell is not idempotent —
# running it a second time re-applies the same column positions to the already
# reduced array (wrong columns or an IndexError).
pheno = pheno[:, selected_features]

In [None]:
# Shuffle the row positions of the arrays that are actually being split, then
# cut them into train and test parts.
# `pheno` / `label` are the row window selected earlier, whereas
# `len(patient_list)` is the size of the loader's patient list; the two need not
# match, so the permutation and the split point are derived from `pheno` itself
# to avoid out-of-range indices or an empty test set.
n_samples = len(pheno)
assert len(label) == n_samples, "pheno and label must have the same number of rows"
n_train = int(prop_train_test * n_samples)

# Fixed seed so that a fresh "Restart & Run All" reproduces the same split.
rng = np.random.default_rng(0)
indices = rng.permutation(n_samples)
train_idx, test_idx = indices[:n_train], indices[n_train:]

pheno_data_train = np.asarray(pheno)[train_idx]
label_data_train = np.asarray(label)[train_idx]
pheno_data_test = np.asarray(pheno)[test_idx]
label_data_test = np.asarray(label)[test_idx]


In [None]:
# Prepend a column of ones (the constant term) to each design matrix.
column_one_train = np.ones((len(pheno_data_train), 1))
column_one_test = np.ones((len(pheno_data_test), 1))

pheno_data_train_with_constant = np.concatenate(
    (column_one_train, pheno_data_train),
    axis=1,
)
pheno_data_test_with_constant = np.concatenate(
    (column_one_test, pheno_data_test),
    axis=1,
)

In [None]:
# Logistic regression of the training labels on the training design matrix
# (constant column included), optimised with BFGS and with convergence output on.
logit_model = sm.Logit(
    endog=label_data_train,
    exog=pheno_data_train_with_constant,
)
result = logit_model.fit(method='bfgs', disp=True)

In [None]:
### data visualisation with df (original note, translated)
# Predicted probabilities from the fitted logit model for the test and train
# design matrices (both include the constant column).
proba_test = result.predict(pheno_data_test_with_constant)
proba_train = result.predict(pheno_data_train_with_constant)

In [None]:
# Hard predictions: probability strictly above 0.5 maps to class 1, else class 0.
labels_pred_test = (proba_test > 0.5).astype(int)
labels_pred_train = (proba_train > 0.5).astype(int)

# Counts per predicted class. In this notebook "positive" counts the samples
# predicted as class 0 and "negative" those predicted as class 1.
nb_positive_test = (labels_pred_test == 0).sum()
nb_negative_test = (labels_pred_test == 1).sum()
nb_positive_train = (labels_pred_train == 0).sum()
nb_negative_train = (labels_pred_train == 1).sum()

In [None]:
def _rates_by_predicted_class(y_true, y_pred):
    """Return ``(TP, FP, TN, FN)`` as fractions of each predicted class.

    Convention used by this notebook: class 0 is treated as "positive" and
    class 1 as "negative".

    * TP / FP: share of the samples *predicted 0* whose true label is 0 / 1.
    * TN / FN: share of the samples *predicted 1* whose true label is 1 / 0.

    Parameters
    ----------
    y_true : array-like of 0/1 true labels.
    y_pred : array-like of 0/1 predicted labels, same length as ``y_true``.

    Notes
    -----
    If a class is never predicted, its denominator is zero and NumPy yields
    ``nan`` (with a RuntimeWarning) for the two rates of that class.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    pred_pos, pred_neg = (y_pred == 0), (y_pred == 1)
    true_pos, true_neg = (y_true == 0), (y_true == 1)
    n_pred_pos = np.sum(pred_pos)
    n_pred_neg = np.sum(pred_neg)

    tp = np.sum(true_pos & pred_pos) / n_pred_pos
    fp = np.sum(true_neg & pred_pos) / n_pred_pos
    tn = np.sum(true_neg & pred_neg) / n_pred_neg
    fn = np.sum(true_pos & pred_neg) / n_pred_neg
    return tp, fp, tn, fn


# Same computation for both splits, so it is done through one helper instead of
# two copy-pasted groups of formulas.
TP_test, FP_test, TN_test, FN_test = _rates_by_predicted_class(
    label_data_test, labels_pred_test
)
TP_train, FP_train, TN_train, FN_train = _rates_by_predicted_class(
    label_data_train, labels_pred_train
)

# ROC AUC is computed from the predicted probabilities, not the hard labels.
auc_test = calculate_roc_auc(label_data_test, proba_test)
auc_train = calculate_roc_auc(label_data_train, proba_train)

In [None]:
# Report every metric as `name=repr(value)`, test split first, then train, then AUCs.
for metric_name, metric_value in (
    ('TP_test', TP_test),
    ('FP_test', FP_test),
    ('TN_test', TN_test),
    ('FN_test', FN_test),
    ('TP_train', TP_train),
    ('FP_train', FP_train),
    ('TN_train', TN_train),
    ('FN_train', FN_train),
    ('auc_test', auc_test),
    ('auc_train', auc_train),
):
    print(f'{metric_name}={metric_value!r}')