In [1]:
from util.gen_utils import *
from util.ml_utils import *

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Load training data

In [2]:
# Discovery-cohort input files (post-QC): sample metadata, htseq counts and TMM.
discovery_meta_csv = "../../data/rnaseq_discovery_data/sample_meta_postQC.csv"
discovery_htseq_csv = "../../data/rnaseq_discovery_data/htseq_postQC.csv"
discovery_tmm_csv = "../../data/rnaseq_discovery_data/TMM_postQC.csv"

# A MyGeneInfo client is passed to the loader through its `mygene_db` argument.
train_all = rnaseq_and_meta_data(
    discovery_meta_csv,
    discovery_htseq_csv,
    discovery_tmm_csv,
    mygene_db = mygene.MyGeneInfo(),
)

# Validation will be done with qPCR, so limit the genes to protein_coding.
train_all.filter_to_gene_types(['protein_coding'])

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-7041...done.
Finished.
15 input query terms found no hit:
	['ENSG00000133808', 'ENSG00000150076', 'ENSG00000155640', 'ENSG00000199837', 'ENSG00000213846', 'ENS
querying 1-15...done.
Finished.
8 input query terms found no hit:
	['AL356053.1', 'none', 'none', 'RP11-367G6.3', 'none', 'none', 'none', 'none']


In [3]:
# Train mask
# All PE_STAN data was used in the DE analysis that drives feature selection,
# so all of it should be used for training.
training_frac = 1.0

masks = data_masks(train_frac = training_frac, seed = 1041, label_col = 'case')

# Samples with ga_at_collection <= 16 (i.e. collected before week 17, per the mask name).
collected_pre17wks = train_all.meta.ga_at_collection <= 16
masks.add_mask('is_collected_pre17wks', collected_pre17wks)

# The sampled training mask is requested after 'is_collected_pre17wks' has been
# registered, because it refers to that mask by label.
sampled_training = masks.get_sampled_mask(
    train_all.meta,
    addtnl_mask_label = 'is_collected_pre17wks',
    blocking_col = 'subject',
)
masks.add_mask('is_training', sampled_training)

# Want to filter post-partum samples.
post_partum = train_all.meta.is_pp == 1
masks.add_mask('is_pp', post_partum)

# Logical combinations of the training mask with each of the other two masks.
for other_label in ('is_collected_pre17wks', 'is_pp'):
    masks.add_mask_logical_and_combinations('is_training', other_label)

# Filter train_all to relevant samples.
train_mask = masks.masks['is_training_and_is_collected_pre17wks']
training_not_pre17wks = masks.masks['is_training_and_not_is_collected_pre17wks']
training_not_pp = masks.masks['is_training_and_not_is_pp']
train_late_mask = np.logical_and(training_not_pre17wks, training_not_pp)
train_all.filter_samples(train_mask)

# Feature selection and model training
* Start with features identified in DE and identify logFC changes pre and post 17 weeks
* Filter initial list based on:
    * Coefficient of variation cutoff [Want genes that appear to have stable logFC]
    * logFC cutoff [Want genes that appear to be sig different between PE and control]
    * CPM cutoff [Want highly expressed genes such that detection in qPCR or in smaller plasma sample is expected]
* To choose an appropriate cutoff for each filter, do a parameter sweep + LR model

In [4]:
# DE results for PE-specific changes over gestation, loaded with alpha = 0.05.
de_PE = de_data(
    "out/de/DE_PEspecific_changes_over_gestation.csv",
    alpha = 0.05,
    de_type = 'PE preg changes',
)

# Genes flagged by the DE significance mask that also appear in the training logCPM table.
sig_de_genes = de_PE.de.index[de_PE.is_sig_mask]
sig_protein_coding_genes = sig_de_genes.intersection(train_all.rnaseq.logCPM.index)

logFC_pre17_sig = logFC_data_by_group(
    sig_protein_coding_genes,
    {True : 'Pre 17 weeks'},
    group_col = 'pre17_weeks',
    CV_cutoff = 1.0,
    logFC_cutoff = 0.5,
)

# The training mask is joined onto the metadata as the 'pre17_weeks' grouping column.
sig_logCPM = train_all.rnaseq.logCPM.loc[sig_protein_coding_genes, :]
meta_with_group = train_all.meta.join(train_mask.rename('pre17_weeks'))
logFC_pre17_sig.get_logFC_and_CI_by_group(sig_logCPM, meta_with_group)

Now calculating logFC for Pre 17 weeks
Now estimating logFC confidence interval for Pre 17 weeks
1000 resampling iterations completed
2000 resampling iterations completed
Identifying when during gestation we observe changes


In [5]:
# Parameter grids for the filter sweep passed to training_pipeline below.

# CV cutoff grid: starts at 0.2 with step 0.1. Upper limit = 1.0 since CV > 1.0 means logFC switched signs.
# NOTE(review): np.arange with a float step can gain or lose its last point through
# floating-point rounding - confirm the grid ends at 1.0 as intended.
cv_cutoff_vals = np.arange(0.2, 1.1, 0.1)
# logFC cutoff grid: 0, 0.5, 1.0. From visualizing the data previously, mean |logFC| < 1.0, so sweep between 0 and 1.0.
logFC_cutoff_vals = np.arange(0, 1.25, 0.5)
# High-expression cutoff grid: a single value, 200 (original note: min = 200 since ~10K copies).
high_expr_cutoff_vals = np.array([200])

# Run the sweep over the three grids for the 'Pre 17 weeks' group with n_max_coef = 25.
# The result is kept as best_fit; later cells read best_fit['model'] and best_fit['features'].
best_fit = training_pipeline(train_all, logFC_pre17_sig, ['Pre 17 weeks'], cv_cutoff_vals, logFC_cutoff_vals, high_expr_cutoff_vals, n_max_coef = 25)

Best score = 0.94 with 11 features and CV cutoff = 0.70, logFC cutoff = 0.00, CPM cutoff = 200


# Check model performance
* Check with training data
* Check with independent dataset - Del Vecchio et al
* Check with qPCR dataset from separate samples

In [6]:
# Model inputs built from the filtered training cohort, restricted to the selected features.
train_data = ML_data(
    rnaseq_meta = train_all,
    y_col = 'case',
    to_batch_correct = True,
    group_col = 'subject',
    features = best_fit['features'],
)

# Classification results of the fitted model on the data it was trained on.
fitted_model = best_fit['model']
get_classification_results('Training', fitted_model, train_data)

Training results:
ROC AUC = 0.94
Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94        25
           1       1.00      0.88      0.93        24

    accuracy                           0.94        49
   macro avg       0.95      0.94      0.94        49
weighted avg       0.95      0.94      0.94        49

Confusion matrix:
[[25  0]
 [ 3 21]]



In [7]:
# Attach the model's class predictions and its scores (column 1 of predict_proba)
# to the training metadata, both indexed like the training labels.
train_label_index = train_data.y.index

train_prediction = pd.Series(
    best_fit['model'].predict(train_data.X),
    index = train_label_index,
    name = 'prediction',
)
w_pred_train = train_all.meta.join(train_prediction)

train_score = pd.Series(
    best_fit['model'].predict_proba(train_data.X)[:, 1],
    index = train_label_index,
    name = 'score',
)
w_pred_train = w_pred_train.join(train_score)

# Display the training samples whose prediction differs from the 'case' label.
train_is_misclassified = w_pred_train.case != w_pred_train.prediction
w_pred_train.loc[train_is_misclassified]

Unnamed: 0_level_0,term,subject,ga_at_collection,is_pe,is_pp,case,group,pe_onset_ga_wk,pe_onset_ga_day,pe_onset_at_delivery,...,labor_start,del_mode,pe_feature,mult_gest,bmi,prev_preg_n,pe_type,weeks_post_del,prediction,score
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
PE_STAN_1960008088,1,19600,8.0,1,0,1,PE,39.0,4.0,0.0,...,Augmented,C-Section,mild,,34.41,0,late,,0,0.231192
PE_STAN_1053008158,2,10530,15.0,1,0,1,PE,36.0,2.0,0.0,...,,C-Section,severe,,28.4,3,late,,0,0.314586
PE_STAN_2007008088,1,20070,8.0,1,0,1,PE,38.0,4.0,0.0,...,Induced,NSVD,severe,,50.5,1,late,,0,0.429738


In [8]:
delvecchio_all = rnaseq_and_meta_data(
    "../../data/delvecchio_data/sample_meta.csv",
    "../../data/delvecchio_data/htseq_merged.csv",
    "../../data/delvecchio_data/TMM.csv",
    mL_plasma = 0.2,
)

# ML data should only contain samples <= 16 weeks, which per Del Vecchio et al. are the T1 samples.
is_first_trimester = delvecchio_all.meta.term == 1
delvecchio_all.filter_samples(is_first_trimester)
delvecchio_ml_data = ML_data(
    delvecchio_all,
    y_col = 'case',
    to_batch_correct = True,
    features = best_fit['features'],
)

get_classification_results('Delvecchio [PE vs Normotensive AND Other APOs]', best_fit['model'], delvecchio_ml_data)

# Second comparison: keep only samples labelled as uncomplicated or as
# preeclampsia/gestational hypertension, dropping the other complications.
pe_or_nt_labels = ['No Complications', 'Preeclampsia/gestational hypertension']
is_pe_or_nt = delvecchio_all.meta.complication_during_pregnancy.isin(pe_or_nt_labels)
only_pe_v_NT = delvecchio_all.meta.loc[is_pe_or_nt].index
delvecchio_ml_data_only_PEvNT = delvecchio_ml_data.filter_samples(only_pe_v_NT)

get_classification_results('Delvecchio [PE vs Normotensive]', best_fit['model'], delvecchio_ml_data_only_PEvNT)

Delvecchio [PE vs Normotensive AND Other APOs] results:
ROC AUC = 0.88
Report:
              precision    recall  f1-score   support

           0       1.00      0.76      0.87        17
           1       0.67      1.00      0.80         8

    accuracy                           0.84        25
   macro avg       0.83      0.88      0.83        25
weighted avg       0.89      0.84      0.85        25

Confusion matrix:
[[13  4]
 [ 0  8]]

Delvecchio [PE vs Normotensive] results:
ROC AUC = 0.94
Report:
              precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.89      1.00      0.94         8

    accuracy                           0.94        16
   macro avg       0.94      0.94      0.94        16
weighted avg       0.94      0.94      0.94        16

Confusion matrix:
[[7 1]
 [0 8]]



In [9]:
# Attach the model's class predictions and its scores (column 1 of predict_proba)
# to the Del Vecchio metadata, then drop every row that contains a missing value.
delvecchio_label_index = delvecchio_ml_data.y.index

delvecchio_prediction = pd.Series(
    best_fit['model'].predict(delvecchio_ml_data.X),
    index = delvecchio_label_index,
    name = 'prediction',
)
w_pred = delvecchio_all.meta.join(delvecchio_prediction)

delvecchio_score = pd.Series(
    best_fit['model'].predict_proba(delvecchio_ml_data.X)[:, 1],
    index = delvecchio_label_index,
    name = 'score',
)
w_pred = w_pred.join(delvecchio_score).dropna()

# Display the samples whose prediction differs from the 'case' label.
delvecchio_is_misclassified = w_pred.case != w_pred.prediction
w_pred.loc[delvecchio_is_misclassified]

Unnamed: 0_level_0,subj_id,sample_type,BioSample,complication_during_pregnancy,term,case,prediction,score
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SRR12214626,Chronic_hypertension_Patient_1,1st_Trimester,SAMN15523998,Chronic hypertension,1.0,0,1,0.962146
SRR12214576,Gestational_diabetes_Patient_6,1st_Trimester,SAMN15524018,Gestational Diabetes,1.0,0,1,0.927093
SRR12214538,Normal_Pregnancy_Patient_7,1st_Trimester,SAMN15524065,No Complications,1.0,0,1,0.948128
SRR12214553,Gestational_diabetes_Patient_1,1st_Trimester,SAMN15524081,Gestational Diabetes,1.0,0,1,0.908372


# Check model performance later during pregnancy
* Training data - Later samples - T2 or T3 pre diagnosis
* Munchel et al. S2 and S3 datasets - for samples at diagnosis

In [10]:
# Load the discovery cohort again (no gene-type filter is applied to this copy).
train_late_all = rnaseq_and_meta_data(
    "../../data/rnaseq_discovery_data/sample_meta_postQC.csv",
    "../../data/rnaseq_discovery_data/htseq_postQC.csv",
    "../../data/rnaseq_discovery_data/TMM_postQC.csv",
)

# ML data should only contain the later samples selected by train_late_mask:
# training samples that are neither in the pre-17-week mask nor post-partum.
train_late_all.filter_samples(train_late_mask)
train_late_ml_data = ML_data(
    train_late_all,
    y_col = 'case',
    to_batch_correct = True,
    features = best_fit['features'],
)

get_classification_results('Train', best_fit['model'], train_late_ml_data)

Train results:
ROC AUC = 0.79
Report:
              precision    recall  f1-score   support

           0       0.73      0.85      0.79        13
           1       0.85      0.73      0.79        15

    accuracy                           0.79        28
   macro avg       0.79      0.79      0.79        28
weighted avg       0.79      0.79      0.79        28

Confusion matrix:
[[11  2]
 [ 4 11]]



In [11]:
munchel_S2_all = rnaseq_and_meta_data(
    "../../data/munchel_data/S2_meta.csv",
    "../../data/munchel_data/S2_counts_only.csv",
    "../../data/munchel_data/S2_TMM.csv",
    counts_index_cols = [0],
    mL_plasma = 4.5,
)

# For this dataset the selected features are passed without their 'gene_num' index level.
munchel_S2_features = best_fit['features'].droplevel('gene_num')
munchel_S2_ml_data = ML_data(
    munchel_S2_all,
    y_col = 'case',
    to_batch_correct = True,
    features = munchel_S2_features,
)

get_classification_results('Munchel S2', best_fit['model'], munchel_S2_ml_data)

Munchel S2 results:
ROC AUC = 0.60
Report:
              precision    recall  f1-score   support

           0       0.75      0.52      0.61        73
           1       0.44      0.68      0.53        40

    accuracy                           0.58       113
   macro avg       0.59      0.60      0.57       113
weighted avg       0.64      0.58      0.58       113

Confusion matrix:
[[38 35]
 [13 27]]



In [12]:
munchel_s3_all = rnaseq_and_meta_data(
    "../../data/munchel_data/S3_meta.csv",
    "../../data/munchel_data/S3_counts_only.csv",
    "../../data/munchel_data/S3_TMM.csv",
    counts_index_cols = [0],
    mL_plasma = 4.5,
)

# For this dataset the selected features are passed without their 'gene_num' index level.
munchel_s3_features = best_fit['features'].droplevel('gene_num')
munchel_s3_ml_data = ML_data(
    munchel_s3_all,
    y_col = 'case',
    to_batch_correct = True,
    features = munchel_s3_features,
)

get_classification_results('Munchel S3', best_fit['model'], munchel_s3_ml_data)

Munchel S3 results:
ROC AUC = 0.62
Report:
              precision    recall  f1-score   support

           0       0.62      0.62      0.62        24
           1       0.62      0.62      0.62        24

    accuracy                           0.62        48
   macro avg       0.62      0.62      0.62        48
weighted avg       0.62      0.62      0.62        48

Confusion matrix:
[[15  9]
 [ 9 15]]



# Save model and features

In [13]:
# Save the fitted model and the mask object through write_pkl.
fitted_model_path = 'out/ml/fitted_model.pkl'
train_masks_path = 'out/ml/train_data_masks.pkl'
write_pkl(best_fit['model'], fitted_model_path)
write_pkl(masks, train_masks_path)

# Write the selected features as a CSV; index = False leaves the frame's index out of the file.
selected_features_frame = best_fit['features'].to_frame()
selected_features_frame.to_csv('out/ml/fitted_model_features.csv', index = False)