In [1]:
from util.gen_utils import *
from util.ml_utils import *
from util.plot_utils import *

import os

%load_ext autoreload
%autoreload 2
%matplotlib inline

# Load training data

In [2]:
# Input paths for the discovery cohort: the htseq file, the TMM file and the sample metadata.
discovery_htseq_csv = "../../data/rnaseq_discovery_data/htseq_postQC.csv"
discovery_tmm_csv = "../../data/rnaseq_discovery_data/TMM_postQC.csv"
discovery_meta_csv = "../../data/rnaseq_discovery_data/sample_meta_postQC.csv"

train_rnaseq = rnaseq_data(
    discovery_htseq_csv,
    discovery_tmm_csv,
    mygene_db = mygene.MyGeneInfo(),
)
# Validation will be done with qPCR, so limit the data to protein-coding genes.
train_rnaseq.filter_to_gene_types(['protein_coding'])

train_meta = read_sample_meta_table(discovery_meta_csv)

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-7041...done.
Finished.
15 input query terms found no hit:
	['ENSG00000133808', 'ENSG00000150076', 'ENSG00000155640', 'ENSG00000199837', 'ENSG00000213846', 'ENS
querying 1-15...done.
Finished.
8 input query terms found no hit:
	['AL356053.1', 'none', 'none', 'RP11-367G6.3', 'none', 'none', 'none', 'none']


In [3]:
# Data mask
# All PE_STAN data was used in the DE analysis that drives feature selection,
# so all of it should be used for training as well.
training_frac = 1.0

masks = data_masks(train_frac = training_frac, seed = 1041, label_col = 'case')

# Samples with ga_at_collection <= 16 are the ones labelled "collected pre 17 weeks".
collected_pre17wks = (train_meta.ga_at_collection <= 16)
masks.add_mask('is_collected_pre17wks', collected_pre17wks)

# Training mask, sampled with 'subject' as the blocking column
# (presumably so that one subject's samples stay together - confirm in data_masks).
sampled_training = masks.get_sampled_mask(
    train_meta,
    addtnl_mask_label = 'is_collected_pre17wks',
    blocking_col = 'subject',
)
masks.add_mask('is_training', sampled_training)

# Logical combinations
masks.add_mask_logical_and_combinations('is_training', 'is_collected_pre17wks')

# Feature selection and model training
* Start with features identified in DE and identify logFC changes pre and post 17 weeks
* Filter initial list based on:
    * Coefficient of variation cutoff [Want genes that appear to have stable logFC]
    * logFC cutoff [Want genes that appear to be sig different between PE and control]
    * CPM cutoff [Want highly expressed genes such that detection in qPCR or in smaller plasma sample is expected]
        * Based on standard curve with qPCR, want at least CPM = 200 for average depth of 50 million reads
            * 200 CPM = 10000 reads / 50 million reads
* To choose an appropriate cutoff for each filter, do a parameter sweep + LR model

In [4]:
# DE results; keep the significant genes that also have a logCPM row in the training data.
de_PE = de_data(
    "out/de/DE_PEspecific_changes_over_gestation.csv",
    alpha = 0.05,
    de_type = 'PE preg changes',
)
sig_de_genes = de_PE.de.index[de_PE.is_sig_mask]
sig_protein_coding_genes = sig_de_genes.intersection(train_rnaseq.logCPM.index)

# Group samples by the boolean 'pre17_weeks' column and label the two groups.
pre17_group_labels = {True : 'Pre 17 weeks', False : 'Post 17 weeks'}
logFC_pre17_sig = logFC_data_by_group(
    sig_protein_coding_genes,
    pre17_group_labels,
    group_col = 'pre17_weeks',
    CV_cutoff = 1.0,
    logFC_cutoff = 0.5,
)

# logCPM restricted to the significant genes, plus metadata carrying the grouping column.
sig_gene_logCPM = train_rnaseq.logCPM.loc[sig_protein_coding_genes, :]
pre17_weeks_flag = masks.masks['is_training_and_is_collected_pre17wks'].rename('pre17_weeks')
logFC_pre17_sig.get_logFC_and_CI_by_group(sig_gene_logCPM, train_meta.join(pre17_weeks_flag))

Now calculating logFC for Pre 17 weeks
Now calculating logFC for Post 17 weeks
Now estimating logFC confidence interval for Pre 17 weeks
1000 resampling iterations completed
2000 resampling iterations completed
Now estimating logFC confidence interval for Post 17 weeks
1000 resampling iterations completed
2000 resampling iterations completed
Identifying when during gestation we observe changes


In [None]:
# CV cutoffs to sweep: 0.20, 0.25, ..., 1.00.
# The upper limit is 1.0 because CV > 1.0 means the logFC switched signs.
# np.linspace pins both endpoints; np.arange with a fractional step and stop = 1.1
# would also emit cutoffs above 1.0 (e.g. 1.05), contradicting that limit.
cv_cutoff_vals = np.round(np.linspace(0.2, 1.0, 17), 2)

# logFC thresholds: when visualizing the data previously, mean |logFC| < 1.0, so sweep 0 - 1.0.
logFC_cutoff_vals = np.arange(0, 1.25, 0.25)

# Minimum CPM = 200, based on the qPCR standard curve experiment.
high_expr_cutoff_vals = np.array([200])

best_fit = training_pipeline(
    train_meta.loc[masks.masks['is_training_and_is_collected_pre17wks']],
    train_rnaseq,
    logFC_pre17_sig,
    ['Pre 17 weeks'],
    cv_cutoff_vals,
    logFC_cutoff_vals,
    high_expr_cutoff_vals,
    n_max_coef = 25,
)

# Check model performance
* Check with training data
* Check with independent dataset - Del Vecchio et al
* Check with qPCR dataset from separate samples

In [None]:
# Training samples that were collected before 17 weeks.
pre17_training_meta = train_meta.loc[masks.masks['is_training_and_is_collected_pre17wks']]

train_data = ML_data(
    meta = pre17_training_meta,
    rnaseq_inst = train_rnaseq,
    y_col = 'case',
    to_batch_correct = True,
    group_col = 'subject',
    features = best_fit['features'],
)

get_classification_results('Training', best_fit['model'], train_data)

In [None]:
# Model predictions for the training samples, indexed like the labels.
train_predictions = pd.Series(
    best_fit['model'].predict(train_data.X),
    index = train_data.y.index,
    name = 'prediction',
)
w_pred_train = train_meta.loc[masks.masks['is_training_and_is_collected_pre17wks']].join(train_predictions)

# Show only the samples whose prediction disagrees with the label.
# To narrow the view, select columns such as:
# ['ga_at_collection', 'pe_onset_ga_wk', 'pe_onset_ga_day', 'ga_wk_del', 'ga_day_del']
is_misclassified_train = w_pred_train.case != w_pred_train.prediction
w_pred_train.loc[is_misclassified_train]

In [None]:
delvecchio_sample_meta = read_delvecchio_meta(
    biosample_results_path = '../../data/delvecchio_data/biosample_result.txt',
    sra_results_path = '../../data/delvecchio_data/SraRunTable.txt',
)
delvecchio_rnaseq = rnaseq_data(
    "../../data/delvecchio_data/htseq_merged.csv",
    "../../data/delvecchio_data/TMM.csv",
    mL_plasma = 0.2,
)

# The ML data should only contain samples collected at <= 17 weeks, which per
# Del Vecchio et al. are the T1 samples (term == 1).
is_first_term = (delvecchio_sample_meta.term == 1)
delvecchio_ml_data = ML_data(
    meta = delvecchio_sample_meta.loc[is_first_term],
    rnaseq_inst = delvecchio_rnaseq,
    y_col = 'case',
    to_batch_correct = True,
    features = best_fit['features'],
)

# Disabled alternative: keep only T1 samples with no complications or with PE/gestational hypertension.
#only_pe_no_comp = delvecchio_sample_meta.loc[np.logical_and(delvecchio_sample_meta.term == 1, delvecchio_sample_meta.complication_during_pregnancy.isin(['No Complications', 'Preeclampsia/gestational hypertension']))].index
#delvecchio_ml_data.X, delvecchio_ml_data.y = delvecchio_ml_data.filter_samples(only_pe_no_comp)

get_classification_results('Delvecchio', best_fit['model'], delvecchio_ml_data)

In [None]:
# Model predictions for the Del Vecchio samples that went into the ML data.
delvecchio_predictions = pd.Series(
    best_fit['model'].predict(delvecchio_ml_data.X),
    index = delvecchio_ml_data.y.index,
    name = 'prediction',
)

# The join leaves 'prediction' empty for samples that were not scored; drop only those.
# Restricting dropna to the 'prediction' column keeps scored samples that merely have a
# missing value in some other metadata column, so misclassifications are not hidden.
w_pred = delvecchio_sample_meta.join(delvecchio_predictions).dropna(subset = ['prediction'])

# Show only the samples whose prediction disagrees with the label.
w_pred.loc[w_pred.case != w_pred.prediction]

In [None]:
# NOTE: this notebook defines plot_logCPM in a later cell; on a fresh kernel that cell must
# be run before this one (unless the util star-imports already provide the function).
plot_logCPM(
    [train_data.X, delvecchio_ml_data.X],
    [train_data.y, delvecchio_ml_data.y],
    ['train', 'delvecchio'],
    best_fit['features'],
)

In [10]:
def _gene_label(gene):
    """Return the label used as panel title and plotted column name for one gene key."""
    # A plain string key is its own label (indexing it would only give its first character).
    if isinstance(gene, str):
        return gene
    # Otherwise the first element is the label, unless it is the "none" placeholder,
    # in which case the whole key is used.
    first = gene[0]
    return gene if first == "none" else first


def plot_logCPM(CPM_list, meta_list, data_names, genes, colors = ["#92c5de", "#ca0020"], ylabel = "$log_2(CPM + 1)$", figsize = (40, 5)):
    """Draw one boxplot panel per gene, with one box group per dataset split by ``case``.

    Parameters
    ----------
    CPM_list : list
        One expression table per dataset; ``CPM_list[i].loc[:, gene]`` must return the
        values of ``gene`` (indexed like the matching ``meta_list`` entry).
    meta_list : list
        One pandas Series/DataFrame per dataset. After conversion to a DataFrame it must
        contain a ``case`` column, which is used as the boxplot hue.
    data_names : list
        One display name per dataset; used as the x-axis category (column ``type``).
    genes : sequence
        Gene keys to plot, one panel each. A key is either a string or a sequence whose
        first element is the label (the whole key is the label when that element is "none").
    colors : list
        Palette passed to ``sns.boxplot``.
    ylabel : str
        Y-axis label, shown on the first panel only.
    figsize : tuple
        Figure size passed to ``plt.subplots``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``genes`` is empty.
    """
    n_genes = len(genes)
    if n_genes == 0:
        raise ValueError("plot_logCPM needs at least one gene to plot")

    # squeeze=False keeps the axes container 2-D, so a single gene works like many genes.
    fig, axes = plt.subplots(1, n_genes, sharey = 'row', sharex = 'col', figsize = figsize, squeeze = False)

    for col_i, val in enumerate(genes):
        panel = axes[0, col_i]
        name = _gene_label(val)

        # Build one long table: metadata + this gene's values + the dataset name.
        frames = []
        for i in range(len(meta_list)):
            # Copy so that inserting columns never touches the caller's object.
            frame = pd.DataFrame(meta_list[i]).copy()
            frame.insert(frame.shape[1], name, CPM_list[i].loc[:, val])
            frame.insert(frame.shape[1], 'type', data_names[i])
            frames.append(frame)
        to_plot = pd.concat(frames, axis = 0)

        sns.boxplot(x = "type", y = name, hue = "case", dodge = True, palette = colors, data = to_plot, showfliers = True, ax = panel)

        # Only the left-most panel carries the shared y-axis label.
        panel.set_ylabel(ylabel if col_i == 0 else "")
        panel.set_title(name)

        sns.despine(left = True)