In [13]:
# !pip install pandas

In [14]:
import numpy as np
import pandas as pd

In [231]:
print("Loading data...")
# Read, in order: the per-sample annotation table and the two expression
# tables (one per microarray platform, going by the file names).
patients_data, platform1_data, platform2_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'preterm_data/anoSC2_v20_nokey.csv',
        'preterm_data/eset_HuGene21ST.csv',
        'preterm_data/HTA20_RMA.csv',
    )
)
print("Data successfully loaded.")

Loading data...
Data successfully loaded.


In [223]:
print("Merging data from different platforms...")
# Give the first column of each expression table the shared name 'GeneID'
# so it can serve as the join key.
platform1_data.columns = ['GeneID', *platform1_data.columns[1:]]
platform2_data.columns = ['GeneID', *platform2_data.columns[1:]]

# Gene identifiers that occur in both tables.
common_genes = list(
    set(platform1_data['GeneID'].values) & set(platform2_data['GeneID'].values)
)

# Keep only the shared genes in each table before joining.
platform1_data = platform1_data.loc[platform1_data['GeneID'].isin(common_genes)]
platform2_data = platform2_data.loc[platform2_data['GeneID'].isin(common_genes)]

merged = platform1_data.merge(platform2_data, how='inner', on='GeneID')
# Order the columns as GeneID followed by every SampleID from the annotation
# table; this raises KeyError if a listed sample has no expression column.
cols = ['GeneID', *patients_data['SampleID'].tolist()]
merged = merged[cols]
print("Data successfully merged.")

Merging data from different platforms...
Data successfully merged.


In [225]:
print("Scaling data...")

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Transpose so that rows are samples and columns are genes, dropping the
# leading GeneID row; the scaler then standardises every gene across samples.
scaled_features = scaler.fit_transform(merged.values.T[1:])
# Flip back to genes-by-samples and restore the original row/column labels.
transformed_data = pd.DataFrame(
    scaled_features.T,
    index=merged.index.to_list(),
    columns=merged.columns.to_list()[1:],
)
transformed_data["GeneID"] = merged["GeneID"].to_list()
print("Data successfully scaled.")

Scaling data...
Data successfully scaled.


In [226]:
print("Extracting sPTD/Control and PPROM/Control train samples..")
# Sample IDs flagged Train == 1 whose Group is Control or the case group of
# the respective comparison.
sPTD_samples = patients_data.loc[
    (patients_data["Train"] == 1) & patients_data["Group"].isin(["Control", "sPTD"]),
    "SampleID",
].values
PPROM_samples = patients_data.loc[
    (patients_data["Train"] == 1) & patients_data["Group"].isin(["Control", "PPROM"]),
    "SampleID",
].values
print("Samples extracted.")

Extracting sPTD/Control and PPROM/Control train samples..
Samples extracted.


In [227]:
print("Fetching latest sPTD & PPROM data, per one person...")
def get_latest_data_for(data_set):
    """Return one row per individual: the row with the largest 'GA' value.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Must contain the columns 'IndividualID' and 'GA'.

    Returns
    -------
    pandas.DataFrame
        The selected rows with their original index labels, ordered by the
        first appearance of each IndividualID in ``data_set``. An empty input
        yields an empty frame with the same columns.
    """
    # DataFrame.append was removed in pandas 2.0, so collect the per-individual
    # rows and concatenate once. groupby(sort=False) visits individuals in
    # order of first appearance and makes a single pass over the frame.
    latest_rows = [
        samples.sort_values('GA').iloc[-1:]
        for _, samples in data_set.groupby('IndividualID', sort=False)
    ]
    if not latest_rows:
        # pd.concat refuses an empty list; keep the column layout for callers.
        return data_set.iloc[:0]
    return pd.concat(latest_rows)
    
# For each comparison, restrict the annotation table to the previously
# extracted sample IDs, then reduce it to one (latest) row per individual.
sPTD_data = patients_data.loc[patients_data["SampleID"].isin(sPTD_samples)]
PPROM_data = patients_data.loc[patients_data["SampleID"].isin(PPROM_samples)]

latest_sptd_data = get_latest_data_for(sPTD_data)
latest_pprom_data = get_latest_data_for(PPROM_data)
print("Latest data fetched.")

Fetching latest sPTD & PPROM data, per one person...
Latest data fetched.


In [228]:
print("Fetching gene expression data for latest samples & their delivery outcomes...")
# Sample IDs are column labels of transformed_data, so indexing with them
# picks the expression columns of the selected samples (rows remain genes).
sPTD_genes = transformed_data[latest_sptd_data.SampleID.values]
PPROM_genes = transformed_data[latest_pprom_data.SampleID.values]
# Binary labels: 1 for the case group, 0 for everything else (only 'Control'
# remains after the earlier Group filter). An explicit comparison always
# yields an integer array, whereas chained .replace() on an object Series
# depends on implicit downcasting that pandas 2.2 deprecates.
sPTD_outcomes = (latest_sptd_data.Group == 'sPTD').astype('int64').values
PPROM_outcomes = (latest_pprom_data.Group == 'PPROM').astype('int64').values
print("Data successfully fetched.")

Fetching gene expression data for latest samples & their delivery outcomes...
Data successfully fetched.


In [229]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso

print("Performing feature selection by extracting 100 most relevant sPTD/PPROM genes...")


def select_genes(genes, outcomes, alpha=0.01, max_features=50):
    """Fit a Lasso-based selector and return it with its boolean gene mask.

    Parameters
    ----------
    genes : pandas.DataFrame
        Expression values with genes as rows and samples as columns.
    outcomes : array-like
        One label per sample, in the column order of ``genes``.
    alpha : float
        Regularisation strength passed to ``Lasso``.
    max_features : int
        Upper bound on the number of genes kept by ``SelectFromModel``.

    Returns
    -------
    (SelectFromModel, numpy.ndarray)
        The fitted selector and a boolean mask with one entry per gene row.
    """
    selector = SelectFromModel(Lasso(alpha=alpha), max_features=max_features)
    # scikit-learn expects samples as rows, hence the transpose.
    selector.fit(genes.T, outcomes)
    return selector, selector.get_support()


# NOTE(review): selection is fitted on every sample that is later used for
# cross-validation, so downstream CV scores are optimistically biased.
sPTD_feats, sPTD_gene_mask = select_genes(sPTD_genes, sPTD_outcomes)
PPROM_feats, PPROM_gene_mask = select_genes(PPROM_genes, PPROM_outcomes)

# Keep the selected gene rows and transpose to samples-by-genes for modelling.
sPTD_train_test_set = sPTD_genes.loc[sPTD_gene_mask].T
PPROM_train_test_set = PPROM_genes.loc[PPROM_gene_mask].T
print(f'sPTD train-test set shape: {sPTD_train_test_set.shape}')
print("==========")
# The label previously said "sPTD" for the PPROM matrix as well.
print(f'PPROM train-test set shape: {PPROM_train_test_set.shape}')

Performing feature selection by extracting 100 most relevant sPTD/PPROM genes...
sPTD train-test set shape: (153, 50)
sPTD train-test set shape: (175, 50)


In [230]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# cross_validate and scoring_metrics were not imported/defined anywhere in the
# visible notebook, so this cell raised NameError on a fresh kernel. The metric
# names are taken from the 'test_<metric>' keys that are read below.
scoring_metrics = ['precision', 'recall', 'f1', 'roc_auc']
model = LogisticRegression(random_state=4)


def report_cv_scores(title, features, outcomes):
    """Run 5-fold cross-validation of ``model`` and print the mean of each metric.

    Parameters
    ----------
    title : str
        Header line printed before the scores.
    features : pandas.DataFrame
        Samples-by-genes matrix.
    outcomes : array-like
        Binary label per sample, aligned with the rows of ``features``.

    Returns
    -------
    dict
        The raw result of ``cross_validate``.
    """
    print(title)
    cv_scores = cross_validate(model, features, outcomes, cv=5, scoring=scoring_metrics)
    print(f"Precision score mean: {np.mean(cv_scores['test_precision'])}")
    print(f"Recall score mean: {np.mean(cv_scores['test_recall'])}")
    print(f"F1 score mean: {np.mean(cv_scores['test_f1'])}")
    print(f"Roc auc score mean: {np.mean(cv_scores['test_roc_auc'])}")
    return cv_scores


# NOTE(review): the gene subsets evaluated here were chosen (and the expression
# values standardised) using all of these samples and their outcomes before
# the folds are split, so the scores below are optimistically biased. Wrapping
# scaling + selection + classifier in a Pipeline passed to cross_validate
# would give an unbiased estimate.
scores = report_cv_scores("--- sPTD vs Control scores ---", sPTD_train_test_set, sPTD_outcomes)

scores = report_cv_scores("---- PPROM vs Control scores ----", PPROM_train_test_set, PPROM_outcomes)

--- sPTD vs Control scores ---
Precision score mean: 1.0
Recall score mean: 0.95
F1 score mean: 0.9714285714285715
Roc auc score mean: 1.0
---- PPROM vs Control scores ----
Precision score mean: 1.0
Recall score mean: 0.975
F1 score mean: 0.9866666666666667
Roc auc score mean: 1.0
