In [191]:
from collections import Counter
from time import time

import numpy as np
import pandas as pd

from combat.pycombat import pycombat
from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [84]:
# Imputed lumbar protein table; ``X`` keeps a plain NumPy copy of its values.
df_imputed = pd.read_csv(
    "./Data/data files/imputed/df_protein_lumbar_imputed_75_5_etr.csv"
)
X = df_imputed.to_numpy()

# Per-sample metadata: class label, CSF type and TMT batch.
# Filtering on CSF type is done later, inside pipeline_pt1.
df_classes = pd.read_csv(
    "./Data/data files/iNPH_data_protein_median.csv",
    usecols=["Cortical_biopsy_grouping", "CSF_type", "TMT Set"],
)

In [85]:
def run_combat(df, TMT_set_indices):
    """Apply ComBat batch correction to a samples-by-features dataframe.

    The frame is transposed before being handed to ``pycombat`` and the
    result is transposed back, so the returned frame has the same
    orientation as the input.

    :param df: Dataframe to correct (one row per sample).
    :param TMT_set_indices: Batch label (TMT set) for each row of ``df``.
    :return: Dataframe with ComBat applied, in the same orientation as ``df``.
    """
    features_by_sample = df.T
    corrected = pycombat(features_by_sample, TMT_set_indices)
    return corrected.T

In [86]:
def add_noise(df, n_samples, std = 0.1, random_state = None):
    """Helper for the simple perturbation function.

    Draws ``n_samples`` rows from ``df`` with replacement and adds zero-mean
    gaussian noise to every value of the drawn rows.

    :param df: df filtered on class
    :param n_samples: Number of rows to draw (with replacement)
    :param std: Standard deviation of the noise. Defaults to 0.1.
    :param random_state: Optional seed (or ``np.random.RandomState``) used for
        both the row sampling and the noise so results can be reproduced.
        Defaults to None, which uses NumPy's global random state exactly as
        before.
    :return: df holding only the perturbed samples (index labels are those of
        the sampled rows, so they may repeat)
    """
    if random_state is None:
        # Keep the legacy behaviour: both draws come from the global state.
        rng = np.random
        sampler_state = None
    elif isinstance(random_state, np.random.RandomState):
        rng = sampler_state = random_state
    else:
        rng = sampler_state = np.random.RandomState(random_state)

    sampled_df = df.sample(n = n_samples, replace = True, random_state = sampler_state)
    gaussian_noise = rng.normal(0, std, size=sampled_df.shape)
    return sampled_df + gaussian_noise

In [87]:
def simple_perturbation(X, y, n_samples_per_class = None, std = 0.1):
    """Oversample classes by duplicating random rows and adding gaussian noise.

    :param X: Dataframe with samples to get perturbed. Its index must align
        with the index of ``y`` (``X`` is filtered with a boolean mask on ``y``).
    :param y: Class labels as a pandas Series
    :param n_samples_per_class: Target number of samples per class, defaults to None.
        If None -> every class is filled up to the size of the current largest class.
        Classes that already have at least the target size are left untouched.
    :param std: How much the noise can deviate from the mean (Standard dev.), defaults to 0.1
    :return: A df with the original rows followed by the perturbed rows (the
        original index labels are kept, so they may repeat), and a list
        stating which row belongs to which class.
    """
    class_counts = y.value_counts()
    if n_samples_per_class is None:
        # Use the largest count directly. The previous argmax()/pop() pair
        # mixed a positional index with a label lookup and could remove the
        # wrong class.
        target_size = class_counts.max()
    else:
        target_size = n_samples_per_class

    df_list = []
    y_new_classes = list(y)
    for label, n_samples in class_counts.items():
        n_missing = target_size - n_samples
        if n_missing <= 0:
            # Class is already large enough; a negative n would make
            # DataFrame.sample raise.
            continue
        perturbed = add_noise(X[y == label], int(n_missing), std=std)
        df_list.append(perturbed)
        y_new_classes.extend([label] * len(perturbed))
    return pd.concat([X] + df_list, axis=0), y_new_classes

In [None]:
# Default-configured estimators that are tuned further down.
xgboost, lr, rf = XGBClassifier(), LogisticRegression(), RandomForestClassifier()

# Attach a display name to each estimator; it is used as log label and result key.
xgboost.name, lr.name, rf.name = "XGBoost", "LogisticRegression", "RandomForest"

In [153]:
# Hyperparameter grids searched by GridSearchCV, one per estimator.

# XGBoost grid. "gamma" previously read ``1,5`` (two list items) instead of
# ``1.5``, which duplicated the values 1 and 5 and never tried 1.5.
xgboost_params = {"min_child_weight": [1, 5, 10],
                  "gamma": [0.5, 1, 1.5, 2, 5],
                  "subsample": [0.6, 0.8, 1.0],
                  "colsample_bytree": [0.6, 0.8, 1.0],
                  "max_depth": [3, 4, 5],
                  "n_estimators": [100, 500, 1000]}

# L1-penalised multinomial logistic regression; the "saga" solver is paired
# with the l1 penalty here.
lr_params = [{"penalty": ["l1"],
              "C": np.arange(0.2, 3.1, 0.2),
              "solver": ["saga"],
              "multi_class": ["multinomial"],
              "max_iter": np.arange(1000, 10001, 2000)
             }]

# Random forest grid.
rf_params = {"n_estimators": np.arange(100, 501, 100),
             "criterion": ["gini", "entropy", "log_loss"],
             "max_depth": [None, 10, 20, 40],
             "max_features": ["sqrt", "log2"],
             "min_samples_leaf": [1, 2, 4]}

In [178]:
# Estimators and their search grids, paired by position.
models, params = [xgboost, lr, rf], [xgboost_params, lr_params, rf_params]

In [186]:
def pipeline_pt1(X, df_classes, csf_type, n_samples_per_class = 100, do_combat = True, do_std_scaler = True):
    """Prepare features and labels for one CSF type.

    Steps: select the metadata rows of ``csf_type``, optionally run ComBat
    on ``X`` using the TMT set as batch label, oversample every class with
    ``simple_perturbation`` and optionally standardise the features.

    :param X: Dataframe with one row per sample of the chosen CSF type, in
        the same order as the matching rows of ``df_classes``.
    :param df_classes: Dataframe with the columns "Cortical_biopsy_grouping",
        "CSF_type" and "TMT Set".
    :param csf_type: Value of "CSF_type" to keep.
    :param n_samples_per_class: Target class size passed to
        ``simple_perturbation``. Defaults to 100.
    :param do_combat: Whether to run ComBat. Defaults to True.
    :param do_std_scaler: Whether to standardise the features. Defaults to True.
    :raises ValueError: If ``X`` and the selected metadata differ in length.
    :return: ``(X, y)``: the augmented features (the array returned by
        ``StandardScaler.fit_transform`` when scaling is on) and the class
        labels as returned by ``simple_perturbation``.
    """
    # Named to avoid shadowing the builtin ``type``.
    classes_for_type = df_classes[df_classes["CSF_type"] == csf_type].reset_index(drop=True)
    if len(classes_for_type) != len(X):
        # Labels and batches are matched to X by position; a length mismatch
        # would pair samples with the wrong class or batch.
        raise ValueError(
            f"X has {len(X)} rows but {len(classes_for_type)} samples have "
            f"CSF_type == {csf_type!r}"
        )
    y = classes_for_type["Cortical_biopsy_grouping"]
    tmt_set = classes_for_type["TMT Set"].values
    if do_combat:
        X = run_combat(X, tmt_set)
    X, y = simple_perturbation(X, y, n_samples_per_class = n_samples_per_class)
    if do_std_scaler:
        std_scaler = StandardScaler()
        X = std_scaler.fit_transform(X)
    return X, y

In [187]:
def pipeline_pt2(X, y, models, params, cv = 5, n_jobs = -1):
    """Grid-search every model and collect the winning hyperparameters.

    Models and grids are paired by position. Progress and wall-clock time
    per model are printed.

    :param X: Feature matrix passed to ``GridSearchCV.fit``.
    :param y: Class labels passed to ``GridSearchCV.fit``.
    :param models: Estimators, each carrying a ``name`` attribute.
    :param params: Parameter grids, one per entry of ``models``.
    :param cv: Cross-validation setting forwarded to ``GridSearchCV``. Defaults to 5.
    :param n_jobs: Parallel jobs forwarded to ``GridSearchCV``. Defaults to -1.
    :return: Dict mapping ``model.name`` to that model's ``best_params_``.
    """
    tuned = {}
    for model, grid in zip(models, params):
        start_time = time()
        print(f"{model.name} started.")
        search = GridSearchCV(model, param_grid=grid, cv=cv, n_jobs=n_jobs).fit(X, y)
        tuned[model.name] = search.best_params_
        print(f"{model.name} is done in {time() - start_time} seconds.\n" )
    return tuned
    

In [188]:
# Build the augmented, scaled training data for the "L" CSF type.
X, y = pipeline_pt1(X=df_imputed, df_classes=df_classes, csf_type="L")

Found 15 batches.
Adjusting for 0 covariate(s) or covariate level(s).
Standardizing Data across genes.
Fitting L/S model and finding priors.
Finding parametric adjustments.
Adjusting the Data


In [183]:
# Grid-search all estimators and keep the best hyperparameters per model.
best_params = pipeline_pt2(X=X, y=y, models=models, params=params)

In [137]:
# Class balance after augmentation. The earlier argument-less ``pipeline_pt2()``
# call raises TypeError with the current definition; the recorded output is a
# label count, which this expression reproduces.
Counter(y)

Counter({1: 100, 0: 100, 2: 100})

In [176]:
def pipeline_pt3(X, y, models, params, tmt_set, K = 5, n_samples_per_class = 100):
    """Evaluate models with K-fold CV, augmenting and scaling inside each fold.

    ComBat is run once on the full data. For every fold only the training
    part is oversampled with ``simple_perturbation``, and the scaler is
    fitted on the training part alone, so the held-out rows stay untouched
    by augmentation and scaling statistics.

    :param X: Dataframe with one row per sample.
    :param y: Class labels, one per row of ``X`` (Series or list-like).
    :param models: Estimators, each carrying a ``name`` attribute.
    :param params: Assumed to be a dict mapping ``model.name`` to a dict of
        hyperparameters (e.g. the result of ``pipeline_pt2``) -- TODO confirm
        the intended meaning. Anything else, or a missing entry, leaves the
        model's current settings unchanged.
    :param tmt_set: TMT batch label for each row of ``X``.
    :param K: Number of folds. Defaults to 5.
    :param n_samples_per_class: Target class size for the training folds.
        Defaults to 100.
    :return: Dict mapping ``model.name`` to the list of per-fold test scores
        (``estimator.score`` on the held-out rows).
    """
    # Positional row access below needs X and y to share a clean 0..n-1 index.
    X = run_combat(X, tmt_set).reset_index(drop=True)
    y = pd.Series(y).reset_index(drop=True)

    k_fold = KFold(n_splits=K, shuffle=True)
    scores = {model.name: [] for model in models}
    for train_index, test_index in k_fold.split(X):
        # .iloc selects rows; plain X[train_index] would look up columns.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train, y_train = simple_perturbation(X_train, y_train, n_samples_per_class=n_samples_per_class)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        for model in models:
            # Fit a fresh copy so folds do not share fitted state.
            fold_model = clone(model)
            fold_params = params.get(model.name) if isinstance(params, dict) else None
            if fold_params:
                fold_model.set_params(**fold_params)
            fold_model.fit(X_train, y_train)
            scores[model.name].append(fold_model.score(X_test, y_test))
    return scores
        

In [198]:
# Shuffled 5-fold splitter; no seed is set, so the folds differ between runs.
k_fold = KFold(shuffle=True, n_splits=5)
k_fold.get_n_splits(X, y)

5

In [199]:
# Show which row positions land in the train and test part of every fold.
for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    print(f'Fold: {i}', f'Train: {train_index}', f'Test: {test_index}', sep="\n")

Fold: 0
Train: [ 0  1  2  3  4  5  6  7  9 10 11 13 16 17 21 23 24 25 26 27 28 29 30 31
 32 33 34 37 38 39 41 42 44 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
 61 62 63 64 65 66 69 70 71 73 74 75 76 77 78 80 81 82 83 84]
Test: [ 8 12 14 15 18 19 20 22 35 36 40 43 45 67 68 72 79]
Fold: 1
Train: [ 0  2  3  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 25 27 29
 31 32 33 34 35 36 37 39 40 42 43 44 45 46 47 48 49 53 54 55 56 57 58 59
 61 62 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 82 83]
Test: [ 1  4  5 24 26 28 30 38 41 50 51 52 60 63 64 81 84]
Fold: 2
Train: [ 1  3  4  5  7  8  9 10 11 12 13 14 15 17 18 19 20 22 23 24 26 28 29 30
 34 35 36 37 38 40 41 42 43 44 45 46 49 50 51 52 53 54 55 56 57 60 61 62
 63 64 65 66 67 68 70 71 72 73 74 75 76 77 78 79 81 82 83 84]
Test: [ 0  2  6 16 21 25 27 31 32 33 39 47 48 58 59 69 80]
Fold: 3
Train: [ 0  1  2  3  4  5  6  8 11 12 13 14 15 16 18 19 20 21 22 23 24 25 26 27
 28 30 31 32 33 34 35 36 38 39 40 41 42 43 45 46 47 48 49 50 51 52

In [201]:
# Test indices left in scope by the last iteration of the fold loop above.
test_index

array([ 3, 11, 13, 23, 34, 42, 46, 49, 55, 56, 57, 61, 65, 71, 74, 75, 82])