In [44]:
import pandas as pd
import numpy as np

from sklearn.model_selection import *
from sklearn.metrics import *
from sklearn.preprocessing import *

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC

from sklearn.tree  import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [91]:
# Helper: true-positive rate (sensitivity).
def TPR(y_true, y_pred):
    """Return TP / (TP + FN) read off the confusion matrix of the two label vectors.

    Entry [1, 1] of the matrix is taken as the true-positive count and entry
    [1, 0] as the false-negative count, i.e. the label at index 1 is treated as
    the positive class.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    Ratio of true positives to all samples in row 1 of the (2x2) matrix.
    """
    cm = confusion_matrix(y_true, y_pred)
    true_pos = cm[1, 1]
    false_neg = cm[1, 0]
    return true_pos / (true_pos + false_neg)

# Helper: true-negative rate (specificity).
def TNR(y_true, y_pred):
    """Return TN / (TN + FP) read off the confusion matrix of the two label vectors.

    Entry [0, 0] of the matrix is taken as the true-negative count and entry
    [0, 1] as the false-positive count, i.e. the label at index 0 is treated as
    the negative class. Entry [1, 1] is touched only so that a matrix smaller
    than 2x2 is rejected the same way as before.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    Ratio of true negatives to all samples in row 0 of the (2x2) matrix.
    """
    cm = confusion_matrix(y_true, y_pred)
    true_neg, false_pos = cm[0, 0], cm[0, 1]
    cm[1, 1]  # bounds check only: keeps the IndexError for a 1x1 matrix
    return true_neg / (true_neg + false_pos)

# Metrics handed to the grid search; each key is the label under which that
# metric's results are reported ("mean_test_<key>" in cv_results_).
# TODO(Sokolov): work out how these quantities should be interpreted
# NOTE(review): the key "Sensivity" is misspelt, but renaming it would change
# the cv_results_ column names, so it is kept as is.
scoring = dict([
    ("AUC", "roc_auc"),
    ("Balanced Accuracy", "balanced_accuracy"),
    ("Sensivity", make_scorer(TPR)),
    ("Specificity", make_scorer(TNR)),
])

In [92]:
# Load TCGA RNA-seq data: tab-separated table whose first column becomes the
# row index; then preview the first two rows.
df = pd.read_csv(
    "../../data/breast_cancer/TCGA.tsv",
    sep="\t",
    index_col=0,
)
print(df.head(2))

                 MT-CO3     MT-CO2     MT-CO1     MT-ND4    MT-ATP6  \
TCGA-D8-A73W  13.824123  13.617516  14.068893  13.513785  12.975990   
TCGA-A2-A0YC  14.386709  14.625298  14.615428  14.438262  13.695031   

                MT-RNR2     MT-CYB     MT-ND1   MT-ND3     MT-ND2  ...  \
TCGA-D8-A73W  13.648528  12.942293  13.155674  12.5284  12.607828  ...   
TCGA-A2-A0YC  14.356808  13.094957  13.287214  13.0466  13.142747  ...   

              hsa-miR-431-5p  hsa-miR-188-5p  hsa-miR-3917  hsa-miR-129-5p  \
TCGA-D8-A73W        1.828233        0.422378      0.511132        1.239119   
TCGA-A2-A0YC        0.877462        0.877462      0.877462        1.419152   

              hsa-miR-548v  hsa-miR-192-3p  hsa-miR-134-3p  hsa-miR-551b-3p  \
TCGA-D8-A73W      0.887789        0.748694        1.014647         0.511134   
TCGA-A2-A0YC      0.000000        0.000000        1.419152         0.877462   

              hsa-miR-20b-3p  Class  
TCGA-D8-A73W        0.327804      1  
TCGA-A2-A0YC  

In [93]:
# Take gene subset (or all feature columns)

#genes = ["CDK1", "FOXM1", "LRIG2", "MSH2", "PLK1", "RACGAP1", "RRM2", "TMPO"]
# "Class" is the label column of df, not a feature: keeping it in this list
# would put the target itself into the feature matrix built from `genes`
# (label leakage), so it is excluded here.
genes = [column for column in df.columns if column != "Class"]

In [94]:
# Build the NumPy arrays passed to scikit-learn: the "Class" column is the
# target vector, the columns listed in `genes` form the feature matrix.
y = df.loc[:, "Class"].to_numpy()
X = df.loc[:, genes].to_numpy()

In [95]:
# Classification algorithm and the hyper-parameter grid to optimise.
# Exactly one (ml_algo, opt_params) pair should be active. ml_algo is a
# (step name, estimator) tuple used as a pipeline step, and the grid keys use
# the "<step name>__<parameter>" form to address that step.
#--------------------------------------------------

# 1. SVM
#ml_algo = ("SVM", SVC(kernel="linear", class_weight="balanced"))
#opt_params = {"SVM__C": np.logspace(-4, 4, 9, base=4)}

# 2. Random Forest
# random_state is fixed so that re-running the notebook grows the same forests;
# an unseeded forest gives different scores on every run.
#ml_algo = ("rf", RandomForestClassifier( class_weight="balanced", random_state=0 ))
ml_algo = ("rf", RandomForestClassifier(random_state=0))
opt_params = {
    "rf__n_estimators": [200],
    "rf__max_depth": [25, 30]
}

# 3. Decision Tree
#ml_algo = ("dt", DecisionTreeClassifier( class_weight="balanced", random_state=0 ))
#opt_params = {
#    "dt__max_depth": [5, 10, 20]
#}

In [96]:
# Classification pipeline: standardise the features, then apply the estimator
# selected in `ml_algo`.
scaler = StandardScaler()
classifier = Pipeline(steps=[("scaler", scaler), ml_algo])

In [None]:
# Optimization cycle: repeated stratified 5-fold cross-validation (20 repeats)
# over the parameter grid. random_state pins the fold assignment so that
# repeated runs of the notebook evaluate the candidates on the same splits.
splitter = RepeatedStratifiedKFold(n_splits=5, n_repeats=20, random_state=0)

# refit=False: only the cross-validated scores are used below; no final model
# is fitted on the full data.
CV = GridSearchCV(
    classifier,
    opt_params,
    scoring=scoring,
    cv=splitter,
    refit=False)
CV.fit(X, y)

# Infer best parameter: pick the candidate with the highest mean balanced
# accuracy and report all metrics for that candidate.
mean_test_scoring_values = {s: CV.cv_results_["mean_test_" + s] for s in scoring}
max_BA_index = np.argmax(mean_test_scoring_values["Balanced Accuracy"])
best_scores = {s: mean_test_scoring_values[s][max_BA_index] for s in scoring}

# Print the already computed summary instead of rebuilding the same dict.
print(best_scores)
print(CV.cv_results_["params"][max_BA_index])