# **Methylation Biomarkers for Predicting Cancer**

## **Random Forest for Feature Selection**

**Author:** Meg Hutch

**Date:** February 14, 2020

**Objective:** Use random forest to select genes for features in our deep learning classifier.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score, auc, precision_recall_fscore_support, f1_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, GridSearchCV

In [2]:
mcTrain = pd.read_csv('C:\\Users\\User\\Box Sync/Projects/Multi_Cancer_DL/02_Processed_Data/mcTrain_70_30.csv')

**Drop Un-neccessary columns**

In [3]:
mcTrain = mcTrain.drop(columns=["dilute_library_concentration", "age", "gender", "frag_mean"])

**Split Data into X inputs and Y outputs (diagnosis classification)**

In [4]:
mcTrain_x = mcTrain.drop(columns=["diagnosis"])
mcTrain_y = mcTrain[['seq_num','diagnosis']]

**Code the Categorical Data**

In [5]:
# Replace each outcome target with numerical value
mcTrain_y = mcTrain_y.replace('HEA', 0)
mcTrain_y = mcTrain_y.replace('CRC', 1)
mcTrain_y = mcTrain_y.replace('ESCA', 2)
mcTrain_y = mcTrain_y.replace('HCC', 3)
mcTrain_y = mcTrain_y.replace('STAD', 4)
mcTrain_y = mcTrain_y.replace('GBM', 5)
mcTrain_y = mcTrain_y.replace('BRCA', 6)

**Convert seq_num id to index**

In [6]:
mcTrain_x = mcTrain_x.set_index('seq_num')
mcTrain_y = mcTrain_y.set_index('seq_num')

**Split Training Data into a training/validation**

In [7]:
from sklearn.model_selection import train_test_split
np.random.seed(21420)
X_train, X_test, y_train, y_test = train_test_split(mcTrain_x, mcTrain_y, test_size=0.25, random_state=25, shuffle = True, stratify = mcTrain_y)

**Examine Disease Distributions**

In [8]:
y_train_perc = y_train.groupby(['diagnosis']).size()/len(y_train)*100
y_test_perc = y_test.groupby(['diagnosis']).size()/len(y_test)*100

print(y_train_perc)
print(y_test_perc)

diagnosis
0    25.414365
1    18.784530
2     9.944751
3    20.441989
4    10.497238
5     7.734807
6     7.182320
dtype: float64
diagnosis
0    24.590164
1    18.032787
2     9.836066
3    19.672131
4    11.475410
5     8.196721
6     8.196721
dtype: float64


**Define Random Forest Hypertuning Function**

In [9]:
def hypertuning_fxn(X, y, nfolds, model , param_grid, scoring='auc_roc', verbose=False): 
    """function that uses GridSearchCV to test a specified param_grid of hyperparameters and choose the optimal one based on nfolds cross-validation results. 

    Keyword arguments:
    model -- a 'fitted' sklearn model object 
    X -- predictor matrix (dtype='numpy array', required)
    y -- outcome vector (dtype='numpy array', required)
    cv -- if True, prints a the roc_auc score from 10-fold crossvalidation (dtype='boolean', default='True')
    """
    
    from sklearn.model_selection import KFold, GridSearchCV
    np.random.seed(12345)

    grid_search = GridSearchCV(estimator= model,
                                     param_grid=param_grid,
                                     cv=KFold(nfolds),
                                     scoring=scoring,
                                     return_train_score=True,
                                     n_jobs = -1)

        
    grid_search.fit(X, y)    
    print(" scorer function: {}".format(scoring))
    print(" ##### CV performance: mean & sd scores #####")

    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    print('best cv score: {:0.3f}'.format(grid_search.best_score_))
    print('best cv params: ', grid_search.best_params_)

    worst_index=np.argmin(grid_search.cv_results_['mean_test_score'])
    print('worst cv score: {:0.3f}'.format(grid_search.cv_results_['mean_test_score'][worst_index]))
    print('worst cv params: ', grid_search.cv_results_['params'][worst_index])
    ##
    if verbose==True:
        for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"% (mean, std * 2, params))
   
    return(grid_search)

**One-hot encode y classes**

In [10]:
from sklearn import preprocessing
y_train = preprocessing.label_binarize(y_train, classes=[0, 1, 2, 3, 4, 5, 6])
y_test = preprocessing.label_binarize(y_test, classes=[0, 1, 2, 3, 4, 5, 6])

**Convert to arrays**

In [11]:
X_train = X_train.values
X_test = X_test.values

**Tune Hyperparameters**

In [12]:
### tuning RF hyperparameters
# Number of trees in random forest
n_estimators = [100] # 300, 500, 100
# Number of features to consider at every split
max_features = [3,10,'auto']
# Maximum number of levels in tree
max_depth = [5,25]
# Minimum number of samples required to split a node
min_samples_split = [2, 5]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 5]


param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

model = RandomForestClassifier(criterion='entropy', random_state=12345)

rf_hyper=hypertuning_fxn(X_train, y_train, nfolds=5, model=model , param_grid=param_grid, scoring='roc_auc')



 scorer function: roc_auc
 ##### CV performance: mean & sd scores #####
best cv score: 0.666
best cv params:  {'max_depth': 25, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 100}
worst cv score: 0.587
worst cv params:  {'max_depth': 5, 'max_features': 3, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


**Return the Best Estimator**

In [14]:
rf = rf_hyper.best_estimator_
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=12345, verbose=0,
            warm_start=False)

**Evaluate Model Performance**

In [15]:
# graphs for roc
def ez_roc(model, x, y, pos_label=1):
    """prints a basic Recievor Operator Curve (ROC). 

    Keyword arguments:
    model -- a 'fitted' sklearn model object 
    x -- predictor matrix (dtype='numpy array', required)
    y -- outcome vector (dtype='numpy array', required)
    pos_label --binary label considered positive in y  (dtype='int', default=1)
    """
    from sklearn.metrics import roc_curve, auc

    model_name=type(model).__name__ # defining model name as the __name__ characteristic held by sklearn models

    y_proba = model.predict_proba(x)[:,1]
        
    fpr, tpr, thresholds = roc_curve(y, y_proba, pos_label=pos_label)
    roc_auc = auc(fpr, tpr)
    
    plt.title('ROC curve')
    ax1= plt.plot(fpr, tpr, 'b', label = '%s AUC = %0.3f' % (model_name, roc_auc), linewidth=2)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    return()

In [16]:
# graphs for precision-recall curve
def ez_prc(model, x, y, pos_label=1):
    """prints a basic Precision-Recall curve. 
    recall: the porportion of positives in the dataset that were correctly classified. (true_pos/ (true_pos + false_neg))
    precision: the porportion of predicted y=1 values that are correct (true_pos/ (true_pos + false_pos))

    Keyword arguments:
    model -- a 'fitted' sklearn model object 
    x -- predictor matrix (dtype='numpy array', required)
    y -- outcome vector (dtype='numpy array', required)
    pos_label --binary label considered positive in y  (dtype='int', default=1)
    """
    from sklearn.metrics import average_precision_score, precision_recall_curve

    model_name=type(model).__name__ # defining model name as the __name__ characteristic held by sklearn models

    y_proba = model.predict_proba(x)[:,1]

    precision, recall, thresholds =precision_recall_curve(y, y_proba, pos_label=1, sample_weight=None)
    avg_p=average_precision_score(y, y_proba, pos_label=1, sample_weight=None)
    
    plt.title('Precision-Recall curve')
    ax1= plt.plot(precision, recall, 'b', label = '%s AP = %0.3f' % (model_name, avg_p), linewidth=2)
    plt.legend(loc = 'lower left')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('Precision')
    plt.xlabel('Recall')

In [17]:
def evaluate_model(model,x,y, cv=True):
    """prints common binary classification evaluation metrics and an ROC curve. 

    Keyword arguments:
    model -- a 'fitted' sklearn model object 
    x -- predictor matrix (dtype='numpy array', required)
    y -- outcome vector (dtype='numpy array', required)
    cv -- if True, prints a the roc_auc score from 10-fold crossvalidation (dtype='boolean', default='True')
    """
    import sklearn.metrics
    from sklearn.metrics import log_loss, average_precision_score, precision_recall_curve
    from sklearn.model_selection import cross_val_score

    if cv==True:
        cv_results= cross_val_score(model, x, y, scoring='roc_auc', cv=10)
        print("across 10 fold cv on trainingset, the model had \n", 
             "mean auroc: {:0.3f}".format(np.mean(cv_results)), "\n",
             "std auroc: {:0.3f}".format(np.std(cv_results))
             )

        base_cv_score=np.mean(cross_val_score(model, x, y, scoring='roc_auc', cv=10)) 

    print("###metrics on provided dataset:###")
    ##basic model performance
    y_hat = model.predict(x) # predicted classes using default 0.5 threshold
    y_proba = model.predict_proba(x)[:,1] #predicted probabilities
    errors = abs(y_hat - y)
    mape = 100 * np.mean(errors / y) # mean absolute percentage error
    accuracy = 100 - mape 
    auc=roc_auc_score(y, y_proba)
    loss= log_loss(y, y_hat)

    print ('the AUC is: {:0.3f}'.format(auc))
    print ('the logloss is: {:0.3f}'.format(loss))
    print("confusion matrix:\n ", confusion_matrix(y, y_hat))
    print("classification report:\n ", classification_report(y,y_hat, digits=3))

    ez_roc(model, x, y, pos_label=1) #plotting roc curve
    plt.show()
    ez_prc(model, x, y, pos_label=1) #plotting roc curve
    plt.show()

In [21]:
evaluate_model(model, X_train,y_train, cv=False)



###metrics on provided dataset:###


TypeError: list indices must be integers or slices, not tuple

**Evaluate Model on the Test Set**

In [19]:
print('\n RandomForests:')
evaluate_model(rf, X_test,y_test,cv=False)


 RandomForests:
###metrics on provided dataset:###


TypeError: list indices must be integers or slices, not tuple

# **Examine Important Features**

**Just run based on what the best estimator ends up being?**

In [None]:
## 02.17.2020: Is selectfrommodel code neccessary? 
#from sklearn.feature_selection import SelectFromModel
# First feature selection test 
#sel = SelectFromModel(RandomForestClassifier(n_estimators = 100))
#sel = SelectFromModel(rf_hyper.best_estimator_)
#sel.fit(X_train, y_train)
#sel.get_support()
#selected_feat= X_train.columns[(sel.get_support())]
#print(len(selected_feat)) 
#print(selected_feat)

# rerun rf model w/out grid search
#model = RandomForestClassifier(criterion='entropy', random_state=12345)
rf = rf.fit(X_train, y_train)

## histogram of the top 10 features by importance
# create a list of features/column names
features = list(X_train.columns.values) 

feat_importances = pd.Series(rf.feature_importances_, index=X_train.columns)  
feat_importances.nlargest(25).plot(kind='barh')