# Modeling through Random Forest

## Load Filtered Training Data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE

# Notice that install "umap-learn" instead of "umap"
# Default parameters for umap.UMAP(): 
# n_components: 2, n_neighbors: 15, min_dist: 0.1, metric: euclidean

import umap.umap_ as umap

In [2]:
train = pd.read_csv('Data/4klog2Norm.tsv', sep = '\t', index_col = [0])
train = train.transpose()
train.index.name = 'r_id'
train.reset_index(inplace = True)
train['MGS_LEVEL'] = np.array([(int(s[-1]) - 1) for s in list(train['r_id'])])
train.head()

external_gene_name,r_id,TSPAN6,FUCA2,GCLC,ANKIB1,KRIT1,CD99,TMEM176A,CASP10,NDUFAF7,...,RP11-392E22.9,RP11-248C1.4,RP11-84A14.6,RP11-1037J10.1,RP11-5O24.1,RP11-244E17.1,RP11-419I17.1,AC013271.5,RP11-158M9.1,MGS_LEVEL
0,205_2,9.69873,9.00792,11.074547,11.255878,10.578447,9.69873,9.853723,5.876924,9.770795,...,9.213259,0.0,0.0,0.0,2.014479,0.0,6.376592,0.0,3.339286,1
1,251_1,9.181011,8.48025,10.224124,10.854084,10.851306,9.135288,9.296297,5.330312,9.875868,...,8.289156,0.0,2.814728,0.0,2.006454,0.0,6.358106,0.0,2.814728,0
2,474_3,8.99818,8.114741,10.315019,10.832309,10.474078,8.410391,9.031668,6.190813,9.971107,...,9.550266,0.0,2.808214,0.0,0.0,0.0,6.285467,0.0,3.32283,2
3,178_4,8.623853,7.775116,10.114775,10.970147,10.669332,8.469199,8.489462,6.511398,10.07845,...,9.06575,0.0,0.0,0.0,2.587999,0.0,7.421474,0.0,3.090678,3
4,313_1,9.26574,8.237808,10.334186,11.156822,10.967349,8.282984,9.32454,5.706331,10.097601,...,9.771254,0.0,0.0,0.0,2.887836,0.0,6.282053,0.0,4.087801,0


## Train Different Random Forest Models

In [3]:
# k: The number of folds in cross validation (at least 2)
# tree_num: The number of trees in random forest
# tree_depth: The maximum depth of trees in random forest
# ranking_limit: The number of genes you want to check from top to bottom in the ranking of importance
# seed: Random seed tha ensures our result is reproducible
# visualize: If you want to visualize the dataset before modeling or not

def modeling(k = 10, tree_num = 40, tree_depth = 5, ranking_limit = 0, seed = 123, visualize = False):
    
    X = np.vstack([train[train.MGS_LEVEL == i].drop(['r_id', 'MGS_LEVEL'], axis = 1) for i in range(1, 5)])
    y = np.concatenate([np.full(len(train[train.MGS_LEVEL == i]), i) for i in range(1, 5)])
    
    if (visualize):
        X_embedded = umap.UMAP().fit_transform(X)
        fig, ax = plt.subplots(figsize = (8, 6), dpi = 80, facecolor = 'w', edgecolor = 'k')
        pc1 = ax.scatter(X_embedded[y == 0, 0], X_embedded[y == 0, 1], label = 'No AMD / normal')
        pc2 = ax.scatter(X_embedded[y == 1, 0], X_embedded[y == 1, 1], label = 'Early stage AMD')
        pc3 = ax.scatter(X_embedded[y == 2, 0], X_embedded[y == 2, 1], label = 'Intermediate AMD')
        pc4 = ax.scatter(X_embedded[y == 3, 0], X_embedded[y == 3, 1], label = 'Advanced AMD')
        ax.set_title("Dimentional Reduction Based on Original Training Data")
        ax.legend()
        plt.show()
    
    # Modeling with original data
    
    # Random Forest
    
    rf = RandomForestClassifier(n_estimators = tree_num, max_depth = tree_depth, random_state = seed)
    
    acc = []
    features = train.drop(['r_id', 'MGS_LEVEL'], axis = 1).columns
    importance = [0 for i in range(len(features))]
    cv = KFold(n_splits = k, random_state = seed, shuffle = True)
    for train_index, test_index in cv.split(X):
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        acc.append(metrics.accuracy_score(y_test, y_pred))
        importance = [sum(i) for i in zip(importance, rf.feature_importances_)]     
    
    print("Random Forest (original): \n")
    print("Accuracy Score: " + str(sum(acc) / k) + "\n")
    if (ranking_limit != 0):
        print("Feature ranking: \n")
        importance = list(map(lambda x : x / k, importance))
        indices = np.argsort(importance)[::-1]
        for i in range(ranking_limit):
            print("%d. %s (%f)" % (i + 1, features[indices[i]], importance[indices[i]]))
        print()
    
    # Balanced Random Forest
    
    brf = BalancedRandomForestClassifier(n_estimators = tree_num, max_depth = tree_depth, random_state = seed)
    
    acc = []
    features = train.drop(['r_id', 'MGS_LEVEL'], axis = 1).columns
    importance = [0 for i in range(len(features))]
    cv = KFold(n_splits = k, random_state = seed, shuffle = True)
    for train_index, test_index in cv.split(X):
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
        brf.fit(X_train, y_train)
        y_pred = brf.predict(X_test)
        acc.append(metrics.accuracy_score(y_test, y_pred))
        importance = [sum(i) for i in zip(importance, brf.feature_importances_)]    
    
    print("Balanced Random Forest (original): \n")
    print("Accuracy Score: " + str(sum(acc) / k) + "\n")
    if (ranking_limit != 0):
        print("Feature ranking: \n")
        importance = list(map(lambda x : x / k, importance))
        indices = np.argsort(importance)[::-1]
        for i in range(ranking_limit):
            print("%d. %s (%f)" % (i + 1, features[indices[i]], importance[indices[i]]))
        print()
        
    # SMOTE
    
    sm = SMOTE(random_state = 123)
    
    # Random Forest
    
    rf = RandomForestClassifier(n_estimators = tree_num, max_depth = tree_depth, random_state = seed)
    
    acc = []
    features = train.drop(['r_id', 'MGS_LEVEL'], axis = 1).columns
    importance = [0 for i in range(len(features))]
    cv = KFold(n_splits = k, random_state = seed, shuffle = True)
    for train_index, test_index in cv.split(X):
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
        X_train, y_train = sm.fit_resample(X_train, y_train)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        acc.append(metrics.accuracy_score(y_test, y_pred))
        importance = [sum(i) for i in zip(importance, rf.feature_importances_)]    
    
    print("Random Forest (SMOTE): \n")
    print("Accuracy Score: " + str(sum(acc) / k) + "\n")
    if (ranking_limit != 0):
        print("Feature ranking: \n")
        importance = list(map(lambda x : x / k, importance))
        indices = np.argsort(importance)[::-1]
        for i in range(ranking_limit):
            print("%d. %s (%f)" % (i + 1, features[indices[i]], importance[indices[i]]))
        print()
    
    # Balanced Random Forest
    
    brf = BalancedRandomForestClassifier(n_estimators = tree_num, max_depth = tree_depth, random_state = seed)
    
    acc = []
    features = train.drop(['r_id', 'MGS_LEVEL'], axis = 1).columns
    importance = [0 for i in range(len(features))]
    cv = KFold(n_splits = k, random_state = seed, shuffle = True)
    for train_index, test_index in cv.split(X):
        X_train, X_test, y_train, y_test = X[train_index], X[test_index], y[train_index], y[test_index]
        X_train, y_train = sm.fit_resample(X_train, y_train)
        brf.fit(X_train, y_train)
        y_pred = brf.predict(X_test)
        acc.append(metrics.accuracy_score(y_test, y_pred))
        importance = [sum(i) for i in zip(importance, brf.feature_importances_)]  
    
    print("Balanced Random Forest (SMOTE): \n")
    print("Accuracy Score: " + str(sum(acc) / k) + "\n")
    if (ranking_limit != 0):
        print("Feature ranking: \n")
        importance = list(map(lambda x : x / k, importance))
        indices = np.argsort(importance)[::-1]
        for i in range(ranking_limit):
            print("%d. %s (%f)" % (i + 1, features[indices[i]], importance[indices[i]]))
        print()
    

In [4]:
modeling()

Random Forest (original): 

Accuracy Score: 0.5548387096774194

Balanced Random Forest (original): 

Accuracy Score: 0.4548387096774194

Random Forest (SMOTE): 

Accuracy Score: 0.5548387096774194

Balanced Random Forest (SMOTE): 

Accuracy Score: 0.5258064516129034

