In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll import scope
import random, os

Most helper functions were moved into a separate notebook (`Functions.ipynb`) so that they can be reused across analyses.

In [None]:
# Execute the shared helper notebook inside this kernel.
# aa_to_one_hot, calculatePermutationImportance and mergePermutationImportanceResults are
# called below but never defined in this notebook — presumably they come from here (TODO confirm).
%run Functions.ipynb

# Random Forest Analysis

## SML

### Data preparation

In [None]:
# Load the SML table, using its first column as the index.
# Later cells read the "seq" (sequence string) and "ER" (regression target) columns.
df = pd.read_table("SML_normalized_filtered.tsv.gz", index_col=0)
df

In [None]:
# Remove the glutamine at position 0 (presumably string index 3, given the -2/-1/1..9 position
# labels used for the features) and also drop the leading character at string index 0:
# only indices 1-2 and 4 onward are kept.
# NOTE(review): this overwrites df["seq"] in place, so re-running the cell trims the sequences again.
df['seq'] = df["seq"].apply(lambda seq: seq[1:3]+seq[4:])

In [None]:
# One label per (position, amino acid) pair, written as "<position>\n<residue>".
# Positions are the outer loop and residues the inner loop, so labels are grouped by position.
# There is no position "0" because that residue was removed from the sequences.
feature_names = [
    "\n".join((position, residue))
    for position in ("-2", "-1", "1", "2", "3", "4", "5", "6", "7", "8", "9")
    for residue in "ARNDCQEGHILKMFPSTWYV"
]

In [None]:
# Feature matrix: one aa_to_one_hot(...) encoding per trimmed sequence
# (aa_to_one_hot is not defined in this notebook).
X = np.array(list(map(aa_to_one_hot, df["seq"])))
# Regression target: the "ER" column.
Y = df["ER"].values

<br>

### Hyperparameter optimization

In [None]:
# Hyperopt search space; every entry is passed to RandomForestRegressor as a keyword argument.
# The order of the hp.choice options matters: the decoding cell further down maps the sampled
# choice index back to the option (0 -> first option, 1 -> second, ...).
# NOTE(review): 'auto' for max_features is no longer accepted by recent scikit-learn releases —
# verify against the installed version before upgrading.
space = dict(
    # 'auto', 'sqrt', or an explicit integer number of features between 3 and 18
    max_features=hp.choice(
        "max_features",
        ['auto', 'sqrt', scope.int(hp.quniform("max_features_explicit", 3, 18, 1))],
    ),
    # unlimited depth, or an explicit integer depth between 2 and the number of rows in df
    max_depth=hp.choice(
        'max_depth',
        [None, scope.int(hp.quniform('max_depth_explicit', 2, len(df), 1))],
    ),
    # sampled as a float between 0 and 1
    min_samples_split=hp.uniform('min_samples_split', 0, 1),
    min_samples_leaf=scope.int(hp.quniform('min_samples_leaf', 1, 50, 1)),
    # fixed (not tuned) settings
    n_estimators=200,
    n_jobs=os.cpu_count(),
)

In [None]:
def evaluate_space(space, n_splits=10, random_state=None):
    """Hyperopt objective: mean cross-validated RMSE of a random forest.

    Reads the notebook-level arrays ``X`` (features) and ``Y`` (targets).

    Parameters
    ----------
    space : dict
        Keyword arguments passed unchanged to ``RandomForestRegressor``.
    n_splits : int, default 10
        Number of shuffled K-fold splits.
    random_state : int or None, default None
        Seed for the fold shuffle. With ``None`` every call draws a different
        split, which is the original behaviour; pass an int to score all
        hyperparameter candidates on identical folds.

    Returns
    -------
    float
        Mean of the per-fold RMSE values measured on the held-out folds.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_rmse = []
    for train_index, test_index in kf.split(X):
        model = RandomForestRegressor(**space)
        model.fit(X[train_index], Y[train_index])

        # Take the square root ourselves rather than passing squared=False:
        # newer scikit-learn releases deprecated and then dropped that keyword,
        # while sqrt(MSE) gives the same number on every version.
        mse = mean_squared_error(Y[test_index], model.predict(X[test_index]))
        fold_rmse.append(np.sqrt(mse))

    return np.mean(fold_rmse)

In [None]:
# Minimise evaluate_space over `space` with the TPE algorithm for 1000 evaluations.
# `trials` records every evaluation and is inspected in the next cell;
# the value returned by fmin is shown as this cell's output.
trials = Trials()
fmin(fn=evaluate_space, space=space, algo=tpe.suggest, max_evals=1000, trials=trials)

In [None]:
# Find the trial with the lowest loss and keep a private copy of its sampled values.
min_loss = float("inf")  # no magic upper bound: any finite loss beats this
best_values = None

# `trials.trials` is the public list of trial records (no need for the private attribute).
for dt in trials.trials:
    try:
        if dt["result"]["loss"] < min_loss:
            min_loss = dt["result"]["loss"]
            # Copy the dict and its value lists: the following cells edit best_values in place,
            # and without a copy those edits would rewrite the record stored inside `trials`.
            best_values = {name: list(vals) for name, vals in dt["misc"]["vals"].items()}
    except (KeyError, TypeError):
        # Trial without a usable loss (missing key or None) — skip it.
        continue
print(min_loss)
print(best_values)

The `best_values` dict stores each value inside a list, so we have to unpack them first.

In [None]:
# Keep an independent snapshot of the raw values. dict(best_values) would only copy the outer
# dict and share the inner lists, so the in-place list edits in the next cell
# (e.g. best_values['max_features'][0] = 'auto') would silently change the backup too.
backup = {name: list(vals) for name, vals in best_values.items()}

In [None]:
# Translate hyperopt's raw record into RandomForestRegressor arguments.
# An empty "*_explicit" list means that branch of the hp.choice was not selected.

# max_depth: explicit depth if one was sampled, otherwise unlimited (None).
best_values['max_depth'] = (
    best_values['max_depth_explicit']
    if len(best_values['max_depth_explicit']) > 0
    else None
)

# max_features: explicit count if one was sampled, otherwise map the choice index to its
# string option (0 -> 'auto', 1 -> 'sqrt', matching the option order in `space`).
if len(best_values['max_features_explicit']) > 0:
    best_values['max_features'] = best_values['max_features_explicit']
elif best_values['max_features'][0] == 0:
    best_values['max_features'][0] = 'auto'
elif best_values['max_features'][0] == 1:
    best_values['max_features'][0] = 'sqrt'

# The helper entries are not RandomForestRegressor arguments, so remove them.
del best_values['max_features_explicit'], best_values['max_depth_explicit']


In [None]:
# Unwrap the single-element lists hyperopt stores each value in.
# Only real lists are unwrapped: indexing everything and swallowing errors also "unwrapped"
# strings, so running this cell a second time turned 'sqrt' into 's'.
for key in best_values:
    if isinstance(best_values[key], list) and best_values[key]:
        best_values[key] = best_values[key][0]
best_values

Although they are whole numbers, some values are stored as floats and must be converted to `int` explicitly; otherwise `RandomForestRegressor` will complain.

In [None]:
# Convert the whole-number hyperparameters from float to int.
for key in ('max_features', 'max_depth', 'min_samples_leaf'):
    try:
        best_values[key] = int(best_values[key])
    except (KeyError, TypeError, ValueError):
        # Expected cases only: key missing (KeyError), value is None (TypeError),
        # or value is a string option such as 'auto'/'sqrt' (ValueError).
        # Anything else is a real error and should surface.
        pass
best_values

In [None]:
# Display the cleaned hyperparameters that will be passed to RandomForestRegressor.
best_values

### Rerun using the tuned parameters

In [None]:
# The final models use 1000 trees; the search space fixed n_estimators at 200.
best_values['n_estimators'] = 1000
# Same parallelism setting as in the search space.
best_values['n_jobs'] = os.cpu_count()

In [None]:
# Refit the tuned forest on each of 10 shuffled folds, report the held-out RMSE,
# and collect one permutation-importance result per fold.
permutation_results = []
rmse_scores = []  # held-out RMSE of every fold (previously computed and thrown away)

kf = KFold(n_splits=10, shuffle=True)
for cross_validation, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print("Cross validation:",cross_validation)
    Xtrain, Ytrain = X[train_index], Y[train_index]
    Xtest, Ytest = X[test_index], Y[test_index]

    model = RandomForestRegressor(**best_values)
    model.fit(Xtrain, Ytrain)

    # sqrt(MSE) rather than squared=False: the keyword was deprecated and then dropped by
    # newer scikit-learn releases, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(Ytest, model.predict(Xtest)))
    rmse_scores.append(rmse)
    print("  held-out RMSE:", rmse)

    # NOTE(review): importance is computed on the full X, Y (training rows included), not on the
    # held-out Xtest, Ytest — confirm this is intended.
    permut_result = calculatePermutationImportance(model, X, Y, feature_names, savefigure=False, filename_suffix="_RF_hypertuned_SML", n_repeats=10)
    permutation_results.append(permut_result)

print("Mean held-out RMSE:", np.mean(rmse_scores))

In [None]:
# Combine the per-fold permutation-importance results for SML
# (helper not defined in this notebook; "_RF_SML" is passed as its third argument).
mergePermutationImportanceResults(permutation_results, feature_names, "_RF_SML")

<br><br>

### Data preparation (SRL)

In [None]:
# Load the SRL table, using its first column as the index.
# NOTE: this rebinds `df`, so the SML table loaded earlier is no longer reachable under that name.
df = pd.read_table("SRL_normalized_filtered.tsv.gz", index_col=0)
df

**SRL: X X Q X K L X X X W P X**

In [None]:
# For SRL, keep only the amino acid positions that were mutated: string indices 1, 2, 4, 7, 8, 9
# and the last character. The feature labels built below call them -2, -1, 1, 4, 5, 6 and 9.
# NOTE(review): this overwrites df["seq"] in place; re-running the cell slices the already
# shortened strings again.
df["seq"] = df["seq"].apply(lambda seq: seq[1:3]+seq[4]+seq[7:10]+seq[-1])

In [None]:
# One label per (position, amino acid) pair, written as "<position>\n<residue>".
# Only the seven mutated SRL positions are listed; positions are the outer loop,
# so labels are grouped by position.
feature_names = [
    "\n".join((position, residue))
    for position in ("-2", "-1", "1", "4", "5", "6", "9")
    for residue in "ARNDCQEGHILKMFPSTWYV"
]

In [None]:
# Feature matrix for SRL: one aa_to_one_hot(...) encoding per trimmed sequence
# (aa_to_one_hot is not defined in this notebook). Rebinds the X and Y used by evaluate_space.
X = np.array(list(map(aa_to_one_hot, df["seq"])))
# Regression target: the "ER" column.
Y = df["ER"].values

<br><br>

In [None]:
# Hyperopt search space for SRL; every entry is passed to RandomForestRegressor as a keyword argument.
# The order of the hp.choice options matters: the decoding cell further down maps the sampled
# choice index back to the option (0 -> first option, 1 -> second, ...).
# NOTE(review): 'auto' for max_features is no longer accepted by recent scikit-learn releases —
# verify against the installed version before upgrading.
space = dict(
    # 'auto', 'sqrt', or an explicit integer number of features between 3 and 18
    max_features=hp.choice(
        "max_features",
        ['auto', 'sqrt', scope.int(hp.quniform("max_features_explicit", 3, 18, 1))],
    ),
    # unlimited depth, or an explicit integer depth between 2 and the number of rows in df
    max_depth=hp.choice(
        'max_depth',
        [None, scope.int(hp.quniform('max_depth_explicit', 2, len(df), 1))],
    ),
    # sampled as a float between 0 and 1
    min_samples_split=hp.uniform('min_samples_split', 0, 1),
    min_samples_leaf=scope.int(hp.quniform('min_samples_leaf', 1, 50, 1)),
    # fixed (not tuned) settings
    n_estimators=200,
    n_jobs=os.cpu_count(),
)

In [None]:
def evaluate_space(space, n_splits=10, random_state=None):
    """Hyperopt objective: mean cross-validated RMSE of a random forest.

    Reads the notebook-level arrays ``X`` (features) and ``Y`` (targets), which
    hold the SRL data at this point in the notebook.

    Parameters
    ----------
    space : dict
        Keyword arguments passed unchanged to ``RandomForestRegressor``.
    n_splits : int, default 10
        Number of shuffled K-fold splits.
    random_state : int or None, default None
        Seed for the fold shuffle. With ``None`` every call draws a different
        split, which is the original behaviour; pass an int to score all
        hyperparameter candidates on identical folds.

    Returns
    -------
    float
        Mean of the per-fold RMSE values measured on the held-out folds.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    fold_rmse = []
    for train_index, test_index in kf.split(X):
        model = RandomForestRegressor(**space)
        model.fit(X[train_index], Y[train_index])

        # Take the square root ourselves rather than passing squared=False:
        # newer scikit-learn releases deprecated and then dropped that keyword,
        # while sqrt(MSE) gives the same number on every version.
        mse = mean_squared_error(Y[test_index], model.predict(X[test_index]))
        fold_rmse.append(np.sqrt(mse))

    return np.mean(fold_rmse)

In [None]:
# Minimise evaluate_space over `space` with the TPE algorithm for 1000 evaluations, now on the SRL data.
# A fresh Trials object replaces the one from the SML search;
# the value returned by fmin is shown as this cell's output.
trials = Trials()
fmin(fn=evaluate_space, space=space, algo=tpe.suggest, max_evals=1000, trials=trials)

In [None]:
# Find the trial with the lowest loss and keep a private copy of its sampled values.
min_loss = float("inf")  # no magic upper bound: any finite loss beats this
best_values = None

# `trials.trials` is the public list of trial records (no need for the private attribute).
for dt in trials.trials:
    try:
        if dt["result"]["loss"] < min_loss:
            min_loss = dt["result"]["loss"]
            # Copy the dict and its value lists: the following cells edit best_values in place,
            # and without a copy those edits would rewrite the record stored inside `trials`.
            best_values = {name: list(vals) for name, vals in dt["misc"]["vals"].items()}
    except (KeyError, TypeError):
        # Trial without a usable loss (missing key or None) — skip it.
        continue
print(min_loss)
print(best_values)

In [None]:
# Translate hyperopt's raw record into RandomForestRegressor arguments.
# An empty "*_explicit" list means that branch of the hp.choice was not selected.

# max_depth: explicit depth if one was sampled, otherwise unlimited (None).
best_values['max_depth'] = (
    best_values['max_depth_explicit']
    if len(best_values['max_depth_explicit']) > 0
    else None
)

# max_features: explicit count if one was sampled, otherwise map the choice index to its
# string option (0 -> 'auto', 1 -> 'sqrt', matching the option order in `space`).
if len(best_values['max_features_explicit']) > 0:
    best_values['max_features'] = best_values['max_features_explicit']
elif best_values['max_features'][0] == 0:
    best_values['max_features'][0] = 'auto'
elif best_values['max_features'][0] == 1:
    best_values['max_features'][0] = 'sqrt'

# The helper entries are not RandomForestRegressor arguments, so remove them.
del best_values['max_features_explicit'], best_values['max_depth_explicit']


In [None]:
# Unwrap the single-element lists hyperopt stores each value in.
# Only real lists are unwrapped: indexing everything and swallowing errors also "unwrapped"
# strings, so running this cell a second time turned 'sqrt' into 's'.
for key in best_values:
    if isinstance(best_values[key], list) and best_values[key]:
        best_values[key] = best_values[key][0]
best_values

In [None]:
# Convert the whole-number hyperparameters from float to int.
for key in ('max_features', 'max_depth', 'min_samples_leaf'):
    try:
        best_values[key] = int(best_values[key])
    except (KeyError, TypeError, ValueError):
        # Expected cases only: key missing (KeyError), value is None (TypeError),
        # or value is a string option such as 'auto'/'sqrt' (ValueError).
        # Anything else is a real error and should surface.
        pass
best_values

### Rerun using the tuned parameters

In [None]:
# The final models use 1000 trees; the search space fixed n_estimators at 200.
best_values['n_estimators'] = 1000
# Same parallelism setting as in the search space.
best_values['n_jobs'] = os.cpu_count()

In [None]:
# Refit the tuned forest on each of 10 shuffled folds, report the held-out RMSE,
# and collect one permutation-importance result per fold.
permutation_results = []
rmse_scores = []  # held-out RMSE of every fold (previously computed and thrown away)

kf = KFold(n_splits=10, shuffle=True)
for cross_validation, (train_index, test_index) in enumerate(kf.split(X), start=1):
    print("Cross validation:",cross_validation)
    Xtrain, Ytrain = X[train_index], Y[train_index]
    Xtest, Ytest = X[test_index], Y[test_index]

    model = RandomForestRegressor(**best_values)
    model.fit(Xtrain, Ytrain)

    # sqrt(MSE) rather than squared=False: the keyword was deprecated and then dropped by
    # newer scikit-learn releases, while this form works on every version.
    rmse = np.sqrt(mean_squared_error(Ytest, model.predict(Xtest)))
    rmse_scores.append(rmse)
    print("  held-out RMSE:", rmse)

    # NOTE(review): importance is computed on the full X, Y (training rows included), not on the
    # held-out Xtest, Ytest — confirm this is intended.
    permut_result = calculatePermutationImportance(model, X, Y, feature_names, savefigure=False, filename_suffix="_RF_hypertuned_SRL", n_repeats=10)
    permutation_results.append(permut_result)

print("Mean held-out RMSE:", np.mean(rmse_scores))

In [None]:
# Combine the per-fold permutation-importance results for SRL
# (helper not defined in this notebook; "_RF_SRL" is passed as its third argument).
mergePermutationImportanceResults(permutation_results, feature_names, "_RF_SRL")