# Imports

In [61]:
import pyreadr
import pandas as pd
import numpy as np
import pickle
import os
import time
from pipelinehelper import PipelineHelper
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import explained_variance_score

# Load data

In [42]:
# Read the combined count-overlap table from its RDS file. The object
# returned by pyreadr is indexed with the key None to get the table itself.
# The table is then transposed -- presumably so its rows line up with the
# rows of mutations_df (the same row mask is applied to both later); confirm.
scATAC_df = pyreadr.read_r(
    "count_overlap_data/processed_count_overlaps"
    "/count_filter_100_combined_count_overlaps.rds"
)[None].T

# Mutation counts; the first CSV column becomes the row index.
mutations_df = pd.read_csv("processed_data/mut_count_data.csv", index_col=0)

# Select data

In [43]:
# Boolean row mask: True where every mutation column holds a value.
idx_select = pd.notnull(mutations_df).all(axis=1)

# Apply the same mask to both tables so they keep describing the same rows.
scATAC_df = scATAC_df[idx_select]
mutations_df = mutations_df[idx_select]

In [44]:
# Position of the first mutation column whose name contains 'Melanoma'
# (raises IndexError if no column matches).
melanoma_mut_idx = [
    position
    for position, column_name in enumerate(mutations_df.columns.values)
    if 'Melanoma' in column_name
][0]

In [45]:
# Regression target: the mutation column located at melanoma_mut_idx,
# selected by position across all rows.
melanoma_mutations = mutations_df.iloc[:, melanoma_mut_idx]

# Split Train/Test

In [46]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    scATAC_df,
    melanoma_mutations,
    test_size=0.10,
    random_state=42,
)

# Cross Validate

In [65]:
# Path of the pickle file used to cache the fitted grid-search object.
filename = "models/grid_search_results.pkl"

In [66]:
# Single-step pipeline. The 'regressor' step is a PipelineHelper holding the
# candidate models; currently the only candidate is a random forest ('rf')
# with a fixed random_state.
pipe = Pipeline([
    ('regressor', PipelineHelper([
        ('rf', RandomForestRegressor(random_state=42)),
    ])),
])

# Search space for GridSearchCV. PipelineHelper.generate presumably expands
# the '<model>__<param>' entries into values for its 'selected_model'
# parameter -- confirm against the pipelinehelper docs. Only the forest size
# (10, 100 or 1000 trees) is varied here.
params = {
    'regressor__selected_model': pipe.named_steps['regressor'].generate({
        'rf__n_estimators':[10, 100, 1000],
    })
}

In [70]:
def grid_search(X_train, y_train, pipe, params, num_k_folds):
    """Fit a K-fold cross-validated grid search and report how long it took.

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed straight to ``fit``.
    pipe
        Estimator (here a pipeline) to tune.
    params
        Parameter grid handed to ``GridSearchCV``.
    num_k_folds : int
        Number of folds for the ``KFold`` splitter.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    began_at = time.time()
    searcher = GridSearchCV(
        estimator=pipe,
        param_grid=params,
        scoring="explained_variance",
        cv=KFold(n_splits=num_k_folds),
        n_jobs=-1,      # run fits in parallel on all available cores
        verbose=100,
    )
    searcher.fit(X_train, y_train)
    # Wall-clock duration of the whole search, printed for the reader.
    print(f"--- {time.time() - began_at} seconds ---")
    return searcher

In [71]:
# Run the (slow) grid search only when no cached result exists on disk.
if not os.path.exists(filename):
    # Make sure the target directory exists *before* the long search, so a
    # missing folder cannot throw the result away at dump time.
    os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
    grid_search_results = grid_search(X_train, y_train, pipe, params, 10)
    # The context manager flushes and closes the file even if pickling fails.
    with open(filename, 'wb') as cache_file:
        pickle.dump(grid_search_results, cache_file)

Fitting 10 folds for each of 3 candidates, totalling 30 fits
--- 541.7950336933136 seconds ---


In [73]:
# Load the cached grid-search object. The context manager closes the file
# handle instead of leaking it.
# NOTE(security): unpickling can execute arbitrary code -- only load cache
# files that this notebook produced itself.
with open(filename, 'rb') as cache_file:
    grid_search_results = pickle.load(cache_file)

# Feature Selection

In [74]:
def get_top_n_features(clf, n, features):
    """Return the ``n`` feature names with the largest importances.

    Parameters
    ----------
    clf
        Fitted model exposing ``feature_importances_``.
    n : int
        Number of names to keep.
    features
        Feature names in the same order as the importances; must support
        integer-array indexing (e.g. a NumPy array or a pandas Index).

    Returns
    -------
    Same container type as ``features``, ordered from most to least
    important and truncated to ``n`` entries.
    """
    # argsort is ascending, so reverse it to rank from most to least important.
    ranked_positions = np.argsort(clf.feature_importances_)[::-1]
    return features[ranked_positions[:n]]

In [116]:
def print_top_features(top_features):
    """Print a header followed by a 1-based numbered list of feature names.

    Parameters
    ----------
    top_features
        Sized iterable of feature names, already in the order to display.
    """
    n = len(top_features)
    print(f"Top {n} features")
    # Iterate over the argument itself, not a same-purpose global variable,
    # so the listing always matches the header and works on a fresh kernel.
    for rank, feature in enumerate(top_features, start=1):
        print(f"{rank}. {feature}")

In [124]:
# Best cross-validation score found by the grid search.
print(f"Best Score: {grid_search_results.best_score_}\n")

# The winning regressor is stored as the 'regressor__selected_model'
# parameter of the best pipeline.
best_model = (
    grid_search_results
    .best_estimator_
    .get_params()['regressor__selected_model']
)

# Names of the n input columns that this model ranks as most important.
n = 20
top_n_feats = get_top_n_features(best_model, n, scATAC_df.columns.values)
# print_top_features(top_n_feats)  # optional: list them

Best Score: 0.35446262348094126



In [132]:
def backward_eliminate_features(X_train, y_train, starting_clf, starting_n,
                                pipe, params, num_k_folds):
    """Greedy backward feature elimination driven by model importances.

    Starts from the ``starting_n`` most important columns according to
    ``starting_clf``. It then repeatedly re-runs the grid search on the
    current column subset and drops the least important column of the
    winning model, until a single column remains. Each intermediate feature
    list is printed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. Assumes ``starting_clf`` was fitted on these
        columns in this order -- verify at the call site.
    y_train
        Training target.
    starting_clf
        Fitted model exposing ``feature_importances_``.
    starting_n : int
        Number of columns to start the elimination from.
    pipe, params, num_k_folds
        Forwarded unchanged to ``grid_search`` at every step.

    Returns
    -------
    list of dict
        One entry per grid search that was run, in order, with keys
        ``"n_features"``, ``"features"`` (the columns the search was fitted
        on) and ``"score"`` (the search's ``best_score_``). The final
        single-column subset is printed but never fitted, so it has no entry.
        The list is empty when fewer than two columns are selected initially.
    """
    top_n_feats = get_top_n_features(starting_clf, starting_n, X_train.columns.values)
    X_train = X_train.loc[:, top_n_feats]
    print_top_features(top_n_feats)

    # Keep the score of every evaluated subset; without it the elimination
    # produces nothing from which a subset size could be chosen.
    history = []
    while len(top_n_feats) > 1:
        grid_search_results = grid_search(X_train, y_train, pipe, params, num_k_folds)
        history.append({
            "n_features": len(X_train.columns),
            "features": list(X_train.columns),
            "score": grid_search_results.best_score_,
        })
        best_model = grid_search_results.best_estimator_.get_params()['regressor__selected_model']
        # Keep all but the least important column of the refit model.
        top_n_feats = get_top_n_features(best_model, len(X_train.columns) - 1, X_train.columns)
        X_train = X_train.loc[:, top_n_feats]
        print_top_features(top_n_feats)
    return history

In [133]:
# Backward elimination starting from the 20 features that best_model ranks
# highest, re-tuning with 10-fold cross-validation at every step.
backward_eliminate_features(X_train, y_train, best_model, 20, pipe, params, 10)

Top 20 features
1. Artery Aorta Pericyte (General) 3
2. Colon Sigmoid Fibroblast (Gastrointestinal)
3. Skin Sun Exposed Melanocyte
4. Adipose Omentum Adipocyte
5. Artery Aorta Smooth Muscle (General)
6. Colon Sigmoid Smooth Muscle (Colon) 2
7. Adrenal Gland Fibroblast (Liver Adrenal)
8. Colon Sigmoid Fibroblast (General)
9. Skin Sun Exposed Fibroblast (Epithelial)
10. Muscle Type I Skeletal Myocyte
11. Esophagus Mucosa Smooth Muscle (Esophageal Mucosal)
12. Skin Melanocyte
13. Skin Fibroblast (Epithelial)
14. Esophagus Ge Junction Fibroblast (General)
15. Nerve Tibial Smooth Muscle (General)
16. Muscle Type II Skeletal Myocyte
17. Mammary Tissue Fibroblast (Epithelial)
18. Ovary Luteal Cell (Ovarian)
19. Small Intestine Smooth Muscle (General)
20. Heart Lv Ventricular Cardiomyocyte
Fitting 10 folds for each of 3 candidates, totalling 30 fits
--- 27.547184467315674 seconds ---
Top 19 features
1. Artery Aorta Pericyte (General) 3
2. Colon Sigmoid Fibroblast (Gastrointestinal)
3. Skin Sun

KeyboardInterrupt: 

# Test Set Performance

In [None]:
# Score the selected model on the held-out test split.
test_preds = best_model.predict(X_test)
test_score = explained_variance_score(y_test, test_preds)
print(f"Test set performance: {test_score}")