# scRNAseq Random Forest Model Evaluation - SCP1361

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import scanpy as sc
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report as class_rep
import time

In [2]:
# Configuration: dataset label and output directories used by the cells below
dataset_name = 'SCP1361' # label for the dataset; used as a prefix in the saved file names
dictionary_dir = 'RFSplitDicts' # dir where we save the split dictionaries (pickled evaluation metrics)
model_dir = 'Model_Splits' # dir where we save the models (one per split)

In [3]:
# Make sure the necessary output directories are available before anything is saved.
# Each entry pairs a directory with the purpose reported when it has to be created.
for out_dir, purpose in (
    (dictionary_dir, "saving split dictionaries"),
    (model_dir, "saving the trained models for the splits"),
):
    if os.path.exists(out_dir):
        print('Directory already exists!')
    else:
        os.makedirs(out_dir)
        print(f"Directory {out_dir} created for {purpose}")

Directory already exists!
Directory already exists!


### Load the dataset and add split columns

In [4]:
# Read the AnnData object from the h5ad file (path is relative to the working directory)
adata = sc.read_h5ad('scp1361_int_minproc_5k_split.h5ad')

In [5]:
# Metadata table that is merged into adata.obs on the 'barcodes' column in the next cell
# (presumably carries the Split_1..Split_5 train/test assignments -- confirm against the CSV)
metadata = pd.read_csv('../DatasetSplits/Metadata_Splits/SCP1361_metadata_splits.csv')

In [6]:
# Build the merged table in a local variable and attach it to `adata` only once.
# Assigning the freshly merged frame straight to `adata.obs` hands AnnData the
# integer index produced by `merge`, which triggers the
# "AnnData expects .obs.index to contain strings" warning.
obs_merged = adata.obs.merge(metadata, on='barcodes', copy=False, suffixes=('', '_drop'))

# Columns present in both tables come back from the metadata side with a '_drop' suffix; discard them
obs_merged = obs_merged.loc[:, ~obs_merged.columns.str.endswith('_drop')]

# The default inner join drops (or duplicates) rows when barcodes do not match one-to-one;
# fail with an explicit message instead of a shape error raised inside AnnData.
if len(obs_merged) != adata.n_obs:
    raise ValueError(
        f"Merging on 'barcodes' produced {len(obs_merged)} rows but adata has {adata.n_obs} observations; "
        "check the metadata for missing or duplicated barcodes"
    )

# Use the labels stored in the 'index' column as the obs index, then attach the table
obs_merged.index = obs_merged['index']
adata.obs = obs_merged

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


### Run RandomForest on each split

In [7]:
# Names of the obs columns that hold the train/test assignment for each split (Split_1 ... Split_5)
splits = ['Split_{}'.format(split_no) for split_no in range(1, 6)]

In [8]:
def _as_dense_array(X):
    """Return X as a plain numpy.ndarray, densifying it first if it is sparse."""
    # `.toarray()` yields an ndarray, whereas `.todense()` yields numpy.matrix,
    # which recent scikit-learn versions reject as estimator input.
    if hasattr(X, 'toarray'):
        X = X.toarray()
    return np.asarray(X)


def runSplitRF(adata, split, cluster_col='celltypes', random_state=2022, n_jobs=4):
    """
    Train and evaluate a RandomForestClassifier on one train/test split.

    Rows are partitioned by the value of ``adata.obs[split]`` ('train' or
    'test'). Features are read from ``adata.raw.X`` and labels from
    ``adata.obs[cluster_col]``. Progress, metrics and the per-class
    classification report are printed.

    Parameters
    ----------
    adata : AnnData
        Annotated data object; must have ``.raw`` set and contain the
        `split` and `cluster_col` columns in ``.obs``.
    split : str
        Name of the obs column holding the 'train' / 'test' assignment.
    cluster_col : str, default 'celltypes'
        Name of the obs column holding the class labels.
    random_state : int, default 2022
        Seed passed to RandomForestClassifier.
    n_jobs : int, default 4
        Number of parallel jobs passed to RandomForestClassifier.

    Returns
    -------
    tuple
        ``(clf, total_runtime, acc, macro_score, w_score)``: the fitted
        classifier, the fit time in seconds, the accuracy, and the macro-
        and weighted-average F1 scores, all computed on the 'test' rows.

    Raises
    ------
    ValueError
        If the split column selects no 'train' rows or no 'test' rows.
    """

    print(f"Running a Random Forest Classifier on split: {split}")

    # separating the data based on the split column
    adata_train = adata[adata.obs[split] == 'train']
    adata_test = adata[adata.obs[split] == 'test']

    # fail early with a readable message instead of an error from inside scikit-learn
    if adata_train.n_obs == 0 or adata_test.n_obs == 0:
        raise ValueError(
            f"Split column '{split}' selects {adata_train.n_obs} 'train' and "
            f"{adata_test.n_obs} 'test' rows; both must be non-empty"
        )

    # converting data from sparse to dense ndarrays
    X_train = _as_dense_array(adata_train.raw.X)
    X_test = _as_dense_array(adata_test.raw.X)

    y_train = adata_train.obs[cluster_col].tolist()
    y_test = adata_test.obs[cluster_col].tolist()

    # Running the model and calculating runtime (only the fit is timed);
    # perf_counter is a monotonic clock intended for measuring intervals
    start_time = time.perf_counter()
    clf = RandomForestClassifier(random_state=random_state, n_jobs=n_jobs)
    clf.fit(X_train, y_train)
    total_runtime = time.perf_counter() - start_time
    print(f"RF Runtime: {total_runtime:2.2f}")

    # generating the predictions
    y_pred = clf.predict(X_test)

    # model evaluation
    acc = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {acc:4.4f}')

    # F1 averaged over classes: unweighted (macro) and support-weighted.
    # NOTE: the printed label says "validation set", but the scores are
    # computed on the rows marked 'test' in the split column.
    macro_score = f1_score(y_test, y_pred, average='macro')
    w_score = f1_score(y_test, y_pred, average='weighted')
    print(f'    -> Non-Weighted F1 Score on validation set: {macro_score:4.4f} ' )
    print(f'    -> Weighted F1 Score on validation set: {w_score:4.4f} ' )
    print(class_rep(y_test, y_pred))

    return clf, total_runtime, acc, macro_score, w_score

### Running

In [9]:
# Maps each split name to its dict of metrics; filled in by the loop in the next cell
model_eval = {}

In [10]:
# Train and evaluate one model per split, persist it, and record its metrics
for i in splits:

    print(f"Working on Split: {i}")

    clf, total_runtime, acc, macro_score, w_score = runSplitRF(adata, i, cluster_col='celltypes')

    # persist the fitted classifier for this split
    joblib.dump(clf, f"./{model_dir}/{dataset_name}_{i}_RF.pkl")

    # each metric is stored as a one-element list under its own key
    split_dict = {
        'Runtime': [total_runtime],
        'Accuracy': [acc],
        'Macro_Score': [macro_score],
        'Weighted_Score': [w_score],
    }

    model_eval[i] = split_dict
    
    

Working on Split: Split_1
Running a Random Forest Classifier on split: Split_1




RF Runtime: 4.40
Accuracy: 0.9333
    -> Non-Weighted F1 Score on validation set: 0.9280 
    -> Weighted F1 Score on validation set: 0.9316 
                    precision    recall  f1-score   support

            B cell       0.98      0.98      0.98       572
            T cell       0.96      0.96      0.96       254
    dendritic cell       0.92      0.96      0.94       195
  endothelial cell       0.98      0.94      0.96       181
        fibroblast       0.87      0.98      0.92      1486
        macrophage       0.98      0.91      0.94       552
  mesothelial cell       0.92      0.70      0.80       487
       neural cell       0.98      0.94      0.96        52
     pericyte cell       0.99      0.72      0.83       140
smooth muscle cell       0.99      0.98      0.98       882

          accuracy                           0.93      4801
         macro avg       0.96      0.91      0.93      4801
      weighted avg       0.94      0.93      0.93      4801

Working on Split: Split_2
Running a Random Forest Classifier on split: Split_2



RF Runtime: 4.31
Accuracy: 0.9402
    -> Non-Weighted F1 Score on validation set: 0.9299 
    -> Weighted F1 Score on validation set: 0.9389 
                    precision    recall  f1-score   support

            B cell       0.98      0.97      0.97       593
            T cell       0.96      0.96      0.96       262
    dendritic cell       0.94      0.96      0.95       214
  endothelial cell       0.99      0.95      0.97       167
        fibroblast       0.88      0.99      0.93      1505
        macrophage       0.98      0.94      0.96       563
  mesothelial cell       0.94      0.73      0.82       466
       neural cell       0.97      0.84      0.90        43
     pericyte cell       1.00      0.75      0.86       134
smooth muscle cell       0.99      0.98      0.98       854

          accuracy                           0.94      4801
         macro avg       0.96      0.91      0.93      4801
      weighted avg       0.94      0.94      0.94      4801

Working on Split: Split_3
Running a Random Forest Classifier on split: Split_3



RF Runtime: 4.33
Accuracy: 0.9333
    -> Non-Weighted F1 Score on validation set: 0.9229 
    -> Weighted F1 Score on validation set: 0.9315 
                    precision    recall  f1-score   support

            B cell       0.97      0.98      0.97       597
            T cell       0.95      0.96      0.96       258
    dendritic cell       0.92      0.92      0.92       193
  endothelial cell       0.99      0.96      0.97       162
        fibroblast       0.87      0.98      0.92      1465
        macrophage       0.98      0.92      0.95       562
  mesothelial cell       0.94      0.70      0.80       464
       neural cell       1.00      0.91      0.95        43
     pericyte cell       0.99      0.68      0.80       130
smooth muscle cell       0.98      0.98      0.98       927

          accuracy                           0.93      4801
         macro avg       0.96      0.90      0.92      4801
      weighted avg       0.94      0.93      0.93      4801

Working on Split: Split_4
Running a Random Forest Classifier on split: Split_4



RF Runtime: 4.29
Accuracy: 0.9311
    -> Non-Weighted F1 Score on validation set: 0.9137 
    -> Weighted F1 Score on validation set: 0.9290 
                    precision    recall  f1-score   support

            B cell       0.98      0.98      0.98       584
            T cell       0.99      0.98      0.98       262
    dendritic cell       0.94      0.95      0.95       171
  endothelial cell       0.97      0.91      0.94       148
        fibroblast       0.86      0.98      0.92      1537
        macrophage       0.97      0.93      0.95       544
  mesothelial cell       0.92      0.71      0.80       479
       neural cell       1.00      0.76      0.86        41
     pericyte cell       1.00      0.64      0.78       148
smooth muscle cell       0.98      0.97      0.98       887

          accuracy                           0.93      4801
         macro avg       0.96      0.88      0.91      4801
      weighted avg       0.94      0.93      0.93      4801

Working on Split: Split_5
Running a Random Forest Classifier on split: Split_5



RF Runtime: 4.29
Accuracy: 0.9361
    -> Non-Weighted F1 Score on validation set: 0.9259 
    -> Weighted F1 Score on validation set: 0.9342 
                    precision    recall  f1-score   support

            B cell       0.98      0.99      0.98       611
            T cell       0.97      0.98      0.97       273
    dendritic cell       0.93      0.92      0.93       195
  endothelial cell       0.98      0.94      0.96       190
        fibroblast       0.87      0.99      0.92      1479
        macrophage       0.98      0.95      0.96       519
  mesothelial cell       0.95      0.69      0.80       460
       neural cell       1.00      0.84      0.91        43
     pericyte cell       1.00      0.73      0.85       153
smooth muscle cell       0.98      0.97      0.97       878

          accuracy                           0.94      4801
         macro avg       0.96      0.90      0.93      4801
      weighted avg       0.94      0.94      0.93      4801





In [11]:
model_eval

{'Split_1': {'Runtime': [4.403515815734863],
  'Accuracy': [0.9333472193293064],
  'Macro_Score': [0.9279795658573183],
  'Weighted_Score': [0.9316438646692144]},
 'Split_2': {'Runtime': [4.313845872879028],
  'Accuracy': [0.9402207873359717],
  'Macro_Score': [0.9298528937637307],
  'Weighted_Score': [0.9389291290143946]},
 'Split_3': {'Runtime': [4.331141710281372],
  'Accuracy': [0.9333472193293064],
  'Macro_Score': [0.9228658332699078],
  'Weighted_Score': [0.9314665347510847]},
 'Split_4': {'Runtime': [4.2901411056518555],
  'Accuracy': [0.9310560299937513],
  'Macro_Score': [0.9137441195313946],
  'Weighted_Score': [0.9289988777511095]},
 'Split_5': {'Runtime': [4.288583040237427],
  'Accuracy': [0.9360549885440533],
  'Macro_Score': [0.925885286338105],
  'Weighted_Score': [0.9342291782960912]}}

### Save the dictionary with metrics

In [12]:
# save the dictionary
def Pickler(data, filename):
    """
    Serialize `data` to `filename` with pickle.

    Parameters
    ----------
    data : object
        Any picklable object.
    filename : str or path-like
        Destination file; created if missing, overwritten if present.

    Returns
    -------
    None
    """
    # The context manager closes the file even if pickle.dump raises.
    # Write-only binary mode is sufficient: nothing is read back here.
    with open(filename, 'wb') as outfile:
        pickle.dump(data, outfile)

# Persist the per-split metrics to <dictionary_dir>/<dataset_name>_RF_EvalDict.pickle
Pickler(model_eval, filename=f"{dictionary_dir}/{dataset_name}_RF_EvalDict.pickle")

In [13]:
# to load

def Unpickler(filename):
    """
    Load and return the object pickled in `filename`.

    Parameters
    ----------
    filename : str or path-like
        Path to an existing pickle file.

    Returns
    -------
    object
        The unpickled object.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use this on files
    # produced by this project, never on untrusted input.
    # Read-only mode lets this work on files without write permission, and the
    # context manager closes the file even if unpickling fails.
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

# Read the saved metrics back as a round-trip check of the pickle file written above
test_loaddict = Unpickler(filename=f"{dictionary_dir}/{dataset_name}_RF_EvalDict.pickle")

In [14]:
test_loaddict

{'Split_1': {'Runtime': [4.403515815734863],
  'Accuracy': [0.9333472193293064],
  'Macro_Score': [0.9279795658573183],
  'Weighted_Score': [0.9316438646692144]},
 'Split_2': {'Runtime': [4.313845872879028],
  'Accuracy': [0.9402207873359717],
  'Macro_Score': [0.9298528937637307],
  'Weighted_Score': [0.9389291290143946]},
 'Split_3': {'Runtime': [4.331141710281372],
  'Accuracy': [0.9333472193293064],
  'Macro_Score': [0.9228658332699078],
  'Weighted_Score': [0.9314665347510847]},
 'Split_4': {'Runtime': [4.2901411056518555],
  'Accuracy': [0.9310560299937513],
  'Macro_Score': [0.9137441195313946],
  'Weighted_Score': [0.9289988777511095]},
 'Split_5': {'Runtime': [4.288583040237427],
  'Accuracy': [0.9360549885440533],
  'Macro_Score': [0.925885286338105],
  'Weighted_Score': [0.9342291782960912]}}