# scRNAseq Random Forest Model Evaluation - GSE_ImmuneCSF

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import scanpy as sc
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report as class_rep
import time

In [2]:
# Prep
dataset_name = 'GSE_ImmuneCSF' # label for the dataset
dictionary_dir = 'RFSplitDicts' # dir where we save the split dictionaries
model_dir = 'Model_Splits' # dir where we save the models

In [3]:
# Make sure necessary directories are avalible

# dictionary dir
if not os.path.exists(dictionary_dir):
    os.makedirs(dictionary_dir)
    print(f"Directory {dictionary_dir} created for saving split dictionaries")
    
else:
    print('Directory already exists!')


# model dir
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
    print(f"Directory {model_dir} created for saving the trained models for the splits")
    
else:
    print('Directory already exists!')

Directory already exists!
Directory already exists!


Load the dataset and add split columns

In [4]:
adata = sc.read_h5ad('GSE_ImmuneCSF_qc_hvg_anno_5k_raw_train_split.h5ad')

In [5]:
metadata = pd.read_csv('../DatasetSplits/Metadata_Splits/GSE_ImmuneCSF_metadata_splits.csv')

In [8]:
adata.obs = adata.obs.merge(metadata, left_on='barcodes', right_on='barcodes', copy=False, suffixes=('', '_drop'))
adata.obs = adata.obs[adata.obs.columns[~adata.obs.columns.str.endswith('_drop')]]
adata.obs.index = adata.obs['barcodes']

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


### Run RandomForest on each split

In [9]:
splits = [f'Split_{i}' for i in range(1,6)]

In [15]:
def runSplitRF(adata, split, cluster_col='celltypes'):
    """
    Running RandomForest on multiple splits of the dataset
    """
    
    print(f"Running a Random Forest Classifier on split: {split}")
    
    # separating the data based on the split column
    adata_train = adata[adata.obs[split] == 'train']
    adata_test = adata[adata.obs[split] == 'test']
    
    # converting data from sparse to dense
    X_train = adata_train.raw.X.todense()
    X_test = adata_test.raw.X.todense()
    
    y_train = adata_train.obs[cluster_col].tolist()
    y_test = adata_test.obs[cluster_col].tolist()
    
    # Running the model and calculating runtime
    start_time = time.time()
    clf = RandomForestClassifier(random_state=2022, n_jobs=4)
    clf.fit(X_train, y_train)
    end_time = time.time()
    total_runtime = (end_time - start_time)
    print(f"RF Runtime: {total_runtime:2.2f}")
    
    # generating the predictions
    y_pred = clf.predict(X_test)
    
    # model evaluation
    acc = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {acc:4.4f}')
    
    # calculating the precision/recall based multi-label F1 score
    macro_score = f1_score(y_test, y_pred, average = 'macro' )
    w_score = f1_score(y_test, y_pred,average = 'weighted' )
    print(f'    -> Non-Weighted F1 Score on validation set: {macro_score:4.4f} ' )
    print(f'    -> Weighted F1 Score on validation set: {w_score:4.4f} ' )
    print(class_rep(y_test,y_pred))
    
    
    return clf, total_runtime, acc, macro_score, w_score

### Running

In [16]:
model_eval = {}

In [17]:
for i in splits:
    
    print(f"Working on Split: {i}")
    
    # create a dict for storing current split information
    split_dict = {'Runtime':[], 
                  'Accuracy':[], 
                  'Macro_Score':[], 
                  'Weighted_Score':[]}
    clf, total_runtime, acc, macro_score, w_score = runSplitRF(adata, 
                                                               i, 
                                                               cluster_col='general_celltypes')
    
    joblib.dump(clf, f"./{model_dir}/{dataset_name}_{i}_RF.pkl")
    
    split_dict['Runtime'].append(total_runtime)
    split_dict['Accuracy'].append(acc)
    split_dict['Macro_Score'].append(macro_score)
    split_dict['Weighted_Score'].append(w_score)
    
    model_eval[i]=split_dict
    
    

Working on Split: Split_1
Running a Random Forest Classifier on split: Split_1




RF Runtime: 27.30




Accuracy: 0.9017
    -> Non-Weighted F1 Score on validation set: 0.8279 
    -> Weighted F1 Score on validation set: 0.8925 
              precision    recall  f1-score   support

     B cells       0.97      0.93      0.95       200
         CD4       0.89      0.98      0.93      8092
         CD8       0.88      0.83      0.86      2933
          NK       0.96      0.82      0.89       809
        Treg       0.97      0.16      0.27       428
     cycling       0.98      0.60      0.75       131
granulocytes       0.96      0.74      0.83       265
         mDC       0.95      0.84      0.89       466
   monocytes       0.92      0.97      0.94      1537
         pDC       1.00      0.93      0.96       140
      plasma       0.89      0.79      0.84       107

    accuracy                           0.90     15108
   macro avg       0.94      0.78      0.83     15108
weighted avg       0.90      0.90      0.89     15108

Working on Split: Split_2
Running a Random Forest Classifier o



RF Runtime: 27.27




Accuracy: 0.9045
    -> Non-Weighted F1 Score on validation set: 0.8406 
    -> Weighted F1 Score on validation set: 0.8949 
              precision    recall  f1-score   support

     B cells       0.99      0.96      0.97       203
         CD4       0.89      0.98      0.93      8038
         CD8       0.90      0.83      0.87      3016
          NK       0.96      0.85      0.90       810
        Treg       1.00      0.15      0.26       448
     cycling       0.97      0.66      0.78       132
granulocytes       0.96      0.76      0.85       266
         mDC       0.94      0.84      0.89       424
   monocytes       0.92      0.97      0.95      1505
         pDC       1.00      0.98      0.99       142
      plasma       0.93      0.81      0.86       124

    accuracy                           0.90     15108
   macro avg       0.95      0.80      0.84     15108
weighted avg       0.91      0.90      0.89     15108

Working on Split: Split_3
Running a Random Forest Classifier o



RF Runtime: 27.21




Accuracy: 0.9022
    -> Non-Weighted F1 Score on validation set: 0.8387 
    -> Weighted F1 Score on validation set: 0.8926 
              precision    recall  f1-score   support

     B cells       1.00      0.94      0.97       199
         CD4       0.89      0.98      0.93      8045
         CD8       0.89      0.83      0.86      2996
          NK       0.96      0.84      0.90       782
        Treg       0.99      0.15      0.26       448
     cycling       0.94      0.72      0.81       144
granulocytes       0.95      0.75      0.84       270
         mDC       0.95      0.83      0.89       438
   monocytes       0.92      0.96      0.94      1558
         pDC       1.00      0.95      0.97       133
      plasma       0.94      0.80      0.86        95

    accuracy                           0.90     15108
   macro avg       0.95      0.79      0.84     15108
weighted avg       0.91      0.90      0.89     15108

Working on Split: Split_4
Running a Random Forest Classifier o



RF Runtime: 26.73




Accuracy: 0.8989
    -> Non-Weighted F1 Score on validation set: 0.8292 
    -> Weighted F1 Score on validation set: 0.8886 
              precision    recall  f1-score   support

     B cells       0.99      0.96      0.97       213
         CD4       0.88      0.98      0.93      7956
         CD8       0.90      0.83      0.86      3118
          NK       0.95      0.83      0.88       805
        Treg       1.00      0.15      0.26       472
     cycling       0.96      0.59      0.73       112
granulocytes       0.95      0.71      0.81       276
         mDC       0.93      0.83      0.87       436
   monocytes       0.91      0.97      0.94      1461
         pDC       1.00      0.95      0.97       147
      plasma       0.94      0.84      0.89       112

    accuracy                           0.90     15108
   macro avg       0.95      0.78      0.83     15108
weighted avg       0.90      0.90      0.89     15108

Working on Split: Split_5
Running a Random Forest Classifier o



RF Runtime: 26.94




Accuracy: 0.9034
    -> Non-Weighted F1 Score on validation set: 0.8368 
    -> Weighted F1 Score on validation set: 0.8940 
              precision    recall  f1-score   support

     B cells       0.99      0.95      0.97       201
         CD4       0.89      0.98      0.93      8006
         CD8       0.90      0.83      0.86      3095
          NK       0.95      0.83      0.89       779
        Treg       1.00      0.15      0.27       437
     cycling       0.93      0.66      0.77       113
granulocytes       0.97      0.73      0.84       295
         mDC       0.94      0.86      0.90       448
   monocytes       0.92      0.97      0.94      1475
         pDC       0.99      0.94      0.97       135
      plasma       0.93      0.82      0.87       124

    accuracy                           0.90     15108
   macro avg       0.95      0.79      0.84     15108
weighted avg       0.91      0.90      0.89     15108



In [18]:
model_eval

{'Split_1': {'Runtime': [27.30408811569214],
  'Accuracy': [0.9017077045274027],
  'Macro_Score': [0.8279329588016217],
  'Weighted_Score': [0.892549967997973]},
 'Split_2': {'Runtime': [27.273162841796875],
  'Accuracy': [0.9044876886417792],
  'Macro_Score': [0.8405634490374017],
  'Weighted_Score': [0.8949441561364364]},
 'Split_3': {'Runtime': [27.20952796936035],
  'Accuracy': [0.9021710352131321],
  'Macro_Score': [0.8387291912231846],
  'Weighted_Score': [0.8925738925052926]},
 'Split_4': {'Runtime': [26.731171131134033],
  'Accuracy': [0.8988615303150649],
  'Macro_Score': [0.8291860693050807],
  'Weighted_Score': [0.8885605022574452]},
 'Split_5': {'Runtime': [26.94321084022522],
  'Accuracy': [0.9033624569764364],
  'Macro_Score': [0.8368312591896472],
  'Weighted_Score': [0.8940008984473465]}}

### Save the dictionary with metrics

In [19]:
# save the dictionary
def Pickler(data, filename):
    
    outfile = open(filename, 'wb+')
    
    #source destination
    
    pickle.dump(data, outfile)
    
    outfile.close()

Pickler(model_eval, filename=f"{dictionary_dir}/{dataset_name}_RF_EvalDict.pickle")

In [20]:
# to load

def Unpickler(filename):
    
    infile = open(filename, 'rb+')
    
    return_file = pickle.load(infile);
    
    infile.close()

    return return_file

test_loaddict = Unpickler(filename=f"{dictionary_dir}/{dataset_name}_RF_EvalDict.pickle")

In [21]:
test_loaddict

{'Split_1': {'Runtime': [27.30408811569214],
  'Accuracy': [0.9017077045274027],
  'Macro_Score': [0.8279329588016217],
  'Weighted_Score': [0.892549967997973]},
 'Split_2': {'Runtime': [27.273162841796875],
  'Accuracy': [0.9044876886417792],
  'Macro_Score': [0.8405634490374017],
  'Weighted_Score': [0.8949441561364364]},
 'Split_3': {'Runtime': [27.20952796936035],
  'Accuracy': [0.9021710352131321],
  'Macro_Score': [0.8387291912231846],
  'Weighted_Score': [0.8925738925052926]},
 'Split_4': {'Runtime': [26.731171131134033],
  'Accuracy': [0.8988615303150649],
  'Macro_Score': [0.8291860693050807],
  'Weighted_Score': [0.8885605022574452]},
 'Split_5': {'Runtime': [26.94321084022522],
  'Accuracy': [0.9033624569764364],
  'Macro_Score': [0.8368312591896472],
  'Weighted_Score': [0.8940008984473465]}}