# scRNAseq Random Forest Model Evaluation - GSE144236

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import scanpy as sc
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report as class_rep
import time

In [2]:
# Prep
dataset_name = 'GSE144236' # label for the dataset
dictionary_dir = 'RFSplitDicts' # dir where we save the split dictionaries
model_dir = 'Model_Splits' # dir where we save the models

In [3]:
# Make sure necessary directories are avalible

# dictionary dir
if not os.path.exists(dictionary_dir):
    os.makedirs(dictionary_dir)
    print(f"Directory {dictionary_dir} created for saving split dictionaries")
    
else:
    print('Directory already exists!')


# model dir
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
    print(f"Directory {model_dir} created for saving the trained models for the splits")
    
else:
    print('Directory already exists!')

Directory already exists!
Directory already exists!


Load the dataset and add split columns

In [4]:
adata = sc.read_h5ad('GSE144236_qc_hvg_anno_5k_raw_train_split.h5ad')

In [5]:
metadata = pd.read_csv('../DatasetSplits/Metadata_Splits/GSE144236_metadata_splits.csv')

In [6]:
adata.obs = adata.obs.merge(metadata, left_on='barcodes', right_on='barcodes', copy=False, suffixes=('', '_drop'))
adata.obs = adata.obs[adata.obs.columns[~adata.obs.columns.str.endswith('_drop')]]
adata.obs.index = adata.obs['barcodes']

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


### Run RandomForest on each split

In [7]:
splits = [f'Split_{i}' for i in range(1,6)]

In [8]:
def runSplitRF(adata, split, cluster_col='celltypes'):
    """
    Running RandomForest on multiple splits of the dataset
    """
    
    print(f"Running a Random Forest Classifier on split: {split}")
    
    # separating the data based on the split column
    adata_train = adata[adata.obs[split] == 'train']
    adata_test = adata[adata.obs[split] == 'test']
    
    # converting data from sparse to dense
    X_train = adata_train.raw.X.todense()
    X_test = adata_test.raw.X.todense()
    
    y_train = adata_train.obs[cluster_col].tolist()
    y_test = adata_test.obs[cluster_col].tolist()
    
    # Running the model and calculating runtime
    start_time = time.time()
    clf = RandomForestClassifier(random_state=2022, n_jobs=4)
    clf.fit(X_train, y_train)
    end_time = time.time()
    total_runtime = (end_time - start_time)
    print(f"RF Runtime: {total_runtime:2.2f}")
    
    # generating the predictions
    y_pred = clf.predict(X_test)
    
    # model evaluation
    acc = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {acc:4.4f}')
    
    # calculating the precision/recall based multi-label F1 score
    macro_score = f1_score(y_test, y_pred, average = 'macro' )
    w_score = f1_score(y_test, y_pred,average = 'weighted' )
    print(f'    -> Non-Weighted F1 Score on validation set: {macro_score:4.4f} ' )
    print(f'    -> Weighted F1 Score on validation set: {w_score:4.4f} ' )
    print(class_rep(y_test,y_pred))
    
    
    return clf, total_runtime, acc, macro_score, w_score

### Running

In [9]:
model_eval = {}

In [10]:
for i in splits:
    
    print(f"Working on Split: {i}")
    
    # create a dict for storing current split information
    split_dict = {'Runtime':[], 
                  'Accuracy':[], 
                  'Macro_Score':[], 
                  'Weighted_Score':[]}
    clf, total_runtime, acc, macro_score, w_score = runSplitRF(adata, 
                                                               i, 
                                                               cluster_col='celltypes')
    
    joblib.dump(clf, f"./{model_dir}/{dataset_name}_{i}_RF.pkl")
    
    split_dict['Runtime'].append(total_runtime)
    split_dict['Accuracy'].append(acc)
    split_dict['Macro_Score'].append(macro_score)
    split_dict['Weighted_Score'].append(w_score)
    
    model_eval[i]=split_dict
    
    

Working on Split: Split_1
Running a Random Forest Classifier on split: Split_1




RF Runtime: 10.73
Accuracy: 0.9518
    -> Non-Weighted F1 Score on validation set: 0.8838 
    -> Weighted F1 Score on validation set: 0.9510 




                  precision    recall  f1-score   support

            ASDC       0.95      0.30      0.45        67
          B Cell       0.98      0.83      0.90        48
            CD1C       0.78      0.92      0.85       988
          CLEC9A       0.98      0.94      0.96       178
Endothelial Cell       1.00      0.95      0.97        81
      Epithelial       1.00      1.00      1.00      5780
      Fibroblast       0.99      0.99      0.99       159
              LC       0.93      0.84      0.88       743
            MDSC       0.81      0.70      0.75       148
             Mac       0.85      0.82      0.83       661
      Melanocyte       1.00      0.99      1.00       164
              NK       1.00      0.76      0.86        25
             PDC       1.00      0.89      0.94        47
           Tcell       0.97      0.99      0.98       316

        accuracy                           0.95      9405
       macro avg       0.94      0.85      0.88      9405
    weighted



RF Runtime: 10.66
Accuracy: 0.9540
    -> Non-Weighted F1 Score on validation set: 0.8761 
    -> Weighted F1 Score on validation set: 0.9529 




                  precision    recall  f1-score   support

            ASDC       1.00      0.25      0.40        69
          B Cell       1.00      0.89      0.94        46
            CD1C       0.80      0.91      0.85       992
          CLEC9A       0.97      0.88      0.92       175
Endothelial Cell       1.00      0.98      0.99       101
      Epithelial       1.00      1.00      1.00      5753
      Fibroblast       0.98      0.99      0.99       186
              LC       0.93      0.86      0.89       790
            MDSC       0.81      0.74      0.77       142
             Mac       0.83      0.85      0.84       614
      Melanocyte       1.00      0.99      1.00       147
              NK       0.87      0.65      0.74        20
             PDC       1.00      0.90      0.95        49
           Tcell       0.98      0.99      0.98       321

        accuracy                           0.95      9405
       macro avg       0.94      0.85      0.88      9405
    weighted



RF Runtime: 10.68
Accuracy: 0.9564
    -> Non-Weighted F1 Score on validation set: 0.8899 
    -> Weighted F1 Score on validation set: 0.9558 




                  precision    recall  f1-score   support

            ASDC       0.96      0.35      0.52        68
          B Cell       0.95      0.82      0.88        49
            CD1C       0.79      0.92      0.85       963
          CLEC9A       0.97      0.88      0.92       149
Endothelial Cell       1.00      0.98      0.99        92
      Epithelial       1.00      1.00      1.00      5791
      Fibroblast       1.00      1.00      1.00       157
              LC       0.94      0.86      0.89       788
            MDSC       0.81      0.82      0.82       142
             Mac       0.88      0.84      0.86       645
      Melanocyte       1.00      1.00      1.00       158
              NK       1.00      0.67      0.80        24
             PDC       1.00      0.91      0.95        53
           Tcell       0.96      1.00      0.98       326

        accuracy                           0.96      9405
       macro avg       0.95      0.86      0.89      9405
    weighted



RF Runtime: 10.53
Accuracy: 0.9570
    -> Non-Weighted F1 Score on validation set: 0.8804 
    -> Weighted F1 Score on validation set: 0.9560 




                  precision    recall  f1-score   support

            ASDC       1.00      0.26      0.41        66
          B Cell       1.00      0.85      0.92        52
            CD1C       0.80      0.92      0.86       991
          CLEC9A       0.97      0.92      0.95       169
Endothelial Cell       1.00      0.98      0.99        98
      Epithelial       1.00      1.00      1.00      5750
      Fibroblast       0.99      0.99      0.99       166
              LC       0.94      0.85      0.89       775
            MDSC       0.78      0.79      0.79       112
             Mac       0.87      0.86      0.87       630
      Melanocyte       1.00      0.99      1.00       180
              NK       1.00      0.68      0.81        25
             PDC       1.00      0.79      0.88        47
           Tcell       0.97      1.00      0.98       344

        accuracy                           0.96      9405
       macro avg       0.95      0.85      0.88      9405
    weighted



RF Runtime: 10.83
Accuracy: 0.9524
    -> Non-Weighted F1 Score on validation set: 0.8847 
    -> Weighted F1 Score on validation set: 0.9515 




                  precision    recall  f1-score   support

            ASDC       0.86      0.26      0.40        70
          B Cell       1.00      0.91      0.95        56
            CD1C       0.78      0.91      0.84       955
          CLEC9A       0.98      0.91      0.95       173
Endothelial Cell       1.00      0.97      0.99       103
      Epithelial       1.00      1.00      1.00      5749
      Fibroblast       1.00      0.99      1.00       202
              LC       0.92      0.83      0.87       736
            MDSC       0.82      0.75      0.79       146
             Mac       0.86      0.86      0.86       650
      Melanocyte       1.00      1.00      1.00       148
              NK       1.00      0.75      0.86        24
             PDC       1.00      0.86      0.92        57
           Tcell       0.96      0.99      0.98       336

        accuracy                           0.95      9405
       macro avg       0.94      0.86      0.88      9405
    weighted

In [11]:
model_eval

{'Split_1': {'Runtime': [10.730849742889404],
  'Accuracy': [0.9518341307814993],
  'Macro_Score': [0.8837691174484116],
  'Weighted_Score': [0.9509508044181526]},
 'Split_2': {'Runtime': [10.658363819122314],
  'Accuracy': [0.9539606592238171],
  'Macro_Score': [0.8760627390511184],
  'Weighted_Score': [0.9528860112262604]},
 'Split_3': {'Runtime': [10.683604001998901],
  'Accuracy': [0.9564061669324827],
  'Macro_Score': [0.889882981894367],
  'Weighted_Score': [0.9558421097543456]},
 'Split_4': {'Runtime': [10.529392719268799],
  'Accuracy': [0.9570441254651781],
  'Macro_Score': [0.8804198394933491],
  'Weighted_Score': [0.9560314868028702]},
 'Split_5': {'Runtime': [10.831684112548828],
  'Accuracy': [0.9523657628920786],
  'Macro_Score': [0.8846927716378321],
  'Weighted_Score': [0.9514663608159801]}}

### Save the dictionary with metrics

In [12]:
# save the dictionary
def Pickler(data, filename):
    
    outfile = open(filename, 'wb+')
    
    #source destination
    
    pickle.dump(data, outfile)
    
    outfile.close()

Pickler(model_eval, filename=f"{dictionary_dir}/{dataset_name}_RF_EvalDict.pickle")

In [13]:
# to load

def Unpickler(filename):
    
    infile = open(filename, 'rb+')
    
    return_file = pickle.load(infile);
    
    infile.close()

    return return_file

test_loaddict = Unpickler(filename=f"{dictionary_dir}/{dataset_name}_RF_EvalDict.pickle")

In [14]:
test_loaddict

{'Split_1': {'Runtime': [10.730849742889404],
  'Accuracy': [0.9518341307814993],
  'Macro_Score': [0.8837691174484116],
  'Weighted_Score': [0.9509508044181526]},
 'Split_2': {'Runtime': [10.658363819122314],
  'Accuracy': [0.9539606592238171],
  'Macro_Score': [0.8760627390511184],
  'Weighted_Score': [0.9528860112262604]},
 'Split_3': {'Runtime': [10.683604001998901],
  'Accuracy': [0.9564061669324827],
  'Macro_Score': [0.889882981894367],
  'Weighted_Score': [0.9558421097543456]},
 'Split_4': {'Runtime': [10.529392719268799],
  'Accuracy': [0.9570441254651781],
  'Macro_Score': [0.8804198394933491],
  'Weighted_Score': [0.9560314868028702]},
 'Split_5': {'Runtime': [10.831684112548828],
  'Accuracy': [0.9523657628920786],
  'Macro_Score': [0.8846927716378321],
  'Weighted_Score': [0.9514663608159801]}}