# scRNAseq Random Forest Model Evaluation - Lukassen2020_Lung

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import scanpy as sc
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report as class_rep
import time

In [2]:
# Notebook configuration: dataset label and output locations.

# Label embedded in the file names of the saved models and the metrics pickle.
dataset_name = 'Lukassen2020_Lung'

# Directory that receives the pickled per-split metrics dictionary.
dictionary_dir = 'RFSplitDicts'

# Directory that receives one trained model file per split.
model_dir = 'Model_Splits'

In [3]:
# Make sure the necessary output directories are available.
# Each entry pairs a directory with the message printed when it has to be created.
for out_dir, created_msg in (
    (dictionary_dir, f"Directory {dictionary_dir} created for saving split dictionaries"),
    (model_dir, f"Directory {model_dir} created for saving the trained models for the splits"),
):
    if os.path.exists(out_dir):
        print('Directory already exists!')
        continue

    os.makedirs(out_dir)
    print(created_msg)

Directory already exists!
Directory already exists!


### Load the dataset and add split columns

In [4]:
# Read the dataset's .h5ad file into `adata`; the path is relative to the
# notebook's working directory.
adata = sc.read_h5ad('Lukassen2020_Lung_qc_hvg_anno_5k_split.h5ad')

In [6]:
# Per-cell metadata table; it is merged into adata.obs on the 'barcodes' column
# in the next cell.
# NOTE(review): presumably this is where the Split_1..Split_5 train/test columns
# come from - confirm against the CSV.
metadata = pd.read_csv('../DatasetSplits/Metadata_Splits/Lukassen2020_Lung_metadata_splits.csv')

In [7]:
# Join the metadata onto the per-cell annotations by barcode. Columns that exist
# in both tables get a '_drop' suffix on the metadata side and are discarded, so
# the values already in adata.obs are the ones that are kept.
obs_merged = adata.obs.merge(metadata, left_on='barcodes', right_on='barcodes', copy=False, suffixes=('', '_drop'))
obs_merged = obs_merged[obs_merged.columns[~obs_merged.columns.str.endswith('_drop')]]

# Merging on a column discards the original index, so restore it from the
# 'index' column. AnnData expects .obs.index to hold strings; casting here
# avoids the "AnnData expects .obs.index to contain strings" warning that an
# integer index triggers.
obs_merged.index = obs_merged['index'].astype(str)

# Assign once, with the final string index already in place.
adata.obs = obs_merged

AnnData expects .obs.index to contain strings, but got values like:
    [0, 1, 2, 3, 4]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


### Run RandomForest on each split

In [8]:
# Names of the split columns to iterate over: 'Split_1' through 'Split_5'.
splits = ['Split_' + str(split_number) for split_number in range(1, 6)]

In [9]:
def _to_dense_array(matrix):
    """Return ``matrix`` as a plain ``numpy.ndarray``, densifying sparse input."""
    # Sparse matrices expose ``toarray()``, which yields an ndarray. ``todense()``
    # would yield ``np.matrix``, which scikit-learn deprecated in 1.0 and rejects
    # with a TypeError from 1.2 on.
    if hasattr(matrix, 'toarray'):
        return matrix.toarray()
    return np.asarray(matrix)


def runSplitRF(adata, split, cluster_col='celltypes', random_state=2022, n_jobs=4):
    """
    Train and evaluate a RandomForest classifier on one train/test split.

    Parameters
    ----------
    adata
        Object whose ``.obs`` table holds the ``split`` and ``cluster_col``
        columns and whose ``.raw.X`` holds the feature matrix (sparse or dense).
    split : str
        Name of the ``.obs`` column whose values are 'train' or 'test'.
    cluster_col : str
        Name of the ``.obs`` column holding the class labels.
    random_state : int
        Seed passed to ``RandomForestClassifier``.
    n_jobs : int
        Number of parallel jobs passed to ``RandomForestClassifier``.

    Returns
    -------
    tuple
        ``(clf, total_runtime, acc, macro_score, w_score)``: the fitted
        classifier, the wall-clock fit time in seconds, the accuracy on the
        test cells, and the macro- and weighted-average F1 scores.
    """

    print(f"Running a Random Forest Classifier on split: {split}")

    # separating the data based on the split column
    adata_train = adata[adata.obs[split] == 'train']
    adata_test = adata[adata.obs[split] == 'test']

    # converting the features to plain dense ndarrays
    X_train = _to_dense_array(adata_train.raw.X)
    X_test = _to_dense_array(adata_test.raw.X)

    y_train = adata_train.obs[cluster_col].tolist()
    y_test = adata_test.obs[cluster_col].tolist()

    # Running the model and calculating runtime (only the fit is timed)
    start_time = time.time()
    clf = RandomForestClassifier(random_state=random_state, n_jobs=n_jobs)
    clf.fit(X_train, y_train)
    end_time = time.time()
    total_runtime = (end_time - start_time)
    print(f"RF Runtime: {total_runtime:2.2f}")

    # generating the predictions
    y_pred = clf.predict(X_test)

    # model evaluation
    acc = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {acc:4.4f}')

    # calculating the macro (unweighted) and support-weighted F1 scores
    macro_score = f1_score(y_test, y_pred, average = 'macro' )
    w_score = f1_score(y_test, y_pred,average = 'weighted' )
    print(f'    -> Non-Weighted F1 Score on validation set: {macro_score:4.4f} ' )
    print(f'    -> Weighted F1 Score on validation set: {w_score:4.4f} ' )
    print(class_rep(y_test,y_pred))


    return clf, total_runtime, acc, macro_score, w_score

### Train and evaluate a model for each split

In [10]:
# Maps each split name to its metrics dictionary; filled in by the loop over splits.
model_eval = dict()

In [11]:
# Train, persist and score one RandomForest per split.
for i in splits:

    print(f"Working on Split: {i}")

    clf, total_runtime, acc, macro_score, w_score = runSplitRF(adata, i, cluster_col='celltypes')

    # Persist the fitted model for this split.
    joblib.dump(clf, f"./{model_dir}/{dataset_name}_{i}_RF.pkl")

    # Each metric is stored as a single-element list under its metric name.
    split_dict = {
        'Runtime': [total_runtime],
        'Accuracy': [acc],
        'Macro_Score': [macro_score],
        'Weighted_Score': [w_score],
    }

    model_eval[i] = split_dict
    
    

Working on Split: Split_1
Running a Random Forest Classifier on split: Split_1




RF Runtime: 13.97
Accuracy: 0.9177
    -> Non-Weighted F1 Score on validation set: 0.7356 
    -> Weighted F1 Score on validation set: 0.9073 




                      precision    recall  f1-score   support

                 AT1       0.91      0.98      0.94      1347
                 AT2       0.92      0.98      0.95      3049
            Ciliated       0.87      0.90      0.88       401
                Club       0.97      0.71      0.82       258
         Endothelial       0.87      0.91      0.89       823
         Fibroblasts       0.99      0.63      0.77       315
LymphaticEndothelium       1.00      0.04      0.07        57
           Monocytes       0.95      0.96      0.95      1517
              TCells       1.00      0.21      0.34       189

            accuracy                           0.92      7956
           macro avg       0.94      0.70      0.74      7956
        weighted avg       0.92      0.92      0.91      7956

Working on Split: Split_2
Running a Random Forest Classifier on split: Split_2




RF Runtime: 13.61
Accuracy: 0.9143
    -> Non-Weighted F1 Score on validation set: 0.7351 
    -> Weighted F1 Score on validation set: 0.9041 




                      precision    recall  f1-score   support

                 AT1       0.92      0.98      0.94      1389
                 AT2       0.91      0.98      0.94      2999
            Ciliated       0.82      0.90      0.86       389
                Club       0.96      0.63      0.76       271
         Endothelial       0.88      0.89      0.89       811
         Fibroblasts       0.98      0.64      0.78       302
LymphaticEndothelium       1.00      0.07      0.14        54
           Monocytes       0.96      0.96      0.96      1545
              TCells       0.98      0.21      0.35       196

            accuracy                           0.91      7956
           macro avg       0.93      0.70      0.74      7956
        weighted avg       0.92      0.91      0.90      7956

Working on Split: Split_3
Running a Random Forest Classifier on split: Split_3




RF Runtime: 13.74
Accuracy: 0.9170
    -> Non-Weighted F1 Score on validation set: 0.7355 
    -> Weighted F1 Score on validation set: 0.9061 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                      precision    recall  f1-score   support

                 AT1       0.90      0.97      0.94      1323
                 AT2       0.91      0.99      0.95      3045
            Ciliated       0.86      0.95      0.90       402
                Club       0.97      0.68      0.80       287
         Endothelial       0.88      0.90      0.89       791
         Fibroblasts       1.00      0.63      0.77       323
LymphaticEndothelium       0.00      0.00      0.00        65
           Monocytes       0.96      0.96      0.96      1513
              TCells       1.00      0.26      0.41       207

            accuracy                           0.92      7956
           macro avg       0.83      0.70      0.74      7956
        weighted avg       0.91      0.92      0.91      7956

Working on Split: Split_4
Running a Random Forest Classifier on split: Split_4
RF Runtime: 13.57
Accuracy: 0.9145
    -> Non-Weighted F1 Score on validation set: 0.7470 
    -> Weighted F1 Score on validation set: 0.9052 



                      precision    recall  f1-score   support

                 AT1       0.91      0.98      0.94      1339
                 AT2       0.91      0.98      0.94      3017
            Ciliated       0.86      0.92      0.89       399
                Club       0.95      0.65      0.77       281
         Endothelial       0.86      0.88      0.87       824
         Fibroblasts       0.99      0.64      0.78       296
LymphaticEndothelium       1.00      0.07      0.14        54
           Monocytes       0.96      0.95      0.96      1540
              TCells       1.00      0.27      0.43       206

            accuracy                           0.91      7956
           macro avg       0.94      0.71      0.75      7956
        weighted avg       0.92      0.91      0.91      7956

Working on Split: Split_5
Running a Random Forest Classifier on split: Split_5




RF Runtime: 13.68
Accuracy: 0.9173
    -> Non-Weighted F1 Score on validation set: 0.7401 
    -> Weighted F1 Score on validation set: 0.9078 




                      precision    recall  f1-score   support

                 AT1       0.91      0.98      0.94      1294
                 AT2       0.91      0.99      0.95      3055
            Ciliated       0.83      0.94      0.88       413
                Club       0.95      0.65      0.77       276
         Endothelial       0.90      0.88      0.89       849
         Fibroblasts       0.99      0.64      0.78       304
LymphaticEndothelium       1.00      0.04      0.07        55
           Monocytes       0.96      0.96      0.96      1516
              TCells       0.98      0.27      0.42       194

            accuracy                           0.92      7956
           macro avg       0.94      0.70      0.74      7956
        weighted avg       0.92      0.92      0.91      7956



In [12]:
# Display the metrics collected for every split.
model_eval

{'Split_1': {'Runtime': [13.972443103790283],
  'Accuracy': [0.9176721970839617],
  'Macro_Score': [0.735562983608808],
  'Weighted_Score': [0.9073037931521539]},
 'Split_2': {'Runtime': [13.611600160598755],
  'Accuracy': [0.9142785319255907],
  'Macro_Score': [0.7351163726272508],
  'Weighted_Score': [0.9040790670673445]},
 'Split_3': {'Runtime': [13.739368200302124],
  'Accuracy': [0.9170437405731523],
  'Macro_Score': [0.7355240659058057],
  'Weighted_Score': [0.9060990339787607]},
 'Split_4': {'Runtime': [13.572643995285034],
  'Accuracy': [0.9145299145299145],
  'Macro_Score': [0.7470481347269695],
  'Weighted_Score': [0.9052255034521447]},
 'Split_5': {'Runtime': [13.675641775131226],
  'Accuracy': [0.9172951231774761],
  'Macro_Score': [0.7401087798589558],
  'Weighted_Score': [0.9077939278407757]}}

### Save the dictionary with metrics

In [13]:
# save the dictionary
def Pickler(data, filename):
    """
    Serialize ``data`` to ``filename`` with pickle.

    Parameters
    ----------
    data
        Any picklable object.
    filename
        Path of the output file; it is created, or truncated if it already exists.

    Raises
    ------
    Whatever ``open`` or ``pickle.dump`` raise (e.g. ``OSError`` for a bad
    path, ``pickle.PicklingError`` for an unpicklable object).
    """
    # The context manager closes the file even when pickle.dump raises.
    with open(filename, 'wb') as outfile:
        pickle.dump(data, outfile)

# Write the per-split metrics dictionary into the split-dictionary directory.
eval_dict_path = f"{dictionary_dir}/{dataset_name}_RF_EvalDict.pickle"
Pickler(model_eval, filename=eval_dict_path)

In [14]:
# to load

def Unpickler(filename):
    """
    Load and return the object pickled in ``filename``.

    Parameters
    ----------
    filename
        Path of an existing pickle file; it is opened read-only.

    Returns
    -------
    The unpickled object.

    Notes
    -----
    Only use this on files from a trusted source: unpickling can execute
    arbitrary code.
    """
    # Read-only mode is enough for loading; the context manager closes the file
    # even when pickle.load raises.
    with open(filename, 'rb') as infile:
        return pickle.load(infile)

# Read the saved metrics dictionary back from disk.
loaded_eval_path = f"{dictionary_dir}/{dataset_name}_RF_EvalDict.pickle"
test_loaddict = Unpickler(filename=loaded_eval_path)

In [15]:
# Display the reloaded dictionary for comparison with model_eval.
test_loaddict

{'Split_1': {'Runtime': [13.972443103790283],
  'Accuracy': [0.9176721970839617],
  'Macro_Score': [0.735562983608808],
  'Weighted_Score': [0.9073037931521539]},
 'Split_2': {'Runtime': [13.611600160598755],
  'Accuracy': [0.9142785319255907],
  'Macro_Score': [0.7351163726272508],
  'Weighted_Score': [0.9040790670673445]},
 'Split_3': {'Runtime': [13.739368200302124],
  'Accuracy': [0.9170437405731523],
  'Macro_Score': [0.7355240659058057],
  'Weighted_Score': [0.9060990339787607]},
 'Split_4': {'Runtime': [13.572643995285034],
  'Accuracy': [0.9145299145299145],
  'Macro_Score': [0.7470481347269695],
  'Weighted_Score': [0.9052255034521447]},
 'Split_5': {'Runtime': [13.675641775131226],
  'Accuracy': [0.9172951231774761],
  'Macro_Score': [0.7401087798589558],
  'Weighted_Score': [0.9077939278407757]}}