# scRNAseq scPred SVMRadial Model Evaluation - GSE163005_IMMUNE_CSF

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import scanpy as sc
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report as class_rep
import time

In [2]:
# Prep
dataset_name = 'GSE163005_IMMUNE_CSF' # label for the dataset
dictionary_dir = 'scPred_SVM_SplitDicts' # dir where we save the split dictionaries

In [3]:
# Make sure necessary directories are avalible

# dictionary dir
if not os.path.exists(dictionary_dir):
    os.makedirs(dictionary_dir)
    print(f"Directory {dictionary_dir} created for saving split dictionaries")
    
else:
    print('Directory already exists!')

Directory already exists!


In [4]:
metadata = pd.read_csv('~/NACT_scPred/GSE163005_IMMUNE_CSF/results/GSE163005_IMMUNE_CSF_metadata_model_svmRadial.csv')

In [5]:
splits = [f'Split_{i}' for i in range(1,6)]

In [6]:
def runSplitEval(metadata, split, cluster_col='celltypes'):
    """
    Running Model Evaluation on multiple splits of the dataset
    """
    
    print(f"Running a Model Evaluation on split: {split}\n")

    
    metadata = metadata[metadata.data_split == split]
    
    y_test = metadata[cluster_col].tolist()
    y_pred = metadata['scpred_prediction'].tolist()
    
    
    # model evaluation
    acc = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {acc:4.4f}')
    
    # calculating the precision/recall based multi-label F1 score
    macro_score = f1_score(y_test, y_pred, average = 'macro' )
    w_score = f1_score(y_test, y_pred,average = 'weighted' )
    print(f'    -> Non-Weighted F1 Score on validation set: {macro_score:4.4f} ' )
    print(f'    -> Weighted F1 Score on validation set: {w_score:4.4f} ' )
    print(class_rep(y_test,y_pred))
    
    return acc, macro_score, w_score

In [7]:
model_eval = {}

In [8]:
for i in splits:
    
    print(f"Working on Split: {i}")
    
    # create a dict for storing current split information
    split_dict = {'Accuracy':[], 
                  'Macro_Score':[], 
                  'Weighted_Score':[]}
    acc, macro_score, w_score = runSplitEval(metadata, i, cluster_col='celltypes')
    
    # joblib.dump(clf, f"./{model_dir}/{dataset_name}_{i}_RF.pkl")
    
    # split_dict['Runtime'].append(total_runtime)
    split_dict['Accuracy'].append(acc)
    split_dict['Macro_Score'].append(macro_score)
    split_dict['Weighted_Score'].append(w_score)
    
    model_eval[i]=split_dict

Working on Split: Split_1
Running a Model Evaluation on split: Split_1

Accuracy: 0.5798
    -> Non-Weighted F1 Score on validation set: 0.2368 
    -> Weighted F1 Score on validation set: 0.6712 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     B cells       0.00      0.00      0.00         0
         CD4       0.98      0.73      0.84      8092
         CD8       0.93      0.67      0.78      2933
          NK       0.97      0.60      0.74       809
        Treg       0.75      0.45      0.56       428
     cycling       0.73      0.15      0.24       131
    granulo1       0.00      0.00      0.00       219
    granulo2       0.00      0.00      0.00        46
granulocytes       0.00      0.00      0.00         0
         mDC       0.00      0.00      0.00         0
        mDC1       0.00      0.00      0.00        44
        mDC2       0.00      0.00      0.00       422
       mono1       0.00      0.00      0.00      1232
       mono2       0.00      0.00      0.00       241
       mono3       0.00      0.00      0.00        64
   monocytes       0.00      0.00      0.00         0
     naiveBc       0.00      0.00      0.00       200
         pDC       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.5853
    -> Non-Weighted F1 Score on validation set: 0.2424 
    -> Weighted F1 Score on validation set: 0.6759 
              precision    recall  f1-score   support

     B cells       0.00      0.00      0.00         0
         CD4       0.98      0.74      0.84      8045
         CD8       0.93      0.67      0.78      2996
          NK       0.96      0.63      0.76       782
        Treg       0.76      0.48      0.59       448
     cycling       0.88      0.21      0.34       144
    granulo1       0.00      0.00      0.00       243
    granulo2       0.00      0.00      0.00        27
granulocytes       0.00      0.00      0.00         0
         mDC       0.00      0.00      0.00         0
        mDC1       0.00      0.00      0.00        55
        mDC2       0.00      0.00      0.00       383
       mono1       0.00      0.00      0.00      1187
       mono2       0.00      0.00      0.00       300
       mono3       0.00      0.00      0.00        71
   monocyt

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


    -> Non-Weighted F1 Score on validation set: 0.2384 
    -> Weighted F1 Score on validation set: 0.6755 
              precision    recall  f1-score   support

     B cells       0.00      0.00      0.00         0
         CD4       0.98      0.73      0.84      7956
         CD8       0.93      0.67      0.78      3118
          NK       0.98      0.59      0.74       805
        Treg       0.79      0.46      0.58       472
     cycling       0.77      0.15      0.25       112
    granulo1       0.00      0.00      0.00       240
    granulo2       0.00      0.00      0.00        36
granulocytes       0.00      0.00      0.00         0
         mDC       0.00      0.00      0.00         0
        mDC1       0.00      0.00      0.00        53
        mDC2       0.00      0.00      0.00       383
       mono1       0.00      0.00      0.00      1155
       mono2       0.00      0.00      0.00       246
       mono3       0.00      0.00      0.00        60
   monocytes       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
model_eval

{'Split_1': {'Accuracy': [0.5797590680434207],
  'Macro_Score': [0.23676597459832002],
  'Weighted_Score': [0.671162259229957]},
 'Split_2': {'Accuracy': [0.5929308975377283],
  'Macro_Score': [0.23834930226083523],
  'Weighted_Score': [0.6823788872041464]},
 'Split_3': {'Accuracy': [0.5853190362721736],
  'Macro_Score': [0.2423701755947773],
  'Weighted_Score': [0.6758780371887863]},
 'Split_4': {'Accuracy': [0.5828700026476039],
  'Macro_Score': [0.2384480641370982],
  'Weighted_Score': [0.6755086595308321]},
 'Split_5': {'Accuracy': [0.5837304739211014],
  'Macro_Score': [0.2458384768553698],
  'Weighted_Score': [0.6760223766297085]}}

In [10]:
# save the dictionary
def Pickler(data, filename):
    
    outfile = open(filename, 'wb+')
    
    #source destination
    
    pickle.dump(data, outfile)
    
    outfile.close()

Pickler(model_eval, filename=f"{dictionary_dir}/{dataset_name}_svmRadial_EvalDict.pickle")