# scRNAseq scPred SVMRadial Model Evaluation - GSE144236

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import scanpy as sc
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report as class_rep
import time

In [2]:
# Notebook-wide settings
dataset_name = "GSE144236"  # dataset label; becomes part of the saved pickle's file name
dictionary_dir = "scPred_SVM_SplitDicts"  # output directory for the split dictionaries

In [3]:
# Make sure the necessary directories are available

# Output directory for the split dictionaries: report it if present, create it otherwise
if os.path.exists(dictionary_dir):
    print('Directory already exists!')
else:
    os.makedirs(dictionary_dir)
    print(f"Directory {dictionary_dir} created for saving split dictionaries")

Directory already exists!


In [4]:
# Load the per-cell metadata table. The evaluation below reads its
# `data_split`, `scpred_prediction` and (by default) `celltypes` columns.
metadata = pd.read_csv(
    '~/NACT_scPred/GSE144236/results/GSE144236_metadata_model_svmRadial.csv'
)

In [6]:
# Labels of the five data splits: Split_1 ... Split_5
splits = ['Split_' + str(split_no) for split_no in range(1, 6)]

In [8]:
def runSplitEval(metadata, split, cluster_col='celltypes'):
    """
    Evaluate the predictions stored in `metadata` for a single data split.

    The rows whose `data_split` equals `split` are selected, and the values in
    the `scpred_prediction` column are scored against the reference labels in
    `cluster_col`. Accuracy, both F1 scores and the per-class classification
    report are printed.

    Parameters
    ----------
    metadata : pandas.DataFrame
        Table containing a `data_split` column, a `scpred_prediction` column
        and the reference-label column named by `cluster_col`.
    split : str
        Value of `data_split` identifying the rows to evaluate.
    cluster_col : str, default 'celltypes'
        Name of the column holding the reference labels.

    Returns
    -------
    tuple of float
        (accuracy, macro-averaged F1, weighted F1) for the selected rows.

    Raises
    ------
    ValueError
        If no row of `metadata` belongs to `split`.
    """

    print(f"Running a Model Evaluation on split: {split}\n")

    # Keep the caller's frame under its own name instead of rebinding the parameter
    split_metadata = metadata[metadata['data_split'] == split]
    if split_metadata.empty:
        # Fail with a clear message rather than handing empty inputs to sklearn
        raise ValueError(f"No rows found in metadata for split '{split}'")

    y_test = split_metadata[cluster_col].tolist()
    y_pred = split_metadata['scpred_prediction'].tolist()

    # model evaluation
    acc = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {acc:4.4f}')

    # calculating the precision/recall based multi-class F1 scores.
    # zero_division=0 yields the same values as the default ("warn" also
    # substitutes 0) but does not emit an undefined-metric warning for every
    # label that lacks true or predicted samples.
    macro_score = f1_score(y_test, y_pred, average='macro', zero_division=0)
    w_score = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    print(f'    -> Non-Weighted F1 Score on validation set: {macro_score:4.4f} ' )
    print(f'    -> Weighted F1 Score on validation set: {w_score:4.4f} ' )
    print(class_rep(y_test, y_pred, zero_division=0))

    return acc, macro_score, w_score

In [9]:
# Collects the evaluation metrics of every split, keyed by split label
model_eval = dict()

In [10]:
for i in splits:

    print(f"Working on Split: {i}")

    # score the predictions of the current split
    acc, macro_score, w_score = runSplitEval(metadata, i, cluster_col='celltypes')

    # each metric is stored as a one-element list under its own key
    split_dict = {
        'Accuracy': [acc],
        'Macro_Score': [macro_score],
        'Weighted_Score': [w_score],
    }

    model_eval[i] = split_dict

Working on Split: Split_1
Running a Model Evaluation on split: Split_1

Accuracy: 0.8789
    -> Non-Weighted F1 Score on validation set: 0.7101 
    -> Weighted F1 Score on validation set: 0.9196 
                  precision    recall  f1-score   support

            ASDC       0.66      0.63      0.64        67
          B Cell       1.00      0.33      0.50        48
            CD1C       0.94      0.66      0.78       988
          CLEC9A       0.97      0.89      0.93       178
Endothelial Cell       0.89      0.31      0.46        81
      Epithelial       1.00      0.98      0.99      5780
      Fibroblast       1.00      0.67      0.80       159
              LC       0.97      0.76      0.85       743
            MDSC       0.84      0.70      0.76       148
             Mac       0.90      0.77      0.83       661
      Melanocyte       1.00      0.63      0.77       164
              NK       0.88      0.60      0.71        25
             PDC       0.94      0.62      0.74 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


    -> Non-Weighted F1 Score on validation set: 0.6744 
    -> Weighted F1 Score on validation set: 0.9101 
                  precision    recall  f1-score   support

            ASDC       0.64      0.57      0.60        69
          B Cell       0.93      0.30      0.46        46
            CD1C       0.93      0.63      0.75       992
          CLEC9A       0.97      0.87      0.92       175
Endothelial Cell       0.86      0.06      0.11       101
      Epithelial       1.00      0.99      0.99      5753
      Fibroblast       1.00      0.70      0.83       186
              LC       0.98      0.75      0.85       790
            MDSC       0.86      0.75      0.80       142
             Mac       0.91      0.75      0.82       614
      Melanocyte       0.99      0.65      0.79       147
              NK       0.71      0.50      0.59        20
             PDC       1.00      0.67      0.80        49
           Tcell       1.00      0.66      0.80       321
      unassigned     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.8704
    -> Non-Weighted F1 Score on validation set: 0.6884 
    -> Weighted F1 Score on validation set: 0.9146 
                  precision    recall  f1-score   support

            ASDC       0.75      0.62      0.68        66
          B Cell       1.00      0.15      0.27        52
            CD1C       0.95      0.65      0.77       991
          CLEC9A       0.96      0.90      0.93       169
Endothelial Cell       0.97      0.33      0.49        98
      Epithelial       1.00      0.99      0.99      5750
      Fibroblast       1.00      0.57      0.73       166
              LC       0.98      0.76      0.85       775
            MDSC       0.90      0.82      0.86       112
             Mac       0.91      0.76      0.83       630
      Melanocyte       1.00      0.65      0.79       180
              NK       0.93      0.52      0.67        25
             PDC       1.00      0.51      0.68        47
           Tcell       1.00      0.68      0.81       344
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Show the metrics dictionary: one entry per split, each holding the
# Accuracy, Macro_Score and Weighted_Score lists
model_eval

{'Split_1': {'Accuracy': [0.8788942052099947],
  'Macro_Score': [0.7101187424965911],
  'Weighted_Score': [0.9195994514351301]},
 'Split_2': {'Accuracy': [0.8670919723551302],
  'Macro_Score': [0.6743585874533823],
  'Weighted_Score': [0.9101208541429887]},
 'Split_3': {'Accuracy': [0.8741095162147794],
  'Macro_Score': [0.6935124955451566],
  'Weighted_Score': [0.9160762558407194]},
 'Split_4': {'Accuracy': [0.870388091440723],
  'Macro_Score': [0.6884370726911215],
  'Weighted_Score': [0.9145990290495539]},
 'Split_5': {'Accuracy': [0.8679425837320575],
  'Macro_Score': [0.6896283332481872],
  'Weighted_Score': [0.911017496672155]}}

In [12]:
# save the dictionary
def Pickler(data, filename):
    """
    Serialize `data` with pickle and write it to `filename`.

    Parameters
    ----------
    data : object
        Any picklable object.
    filename : str or path-like
        Destination file; it is created, or truncated if it already exists.

    Returns
    -------
    None
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which the previous manual open()/close() pair did not guarantee.
    with open(filename, 'wb') as outfile:
        pickle.dump(data, outfile)

# Write the collected metrics to <dictionary_dir>/<dataset_name>_svmRadial_EvalDict.pickle
Pickler(
    model_eval,
    filename=f"{dictionary_dir}/{dataset_name}_svmRadial_EvalDict.pickle",
)