# scANNA Tutorial: Using scANNA's Interpretability for Selecting Most Important Global Features
## For Lukassen2020_Lung Data

In this notebook, we will run additional comparisons against two feature selection tools. **We will be skipping loading and running scANNA since we have already evaluated its performance.**

- [Triku](https://doi.org/10.1093/gigascience/giac017) is a python tool that uses nearest-neighbors for identification of features

In [2]:
# from nact.utilities import *
# from nact import AttentionQuery, scanpy_to_dataloader
import numpy as np
import os
import pandas as pd
import scanpy as sc
from sklearn.metrics import f1_score, accuracy_score
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier
import torch
import time
import triku as tk
from typing import Union
from xgboost import XGBClassifier 

In [3]:
%load_ext autoreload
%autoreload 2

Here we define some general functions for our analyses

In [4]:
def get_logp1_variance(data:np.array)->float:
    """ Function for calculating the variance of a Log(x+1) transformed data"""
    return np.var(np.log(data + 1), axis=0).sum()

def explained_variance_ratio(full_dimension_data:np.array, 
                             train_data:np.array, 
                             test_data:np.array)->float:
    """ Utility function for calculating the fraction of total variance"""
    all_data = np.concatenate((train_data, test_data), axis=0)
    return (get_logp1_variance(all_data) / 
            get_logp1_variance(full_dimension_data))

def measure_performance(X_train:np.array, 
                y_train:np.array, 
                X_test:np.array, 
                y_test:np.array, 
                classifer: Union[NearestCentroid, KNeighborsClassifier, XGBClassifier], 
                scoring : str = 'weighted', 
                classifier_name : str = None):
    """ Function for automating classification and calculating F1 score"""
    start_time = time.time()
    classifer.fit(X_train, y_train)
    y_pred = classifer.predict(X_test)
    f1_accuracy = f1_score(y_test, y_pred, average=scoring)
    if classifier_name is not None:
        print(f"For {classifier_name} classifier:")
        
    print(f"F1 ({scoring}) Score: {f1_accuracy}")
    print(f"Training and classification time took: {time.time() - start_time}")
    print()
    return f1_accuracy
    

# Define Various Classification Models for Validation

In [5]:
nearest_centroid_classifier=NearestCentroid()

knn_classifier= KNeighborsClassifier(n_neighbors=3, 
                                     n_jobs=-1)

xgboost_classifier = XGBClassifier(n_estimators=50, 
                                   max_depth=2, 
                                   learning_rate=0.09, 
                                   objective='binary:logistic', 
                                   n_jobs=-1)

In [6]:
# label for the dataset folder we want to make
dataset_name = "Lukassen2020_Lung"
# directory for specific dataset
dataset_dir = f"./{dataset_name}" 
# subdir where we store cluster attention and attention means
results = f"./{dataset_name}/results" 
# subdir where we store cluster enrichment plots
plots = f"./{dataset_dir}/plots" 

In [7]:
path_to_data = ("/Users/oscardavalos/Documents/Upload2Cluster/Lukassen2020_Lung_qc_hvg_anno_5k_split.h5ad")


# Top Feature Selection with Triku

In [9]:
adata = sc.read(path_to_data)
adata_t = adata

In [10]:
sc.pp.log1p(adata_t)
sc.pp.pca(adata_t)
sc.pp.neighbors(adata_t, metric='cosine', n_neighbors=int(0.5 * len(adata_t) ** 0.5))

In [12]:
top_n = [10, 25, 50, 100, 200, 300]
start = time.time()

triku_nc_acc_dict = {}
triku_knn_acc_dict = {}
triku_xgb_acc_dict = {}
triku_variance_dict = {}


for n_genes in top_n:
    print("----------------------------------")
    print(f" Triku Results for {n_genes} top genes:")
    
    # sc.pp.log1p(adata)
    # sc.pp.pca(adata)
    # sc.pp.neighbors(adata, metric='cosine', n_neighbors=int(0.5 * len(adata) ** 0.5))
    tk.tl.triku(adata_t,
                n_features=n_genes, 
                use_raw=True)
    
    hvg_adata = adata[:, adata_t.var.highly_variable]
    adata_train = hvg_adata[hvg_adata.obs.split=="train"]
    adata_test = hvg_adata[hvg_adata.obs.split=="test"]
    
    data_train_for_triku = np.array(adata_train.X.todense())
    labels_train_for_triku  = adata_train.obs.cluster.to_numpy()

    data_test_for_triku = np.array(adata_test.X.todense())
    labels_test_for_triku  = adata_test.obs.cluster.to_numpy()

    triku_nc_acc_dict[n_genes] = measure_performance(data_train_for_triku, 
                                        labels_train_for_triku ,
                                        data_test_for_triku, 
                                        labels_test_for_triku, 
                                        classifer=nearest_centroid_classifier,
                                        classifier_name = "Nearest Centroid",
                                        )

    triku_knn_acc_dict[n_genes] = measure_performance(data_train_for_triku, 
                                                       labels_train_for_triku ,
                                                       data_test_for_triku, 
                                                       labels_test_for_triku, 
                                                       classifer=knn_classifier,
                                                       classifier_name = "KNN",
                                                       )

    triku_xgb_acc_dict[n_genes] = measure_performance(data_train_for_triku, 
                                                   labels_train_for_triku ,
                                                   data_test_for_triku, 
                                                   labels_test_for_triku, 
                                                   classifer=xgboost_classifier,
                                                   classifier_name = "XGBoost",
                                                   )
    var_frac_explained = explained_variance_ratio(
            full_dimension_data=np.array(adata.X.todense()), 
            train_data = data_train_for_triku, 
            test_data = data_test_for_triku)
    triku_variance_dict[n_genes] = var_frac_explained
    print(f"Percentage of Total Variance Explained: {var_frac_explained*100}%")

    print("----------------------------------")

----------------------------------
 Triku Results for 10 top genes:
For Nearest Centroid classifier:
F1 (weighted) Score: 0.26580916153582534
Training and classification time took: 0.005373954772949219

For KNN classifier:
F1 (weighted) Score: 0.2738537070415746
Training and classification time took: 1.524209976196289

For XGBoost classifier:
F1 (weighted) Score: 0.2666739883698217
Training and classification time took: 0.6612100601196289

Percentage of Total Variance Explained: 0.15339512610808015%
----------------------------------
----------------------------------
 Triku Results for 25 top genes:
For Nearest Centroid classifier:
F1 (weighted) Score: 0.270836540799584
Training and classification time took: 0.005874156951904297

For KNN classifier:
F1 (weighted) Score: 0.29717468562620614
Training and classification time took: 12.627127170562744

For XGBoost classifier:
F1 (weighted) Score: 0.26611050178968454
Training and classification time took: 0.8684608936309814

Percentage of T