In [1]:
import warnings
from warnings import filterwarnings
import math
import scanpy as sc

import pandas as pd
import numpy as np
from scipy import io
import gzip
import os 
import pandas as pd
from scipy.io import mmread
import time
import re
import random
from tqdm import tqdm
import anndata
import anndata as ad
import h5py
import scipy.sparse
from scipy.sparse import csc_matrix
from scipy.sparse import csr_matrix
# Silence every warning category for the rest of the session.
filterwarnings("ignore")
# Fix the seed of Python's and NumPy's global random number generators.
seed = 0
random.seed(seed)
np.random.seed(seed)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so assigning it
# here presumably only affects subprocesses launched later, not hashing in this
# already-running kernel — confirm whether that is the intent.
os.environ['PYTHONHASHSEED'] = str(seed)

## GSE241184 Papillary thyroid cancer

In [2]:
# NOTE(review): presumably passed as `interest_label` to
# get_top_genes_attention_matrices; no call is visible in this notebook — confirm.
interest_pred_label = [3, 13, 20, 26, 28]  # These are the labels for the cell types of interest

In [3]:
from collections import Counter

def get_top_genes_attention_matrices(patient_id, gene_names, cell_names, pred_label, label, interest_label):
    """Collect, for each of the 8 attention heads, the top-100 genes by mean |attention|.

    For every head ``h`` in 1..8 the Matrix Market file ``../Data/head-{h}.mtx`` is
    read. The code indexes its columns by cell position and, after transposing,
    labels the columns with ``gene_names`` — i.e. the matrix is used as
    (genes x cells).

    Parameters
    ----------
    patient_id :
        Currently unused by the body.
        NOTE(review): it looks like this was meant to select a per-patient
        sub-directory — confirm and wire it into the path if so.
    gene_names : sequence
        Labels for the matrix rows (one per gene).
    cell_names : sequence
        Labels for the matrix columns (one per cell).
    pred_label : sequence
        Predicted label per cell, positionally aligned with ``cell_names``.
    label : sequence
        Annotated label per cell; only cells whose value is ``"Metastatic"`` are kept.
    interest_label : container
        Predicted labels considered "of interest" (tested with ``in``).

    Returns
    -------
    pandas.DataFrame
        Index: union of the per-head top-100 genes; columns ``Head1`` .. ``Head8``.
        A cell is NaN when the gene is not among that head's top 100.
    """
    file_path = '../Data'

    # Positions of cells that are both predicted as a label of interest and
    # annotated as "Metastatic".
    interest_indices = [
        i for i, (pred, actual) in enumerate(zip(pred_label, label))
        if pred in interest_label and actual == "Metastatic"
    ]
    print(f'\nNumber of cells of interest: {len(interest_indices)}\n')

    # Row/column labels do not depend on the head, so build them once.
    selected_cell_names = np.array(cell_names)[interest_indices]
    gene_labels = np.array(gene_names)

    all_genes_attention = []
    for head in range(1, 9):
        # os.path.join supplies the separator; plain string concatenation produced
        # '../Datahead-1.mtx' instead of '../Data/head-1.mtx'.
        attention_matrix_path = os.path.join(file_path, f'head-{head}.mtx')
        attention_matrix_csr = csr_matrix(mmread(attention_matrix_path))

        # Keep only the selected cell columns, then flip to (cells x genes).
        attention_matrix_filtered = attention_matrix_csr[:, interest_indices].transpose()
        df_attention_filtered = pd.DataFrame(
            attention_matrix_filtered.toarray(),
            index=selected_cell_names,
            columns=gene_labels,
        )

        # Rank genes by mean absolute attention across the selected cells.
        top_100_genes_attention = df_attention_filtered.abs().mean(axis=0).nlargest(100)
        all_genes_attention.append(top_100_genes_attention)

    # Outer-join the per-head rankings on gene name.
    df_all_genes_attention = pd.concat(all_genes_attention, axis=1)
    df_all_genes_attention.columns = [f'Head{head}' for head in range(1, 9)]

    return df_all_genes_attention

# Start from an empty frame; the following cell rebinds this name to the table
# loaded from ../Data/all_samples_genes_attention.csv.
all_samples_genes_attention = pd.DataFrame()

In [4]:
import pandas as pd

# Path to the previously saved attention table (relative to the working directory).
genes_attention_file = "../Data/all_samples_genes_attention.csv"

# Load the table, using the first CSV column as the row index.
# NOTE(review): later cells treat this index as gene names and the columns as
# Head1..Head8 — confirm against the file.
all_samples_genes_attention = pd.read_csv(genes_attention_file, index_col=0)

In [5]:
# Rank genes by how often they occur in the concatenated table and by their
# strongest per-head attention, then keep one row per gene.

# Number of rows in which each gene name appears.
gene_frequencies = all_samples_genes_attention.index.value_counts()

all_samples_genes_attention['Frequency'] = all_samples_genes_attention.index.map(gene_frequencies)

print("\nConcatenated genes attention values and frequencies for all samples:")
print('\nTotal gene number: ', all_samples_genes_attention.shape[0])

# Select the per-head columns by name. The previous positional slice
# `iloc[:, :-2]` ran after 'Frequency' had been appended, so on a fresh run it
# dropped Head8 as well; it was only correct when the cell was re-run with
# 'Attention' already present.
head_columns = [col for col in all_samples_genes_attention.columns if str(col).startswith('Head')]
if not head_columns:
    raise ValueError("No 'Head*' attention columns found in all_samples_genes_attention")

# Strongest attention any head assigns to the gene in that row.
all_samples_genes_attention['Attention'] = all_samples_genes_attention[head_columns].max(axis=1)

all_samples_genes_attention_sorted = all_samples_genes_attention.sort_values(by=['Frequency', 'Attention'], ascending=False)

# Name the index explicitly so the column produced by reset_index is always
# called 'index', whatever name (if any) the CSV gave it.
all_samples_genes_attention_sorted_reset = (
    all_samples_genes_attention_sorted.rename_axis('index').reset_index()
)

# After sorting, the first row of each gene is the one with the highest
# (Frequency, Attention); keep only that row and restore the gene index.
all_samples_genes_attention_unique = (
    all_samples_genes_attention_sorted_reset
    .drop_duplicates(subset='index', keep='first')
    .set_index('index')
)

print("Unique sorted genes with max attention values and frequencies:")

# Drop genes whose name starts with "MT"; na=False keeps rows with a missing
# name instead of failing on the boolean negation.
# NOTE(review): the bare "MT" prefix also removes non-mitochondrial genes such as
# MTOR or MT2A; if only mitochondrial genes are meant, "MT-" is presumably the
# intended prefix — confirm before changing.
mask = ~all_samples_genes_attention_unique.index.str.startswith("MT", na=False)

all_samples_genes_attention_unique = all_samples_genes_attention_unique[mask]

all_samples_genes_attention_unique.head(10)


Concatenated genes attention values and frequencies for all samples:

Total gene number:  153
Unique sorted genes with max attention values and frequencies:


Unnamed: 0_level_0,Head1,Head2,Head3,Head4,Head5,Head6,Head7,Head8,Frequency,Attention
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
TMSB4X,0.090708,0.067812,0.078776,0.060121,0.066706,0.153387,0.108212,0.094211,1,0.153387
FTL,0.090646,0.065284,0.082648,0.053249,0.061994,0.145474,0.098759,0.090211,1,0.145474
GPX1,0.085124,0.060787,0.08716,0.052342,0.066169,0.141465,0.107718,0.093674,1,0.141465
HSP90AA1,0.084256,0.064417,0.067545,0.053394,0.060216,0.138425,0.091092,0.075767,1,0.138425
TXNIP,0.072761,0.062524,0.078618,0.05015,0.061881,0.130085,0.097237,0.077165,1,0.130085
RPS29,,0.048935,0.059699,0.045068,0.04383,0.127557,0.077837,0.075358,1,0.127557
HSP90B1,0.096415,0.087193,0.097994,0.070965,0.081657,0.125703,0.11458,0.086947,1,0.125703
TMSB10,0.08229,0.066111,0.066662,0.052755,0.05827,0.125657,0.0978,0.083252,1,0.125657
CYBA,0.077959,0.05885,0.069038,0.047743,0.059836,0.125076,0.114022,0.083372,1,0.125076
RPL35A,0.113907,0.084001,0.106049,0.069709,0.073304,0.116279,0.119139,0.103377,1,0.119139


In [15]:
# Write the de-duplicated, MT-filtered gene table to disk (row index included).
# NOTE(review): this cell's execution count (15) does not follow the previous
# cell (5); restart the kernel and run all cells before trusting the saved file.
all_samples_genes_attention_unique.to_csv("../Data/Papi_M_N.csv")