In [1]:
import pandas as pd
import os
import numpy as np

# Step 1: Load the dataframe
# Tab-separated table listing, per row, a matrix type and comma-separated TF names.
input_list_path = '../../../out_results/out_shap_values/list_from_shap_correlated_env.tsv'
# Root directory of the per-TF matrices (reg_genes_<matrix_type>_by_TF/...) read by process_tfs.
input_matrices_dir = '../../00_matrices/'
df = pd.read_csv(input_list_path, sep='\t')

# Step 2 and 3: Process each TF and matrix type

# Create a function to process the TFs from a column
def process_tfs(tfs_column, data=None, matrices_dir=None, matrix_types=('M0', 'M1')):
    """Collect, for every listed TF, the columns (genes) that are most often non-zero.

    For each row of the TF table whose ``matrix_type`` is in ``matrix_types``
    and whose ``tfs_column`` cell is not NaN, the cell is split on commas into
    TF names. For each non-empty name the file
    ``<matrices_dir>/reg_genes_<matrix_type>_by_TF/reg_genes_<matrix_type>_<tf>.tsv``
    is read (tab-separated, first column used as the index), the non-zero
    entries of every column are counted, and the ``n_columns // 100`` columns
    with the highest counts are kept.

    Parameters
    ----------
    tfs_column : str
        Name of the column holding comma-separated TF names.
    data : pandas.DataFrame, optional
        Table with a ``matrix_type`` column and the ``tfs_column`` column.
        Defaults to the notebook-level ``df``.
    matrices_dir : str or path-like, optional
        Root directory containing the ``reg_genes_<matrix_type>_by_TF``
        folders. Defaults to the notebook-level ``input_matrices_dir``.
    matrix_types : str or iterable of str, optional
        Matrix types to process; rows with any other type are ignored.

    Returns
    -------
    list of tuple
        One ``(matrix_type, tf, columns)`` tuple per matrix file found, where
        ``columns`` is the list of selected column labels, highest count first.

    Notes
    -----
    * The cutoff uses floor division, so a matrix with fewer than 100 columns
      yields an empty list.
    * A missing matrix file is reported with ``print`` and skipped.
    """
    # Fall back to the notebook-level objects so existing calls keep working.
    if data is None:
        data = df
    if matrices_dir is None:
        matrices_dir = input_matrices_dir
    # A bare string would otherwise be iterated character by character.
    if isinstance(matrix_types, str):
        matrix_types = (matrix_types,)
    wanted_types = set(matrix_types)

    results = []
    for matrix_type, cell in zip(data['matrix_type'], data[tfs_column]):
        if matrix_type not in wanted_types or pd.isna(cell):
            continue
        for tf in (token.strip() for token in str(cell).split(',')):
            if not tf:
                # Trailing or doubled commas produce empty names; there is no file to look up.
                continue
            # os.path.join avoids a doubled separator when matrices_dir already ends with '/'.
            file_path = os.path.join(
                matrices_dir,
                f"reg_genes_{matrix_type}_by_TF",
                f"reg_genes_{matrix_type}_{tf}.tsv",
            )
            if not os.path.isfile(file_path):
                print(f"File not found {file_path}")
                continue
            matrix = pd.read_csv(file_path, sep='\t', index_col=0)
            # Number of non-zero entries in each column.
            non_zero_counts = (matrix != 0).sum(axis=0)
            # Stable sort: columns with equal counts are ordered deterministically
            # (by their original column order in current pandas), so ties at the
            # cutoff do not depend on the sorting algorithm's internals.
            sorted_genes = non_zero_counts.sort_values(ascending=False, kind='stable')
            top_1_percent = sorted_genes.head(len(sorted_genes) // 100)
            results.append((matrix_type, tf, list(top_1_percent.index)))
    return results

# Run the extraction for one TF column: 'shap_top_tfs'.
results_shap_top_tfs = process_tfs(tfs_column='shap_top_tfs')
# The 'shap_top_tfs_correlated' and 'top_tfs_correlated' columns can be
# processed the same way when needed.

# Remaining work: save the results or continue with steps 4 and 5.


In [7]:
# Display the (matrix_type, TF, selected column labels) tuples built by process_tfs('shap_top_tfs').
results_shap_top_tfs

[('M0',
  'AgaR',
  ['lysc',
   'tsad',
   'folp',
   'ubia',
   'dnaa',
   'uvra',
   'msba',
   'pola',
   'tyrs',
   'leus',
   'mnma',
   'smpb',
   'aroc',
   'pure',
   'gyrb',
   'purd',
   'gros',
   'nrda',
   'rsmi',
   'rdgb',
   'arok',
   'def',
   'grpe',
   'msra',
   'lepa',
   'rne',
   'ilve',
   'glpk',
   'prfa',
   'mraz',
   'capm',
   'hemd',
   'metc',
   'metg',
   'faba',
   'cara',
   'pyrd',
   'hslv',
   'asps',
   'fabg',
   'bcp',
   'panc',
   'uvrb',
   'fabd',
   'rlud',
   'lptf',
   'dapa',
   'nade',
   'fole',
   'vals',
   'msrb',
   'pyre',
   'pepa',
   'fur',
   'pros',
   'ychf',
   'rho',
   'seca',
   'lgt',
   'succ',
   'aspc',
   'folk',
   'nth',
   'mrp',
   'ycea',
   'uppp',
   'tig',
   'rnhb',
   'pgk',
   'moaa',
   'recg',
   'trps',
   'mfd',
   'purp',
   'ribf',
   'argg',
   'purm',
   'aroq',
   'ma20_12960',
   'gyra',
   'arob',
   'fjo14',
   'prfb',
   'ctaa',
   'rpe',
   'rmuc',
   'dnax',
   'ma20_36760',
   'rsmh',
  