In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from scipy.stats import ttest_ind

In [2]:
# For every cell type, gather its GCAM+ result files, keep the top-weighted
# slice of each run, and summarise how often (and how strongly) each gene pair
# shows up across runs. One CSV per cell type is written next to the inputs.
path = 'res/'

class_names = ['Tip_Cells', 'activated_capillary', 'Immature_Phenotype',
               'capillary_I', 'capillary_II', 'Activated_EC', 'TandNK',
               'Epithelial', 'Myeloid', 'Fibro_Peri', 'B']

# Fraction of the first run's row count that is kept from every run.
TOP_FRACTION = 0.05


def load_ranked_result(file_path):
    """Read one tab-separated result file, ordered by descending weight.

    Parameters
    ----------
    file_path : str
        Location of a result file that contains a 'Normalized_Weight' column.

    Returns
    -------
    pandas.DataFrame
        The file's rows sorted by 'Normalized_Weight' (largest first) with a
        fresh 0..n-1 index.
    """
    return (
        pd.read_csv(file_path, sep='\t')
        .sort_values(by='Normalized_Weight', ascending=False)
        .reset_index(drop=True)
    )


def summarize_gene_pairs(ranked_frames, top_fraction=TOP_FRACTION):
    """Aggregate the top-ranked gene pairs of several runs into one table.

    Parameters
    ----------
    ranked_frames : list of pandas.DataFrame
        One frame per run, already sorted so the rows to keep come first.
        Columns are read by position: 0 and 1 form the gene pair, 2 is the
        weight.
        NOTE(review): ranking uses the 'Normalized_Weight' column by name while
        the statistics use column 2 by position -- confirm they are the same
        column in the result files.
    top_fraction : float, optional
        Share of the *first* frame's row count that is kept from every frame.

    Returns
    -------
    pandas.DataFrame
        One row per distinct pair with the mean, population variance,
        population standard deviation, median and coefficient of variation
        (in percent, 0 when the mean is 0) of its weights, plus 'Count', the
        number of kept rows containing the pair. Rows are sorted by 'Count',
        largest first. The result is empty when no frames are given or when
        the cut keeps zero rows.
    """
    if not ranked_frames:
        return pd.DataFrame()

    # The cut-off is derived from the first run only and reused for all runs.
    top_n = int(len(ranked_frames[0].index) * top_fraction)
    # Output labels follow the last run's header, as the pair columns are
    # addressed by position everywhere else.
    first_label, second_label = ranked_frames[-1].columns[:2]

    # Single pass: the list length doubles as the occurrence count, and dict
    # insertion order keeps pairs in first-seen order.
    weights_by_pair = {}
    for frame in ranked_frames:
        top_rows = frame.head(n=top_n)
        for first, second, weight in zip(top_rows.iloc[:, 0],
                                         top_rows.iloc[:, 1],
                                         top_rows.iloc[:, 2]):
            weights_by_pair.setdefault((first, second), []).append(weight)

    records = []
    for (first, second), weights in weights_by_pair.items():
        mean_weight = np.mean(weights)
        std_dev_weight = np.std(weights)  # population std (ddof=0)
        records.append({
            first_label: first,
            second_label: second,
            'Mean Normalized_Weight': mean_weight,
            'Variance Normalized_Weight': np.var(weights),  # population variance
            'Std Dev Normalized_Weight': std_dev_weight,
            'Median Normalized_Weight': np.median(weights),
            # Guard against division by zero for an all-zero pair.
            'CV Normalized_Weight': std_dev_weight / mean_weight * 100 if mean_weight else 0,
            'Count': len(weights),
        })

    columns = [first_label, second_label,
               'Mean Normalized_Weight', 'Variance Normalized_Weight',
               'Std Dev Normalized_Weight', 'Median Normalized_Weight',
               'CV Normalized_Weight', 'Count']
    # Passing the columns explicitly keeps 'Count' present even with no rows,
    # so the sort below cannot fail on an empty table.
    return (
        pd.DataFrame(records, columns=columns)
        .sort_values(by='Count', ascending=False)
        .reset_index(drop=True)
    )


if not os.path.isdir(path):
    # Report the problem once instead of failing inside the loop.
    print(f"Result directory not found: {path}")
else:
    available_files = sorted(os.listdir(path))  # listed once for all cell types

    for cell in class_names:
        prefix = "gcamplus_result_" + cell + "_v"
        cell_files = [f for f in available_files
                      if f.startswith(prefix) and f.endswith(".txt")]
        if not cell_files:
            # Without this guard the cut-off of the previous cell type would
            # be reused (or a NameError raised on the first iteration).
            print(f"Skipped {cell}: no files matching {prefix}*.txt in {path}")
            continue

        dfs = [load_ranked_result(os.path.join(path, f)) for f in cell_files]
        pair_stats = summarize_gene_pairs(dfs)
        if pair_stats.empty:
            print(f"Skipped {cell}: the top {TOP_FRACTION:.0%} cut keeps no rows")
            continue

        # Pair/count view of the same table, kept under its former name in
        # case a later cell reads it.
        common_pairs_df_50 = pair_stats.iloc[:, [0, 1, -1]]

        # Save the results with the first gene column as the CSV index.
        stats_df_sorted = pair_stats.set_index(pair_stats.columns[0])
        file_name = "gcam_" + cell + "_res.csv"
        stats_df_sorted.to_csv(os.path.join(path, file_name))
        print(f"Completed: {file_name}")
