In [1]:
import csv
import math 
import pandas as pd
import numpy

from typing import List
from typing_extensions import TypedDict


class DVStat(TypedDict):
    """Typed record holding the statistics read from one row of the input CSV.

    ``find_unique_common_dvgenes`` builds one of these per CSV row; the
    comments below name the CSV column each field is filled from.
    """
    # Filled from the ``t_statistic`` column.
    statistic: float
    # Filled from the ``p_value`` column.
    pvalue: float
    # Filled from the ``adjusted_p_value`` column.
    adj_pvalue: float
    # Filled from the ``difference`` column.
    ratio: float
    # Filled from the ``size_control`` column.
    control_size: int
    # Filled from the ``size_treated`` column.
    ecig_size: int
    # Filled from the ``region`` column.
    region: str
       

class DVgene(TypedDict):
    """Typed record grouping every ``DVStat`` collected for a single gene.

    These records are the values of the gene-name -> record mapping returned
    by ``find_unique_common_dvgenes``.
    """
    # DataFrame row label of the first CSV row in which the gene appears.
    idx: int
    # Value of the ``gene`` column (also used as the mapping key).
    name: str
    # One entry per CSV row seen for this gene, in file order.
    dv_stats: List[DVStat]
    

In [10]:
def find_unique_common_dvgenes(csv_path: str, number_of_regions: int = 10):
    """Load per-region gene statistics from a CSV and classify the genes.

    Every CSV row becomes one ``DVStat`` record, and records are grouped by the
    value of the ``gene`` column in order of first appearance.

    The CSV must provide the columns ``gene``, ``region``, ``t_statistic``,
    ``p_value``, ``adjusted_p_value``, ``size_control``, ``size_treated`` and
    ``difference``.

    Args:
        csv_path: Path of the comma-separated, UTF-8 encoded input file.
        number_of_regions: Number of rows a gene must have to be reported as
            "common".

    Returns:
        A 3-tuple ``(gene_to_stats, unique_dv_genes, common_dv_genes)``:

        * ``gene_to_stats`` maps each gene name to a ``DVgene`` record.
        * ``unique_dv_genes`` lists the names of genes with exactly one row.
        * ``common_dv_genes`` lists the names of genes with exactly
          ``number_of_regions`` rows.

    Raises:
        KeyError: If a required column is missing and the file has at least
            one data row.

    Note:
        "Common" is decided by the number of rows per gene, not by the number
        of distinct regions; duplicate (gene, region) rows would therefore be
        counted more than once.
    """
    df = pd.read_csv(
        csv_path,
        delimiter=",",
        encoding="utf-8",
        index_col=None
    )

    gene_to_stats = {}
    for index, row in df.iterrows():
        gene_name = row['gene']
        dv_stat: DVStat = {
            'statistic': row['t_statistic'],
            'pvalue': row['p_value'],
            'adj_pvalue': row['adjusted_p_value'],
            'control_size': row['size_control'],
            'ecig_size': row['size_treated'],
            'ratio': row['difference'],
            'region': row['region']
        }
        if gene_name not in gene_to_stats:
            # First row for this gene: remember the row label it was found at.
            dv_gene: DVgene = {
                'idx': index,
                'name': gene_name,
                'dv_stats': [dv_stat]
            }
            gene_to_stats[gene_name] = dv_gene
        else:
            gene_to_stats[gene_name]['dv_stats'].append(dv_stat)

    # Dict iteration follows insertion order, so both lists keep the order in
    # which genes first appear in the file.
    unique_dv_genes = [
        name for name, record in gene_to_stats.items()
        if len(record['dv_stats']) == 1
    ]
    common_dv_genes = [
        name for name, record in gene_to_stats.items()
        if len(record['dv_stats']) == number_of_regions
    ]
    return gene_to_stats, unique_dv_genes, common_dv_genes


def group_dvg_genes_per_region(dvg_db, unique_dvg, common_dvg):
    """Print how many unique genes fall in each region and how many genes are common.

    Args:
        dvg_db: Mapping of gene name to a record whose ``'dv_stats'`` list holds
            per-row dicts with a ``'region'`` key.
        unique_dvg: Gene names to bucket by the region of their first stat entry.
        common_dvg: Gene names regarded as common; only their count is printed.

    Returns:
        None. Only the counts are written to standard output, not the names.
    """
    unique_region_to_genes = {}
    for gene_name in unique_dvg:
        first_stat = dvg_db[gene_name]['dv_stats'][0]
        # setdefault creates the bucket the first time a region is seen.
        unique_region_to_genes.setdefault(first_stat['region'], []).append(gene_name)

    # One summary line per region, in the order regions were first encountered.
    for region, genes in unique_region_to_genes.items():
        print(f"======== {len(genes)} unique DV Genes for {region} ======== ")

    print(f"======== {len(common_dvg)} common DVGenes ========")


#dvg_db, unique_dvg, common_dvg = find_unique_common_dvgenes("./output/plim10eminus6/mp_dva_ext_heatmap_data.csv", 10)
#group_dvg_genes_per_region(dvg_db, unique_dvg, common_dvg)


In [11]:
# Print the unique/common gene summary for each of the four result files in turn.
for val in ("fa", "ma", "fp", "mp"):
    dvg_db, unique_dvg, common_dvg = find_unique_common_dvgenes(
        csv_path=f"./output/final/v0/{val}_dva_ext_heatmap_data.csv",
        number_of_regions=10,
    )
    print(val)
    group_dvg_genes_per_region(dvg_db, unique_dvg, common_dvg)


fa
ma
fp
mp


In [4]:
### MA


In [12]:
# Summarise the "ma" result file on its own; number_of_regions keeps its default.
dvg_db, unique_dvg, common_dvg = find_unique_common_dvgenes(
    csv_path="./output/final/v0/ma_dva_ext_heatmap_data.csv",
)
group_dvg_genes_per_region(dvg_db=dvg_db, unique_dvg=unique_dvg, common_dvg=common_dvg)




In [7]:
# Summarise the "fp" result file on its own; number_of_regions keeps its default.
dvg_db, unique_dvg, common_dvg = find_unique_common_dvgenes(
    csv_path="./output/final/v0/fp_dva_ext_heatmap_data.csv",
)
group_dvg_genes_per_region(dvg_db=dvg_db, unique_dvg=unique_dvg, common_dvg=common_dvg)




In [8]:
# Summarise the "mp" result file on its own; number_of_regions keeps its default.
dvg_db, unique_dvg, common_dvg = find_unique_common_dvgenes(
    csv_path="./output/final/v0/mp_dva_ext_heatmap_data.csv",
)
group_dvg_genes_per_region(dvg_db=dvg_db, unique_dvg=unique_dvg, common_dvg=common_dvg)




In [None]:
#Plim = 10e-3

In [13]:
# Summarise the "fa" result file on its own; number_of_regions keeps its default.
dvg_db, unique_dvg, common_dvg = find_unique_common_dvgenes(
    csv_path="./output/final/v0/fa_dva_ext_heatmap_data.csv",
)
group_dvg_genes_per_region(dvg_db=dvg_db, unique_dvg=unique_dvg, common_dvg=common_dvg)

