In [1]:
import pandas as pd
from glob import glob
from collections import OrderedDict
import pickle
import os

In [2]:
complete_annotation_df = pd.read_csv('../../active_files/complete_annotation.csv')

In [3]:
complete_annotation_df

Unnamed: 0,TTHERM_ID,seed_ortholog,evalue,score,eggNOG_OGs,max_annot_lvl,COG_category,Description,Preferred_name,GOs,...,BRITE,KEGG_TC,CAZy,BiGG_Reaction,PFAMs,TGD2021_description,peptide,common_name,InterPro,InterPro_description
0,TTHERM_00840110,5911.EAS05042,0.0,1278.5,"2E5M0@1|root,2SCE9@2759|Eukaryota,3ZEBN@5878|C...",5878|Ciliophora,-,-,-,-,...,-,-,-,-,-,hypothetical protein,MISSNQTADQENKVENKVANAEHVNQQSYDSIPQSLSPAVIAQIMD...,Unnamed,-,-
1,TTHERM_01082930,31033.ENSTRUP00000031008,1.9999999999999997e-28,94.2,"COG5078@1|root,KOG0418@2759|Eukaryota,38KYZ@33...",33208|Metazoa,O,Belongs to the ubiquitin-conjugating enzyme fa...,UBE2K,"GO:0000209,GO:0003674,GO:0003824,GO:0004842,GO...",...,"ko00000,ko00001,ko01000,ko04121",-,-,-,"UBA,UQ_con",ubiquitin-conjugating enzyme E2,MHKNILIILFQIFCCQIYTTIYTFYYFMANIVFIIHNVKLDLFSCF...,Unnamed,"IPR000608,IPR016135,IPR023313,IPR050113","Ubiquitin-conjugating enzyme E2,Ubiquitin-conj..."
2,TTHERM_01081610,5911.EAR82090,0.0,2110.7,"2A5FX@1|root,2RY9I@2759|Eukaryota",2759|Eukaryota,-,-,-,-,...,-,-,-,-,PRESAN,transmembrane protein putative,MSSQSPAKLNNQNCAAANQYYNDLESCVQGYCIKQQSGSGARGCFP...,Unnamed,-,-
3,TTHERM_00059210,5911.EAR87408,3.2999999999999998e-298,983.9,"COG0575@1|root,KOG1440@2759|Eukaryota,3ZAR9@58...",5878|Ciliophora,I,Cytidylyltransferase family,-,-,...,"ko00000,ko00001,ko00002,ko01000",-,-,-,CTP_transf_1,phosphatidate cytidylyltransferase,MSQVTNRSQKKSHQKRDEKSEEDSSDEKTDDFSEEELDKLQEAQKK...,Unnamed,"IPR000374,IPR016720","Phosphatidate cytidylyltransferase,Phosphatida..."
4,TTHERM_00535200,5911.EAS03184,0.0,2448.3,"2E77S@1|root,2SDUU@2759|Eukaryota",5911.EAS03184|-,S,Src homology 3 domains,-,-,...,-,-,-,-,-,beta-Pak interactive eXchange factor Src-like ...,MFTKSNSRSALAGLNSIVNSQNDSLTSRAQHQNYAKKDLTISNSTS...,Unnamed,"IPR001452,IPR036028,IPR051569","SH3 domain,SH3-like domain superfamily,SH3 and..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26682,TTHERM_00657320,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,-,MNIIQKLVCINQNKQNDIYVDIKRDLQSYLILEIQPEFSRQIGDLQ...,Unnamed,-,-
26683,TTHERM_00656000,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,hypothetical protein,MFQVSRNIKYLQAISNKVVLQNAQFHLFNFQKFDLSTINKDETTRN...,Unnamed,-,-
26684,TTHERM_00989420,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,Nucleotide-binding oligomerization domain-cont...,MIPPSKRIDFGRNIVPTQFLPKDGYKARNLKTEAAQQLTENKRYQT...,Unnamed,-,-
26685,TTHERM_01245650,-,-,-,-,-,-,-,-,-,...,-,-,-,-,-,transmembrane protein putative,MLFNQQFLIFSQIINQSCSNLQMTSPKKKKGQTLKNSYKKTGLRTS...,Unnamed,-,-


In [4]:
microarray_exprs_df = pd.read_csv('../../active_files/allgood_filt_agg_tidy_2021aligned_qc_rma_expression_full.csv')
rna_seq_exprs_df = pd.read_csv('../../active_files/rna_seq.csv')

In [5]:
term_cols = ['COG_category', 'GOs', 'KEGG_ko', 'EC', 'PFAMs', 'InterPro']

annotation_stats_dict = OrderedDict()

for k, df in OrderedDict([('microarray', microarray_exprs_df), ('rna_seq', rna_seq_exprs_df),]).items():
    if k not in annotation_stats_dict:
        annotation_stats_dict[k] = OrderedDict()

    num_total_genes = df.shape[0]
    annotation_stats_dict[k][f'# total genes'] = num_total_genes

    for col in term_cols:

        num_term_genes = complete_annotation_df.loc[
            (complete_annotation_df['TTHERM_ID'].isin(df['TTHERM_ID'].values))
            &
            (complete_annotation_df[col] != '-')
        ].shape[0]

        annotation_stats_dict[k][f'# genes with {col} terms'] = num_term_genes

        annotation_stats_dict[k][f'fraction genes with {col} terms'] = num_term_genes / num_total_genes

annotation_stats_dict

OrderedDict([('microarray',
              OrderedDict([('# total genes', 20428),
                           ('# genes with COG_category terms', 11235),
                           ('fraction genes with COG_category terms',
                            0.5499804190327002),
                           ('# genes with GOs terms', 1593),
                           ('fraction genes with GOs terms',
                            0.0779812022713922),
                           ('# genes with KEGG_ko terms', 6554),
                           ('fraction genes with KEGG_ko terms',
                            0.32083414920697084),
                           ('# genes with EC terms', 2891),
                           ('fraction genes with EC terms',
                            0.14152144115919327),
                           ('# genes with PFAMs terms', 9908),
                           ('fraction genes with PFAMs terms',
                            0.4850205600156648),
                           ('# ge

In [6]:
pd.DataFrame.from_dict(annotation_stats_dict, orient='index').to_clipboard()

In [7]:
cluster_stats_files = sorted([fp for fp in glob('./*.pkl') if 'rna_seq' in fp or 'microarray' in fp])

cluster_stats_dict = OrderedDict()

for path in cluster_stats_files:
    with open(path, 'rb') as f:
        cluster_stats_dict[os.path.splitext(os.path.basename(path))[0]] = pickle.load(f)

cluster_stats_dict

OrderedDict([('min_max_normalization_microarray_full',
              {'partition_type': 'EXP',
               'dimensionality': 'baseline',
               'metric': 'manhattan',
               'graph': 'umap_fuzzy_simplicial_set',
               'nns': 3,
               'clustering': 'leiden_cpm',
               'parameter': 0.005,
               'silhouette_score': -0.022923077635525518,
               'modularity': 0.7700419406476356,
               'nclusters': 636,
               'mean_cluster_size': 32.119496855345915,
               'median_cluster_size': 30.0,
               'sd_cluster_size': 12.645189395227938,
               'q1_cluster_size': 23.0,
               'q3_cluster_size': 40.0,
               'max_cluster_size': 82,
               'min_cluster_size': 3,
               'ngenes': 20428,
               'nenriched_clusters': 250,
               'mean_enriched_cluster_size': 34.572,
               'median_enriched_cluster_size': 32.0,
               'sd_enriched_cluster

In [8]:
pd.DataFrame.from_dict(cluster_stats_dict, orient='index').to_clipboard()