In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import pickle

## Load trees from results directory

In [5]:
# Pick the last results directory by name. glob makes no ordering guarantee
# (it depends on the file system), so sort explicitly before taking [-1].
# NOTE(review): directory names look like ISO dates (e.g. "2022-04-01"), in
# which case lexicographic order is chronological and this is the newest run — confirm.
results = sorted(glob.glob("../nextflow/results/*"))[-1]
results

'../nextflow/results/2022-04-01'

In [3]:
def gc_key(file):
    """Return the ``(mouse, GC)`` integer pair encoded in a tree file's parent directory.

    The second-to-last "/"-separated component of ``file`` is split on "-";
    the fields at positions 1 and 3 are converted to ``int`` and returned as a
    tuple, which makes this usable as a numeric sort key.
    """
    fields = file.split("/")[-2].split("-")
    mouse_number = int(fields[1])
    gc_number = int(fields[3])
    return mouse_number, gc_number

def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point this at files produced by a trusted pipeline run.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# All pickled trees under the selected results directory, ordered numerically
# by the (mouse, GC) pair that gc_key extracts from each path.
files = sorted(glob.glob(f"{results}/gctrees/PR*/gctree.p"), key=gc_key)

# Map "mouse<M>_GC<G>" -> unpickled tree object, preserving the sorted order.
trees = {"mouse{}_GC{}".format(*gc_key(file)): _load_pickle(file)
         for file in files}

print(f"loaded {len(trees)} GC trees")

loaded 77 GC trees


## Data frame with a row for each node on each tree, excluding 10-week mice

In [7]:
# Key prefixes of the GCs left out of the table ("no 10-week mice").
# NOTE(review): this is a plain prefix match, so "mouse7" would also match a
# hypothetical "mouse70" — confirm mouse IDs stay single-digit.
EXCLUDED_GC_PREFIXES = ("mouse7", "mouse8")

# Column order of the resulting data frame.
NODE_COLUMNS = ["GC",
                "name",
                "aa_mutations",
                "polytomy_degree",
                "LBI",
                "root_distance",
                "closest_leaf_distance",
                "farthest_leaf_distance",
                "delta_bind",
                "delta_expr",
                "delta_psr"]

# One record per node per tree; excluded GCs are skipped here, once, so no
# second filtering pass over the data frame is needed.
rows = []
for gc, tree in trees.items():
    if gc.startswith(EXCLUDED_GC_PREFIXES):
        continue
    for node in tree.tree.traverse():
        # Distances from this node to every node visited by node.traverse()
        # that has nonzero abundance (presumably the node itself is included —
        # leaf rows in the output show a distance of 0).
        # Assumes at least one such node exists; otherwise min()/max() below
        # raise ValueError on the empty list.
        descendant_distances = [node.get_distance(descendant)
                                for descendant in node.traverse()
                                if descendant.abundance]
        rows.append({
            "GC": gc,
            "name": node.name,
            "aa_mutations": ";".join(node.mutations),
            # number of child branches plus the node's own abundance
            "polytomy_degree": len(node.children) + node.abundance,
            "LBI": node.LBI,
            "root_distance": node.get_distance(tree.tree),
            "closest_leaf_distance": min(descendant_distances),
            "farthest_leaf_distance": max(descendant_distances),
            "delta_bind": node.delta_bind,
            "delta_expr": node.delta_expr,
            "delta_psr": node.delta_psr,
        })

# Passing columns= fixes the column order and keeps the schema even if no rows
# were collected.
df = pd.DataFrame(rows, columns=NODE_COLUMNS)

df

Unnamed: 0,GC,name,aa_mutations,polytomy_degree,LBI,root_distance,closest_leaf_distance,farthest_leaf_distance,delta_bind,delta_expr,delta_psr
0,mouse1_GC1,naive,,5,9.949465,0.0,1.0,12.0,0.00000,0.00000,0.00000
1,mouse1_GC1,seq37,Y103(H)F;S20(L)N;L112(L)P,1,1.163915,4.0,0.0,0.0,-0.06082,0.14827,-0.13364
2,mouse1_GC1,seq47,E51(H)G;Y52(H)F;Y66(L)N;V71(L)I;D74(L)A;Q95(L)...,1,1.000149,11.0,0.0,0.0,-3.45668,-1.05627,3.34321
3,mouse1_GC1,seq56,Y58(H)D,2,5.252151,1.0,0.0,2.0,-0.80796,-0.15104,-0.15010
4,mouse1_GC1,25,N40(H)H;N108(L)D;Y110(L)S;E121(L)Q,2,3.107390,4.0,3.0,5.0,-7.71074,-0.77924,2.61806
...,...,...,...,...,...,...,...,...,...,...,...
5587,mouse6_GC77,seq37,S57(H)N;S64(H)R;T65(H)I;L89(H)V;A105(H)G,4,2.629712,8.0,0.0,1.0,1.08960,-0.12466,-0.23740
5588,mouse6_GC77,seq19,S57(H)N;A40(L)G;Y42(L)H;T77(L)I,1,1.528039,9.0,0.0,0.0,0.75399,-0.72151,-0.37767
5589,mouse6_GC77,seq40,S57(H)N;A40(L)G;Y42(L)H;T77(L)I;S109(L)N,2,3.395186,8.0,0.0,2.0,0.71905,-0.86401,-0.35812
5590,mouse6_GC77,seq56,S57(H)N;S64(H)R;T65(H)I;L89(H)V;Y103(H)F;A105(H)G,1,1.599537,9.0,0.0,0.0,1.13203,-0.13537,0.10666


In [13]:
# Optional export of the node table to CSV; left disabled so that re-running
# the notebook does not write outside the repository. Uncomment to export.
# df.to_csv("~/Downloads/node_data.csv", index=False)