In [None]:
from db_inference.simple_sql_db import SimpleSqlDb
from db_inference import calc_icity
from tqdm import tqdm
from utils import notebook_util, ggdb_logging
from collections import defaultdict

import os
import json
# Notebook display setup via the project's utils.notebook_util helper.
# NOTE(review): judging by its name this widens the notebook to the full
# browser width -- confirm against utils/notebook_util.
notebook_util.disp_notebook_full_width()

In [None]:
# Shared database handle used by the cells below for p100 -> p30 cluster
# lookups, windowed-neighbor queries, and as the `db` argument to
# calc_icity.build_icity_graph.
# NOTE(review): constructed with no arguments, so connection settings
# presumably come from defaults inside SimpleSqlDb -- confirm.
db = SimpleSqlDb()

In [None]:
# Text file listing the bait p100 identifiers, one per line.
INPUT_FILE = "/GeneGraphDB/data/jacob_baits_20220202/cas1.txt"
# Results go to the icity results directory, named after the input file with
# its extension swapped for ".json" (e.g. cas1.txt -> cas1.json).
OUTPUT_FILE = os.path.join(
    "/GeneGraphDB/data/icity_results/",
    os.path.splitext(os.path.basename(INPUT_FILE))[0] + ".json",
)

# Create the results directory up front so the final write cannot fail on a
# missing directory after the long-running computation.
os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)

# Read from INPUT_FILE rather than a second hard-coded copy of the path, so the
# file that is read always matches the one that is logged and that names the output.
with open(INPUT_FILE, "r") as f:
    baits = [line.strip() for line in f]

# jacob said files here accidentally have 20 chars instead of 18
# Keep only the first 18 characters of each entry, and drop blank lines so an
# empty string is never looked up in the database as a bait.
baits = [b[:18] for b in baits if b]

In [None]:
# Report how many baits were loaded (this is the count before trimming).
ggdb_logging.info(f"Found {len(baits)} baits in file {INPUT_FILE}")

# Restrict the run to the first ten baits; the trailing expression displays
# the resulting count as the cell output.
baits = baits[:10]
len(baits)

In [None]:
%%time
# Memo of successful p100 -> p30 cluster lookups, shared by baits and targets.
p100_to_p30 = {}
# icity results keyed by "<p30>|<other_p30>". Each bait/target cluster pair is
# stored under both orderings; the entry's "bait_hash" field holds the second
# p30 of its key.
icity_results = {}

for bait in baits:
    ggdb_logging.info(f"Running bait {bait}")
    if bait not in p100_to_p30:
        bait_p30_row = db.get_p30_cluster_for_p100(bait)
        if bait_p30_row is None:
            # The lookup can return None (targets below are guarded for it);
            # skip the bait instead of crashing the whole run on a subscript
            # of None.
            ggdb_logging.info(f"Skipping bait {bait}: missing bait p30")
            continue
        p100_to_p30[bait] = bait_p30_row["p30"]

    bait_p30 = p100_to_p30[bait]

    bait_neighbors = db.get_p100_windowed_neighbors(bait)
    ggdb_logging.info(f"Bait {bait} has {len(bait_neighbors)} neighbors")

    for tgt in bait_neighbors:
        if tgt not in p100_to_p30:
            tgt_p30_row = db.get_p30_cluster_for_p100(tgt)
            if tgt_p30_row is None:
                ggdb_logging.info("Skipping missing target p30")
                continue
            p100_to_p30[tgt] = tgt_p30_row["p30"]
        tgt_p30 = p100_to_p30[tgt]

        # Both orderings of a pair are stored together below, so checking the
        # target-first key is enough to know the pair was already computed.
        tgt_first_key = f"{tgt_p30}|{bait_p30}"
        if tgt_first_key in icity_results:
            ggdb_logging.info(f"cache hit for {tgt_first_key}")
            continue
        ggdb_logging.info(f"Computing icity for {tgt_first_key}")

        # Build the graph once per cluster pair and reuse it for both directions.
        icity_graph = calc_icity.build_icity_graph(db, tgt_p30, bait_p30)

        tgt_first_icity = calc_icity.compute_icity_on_graph(icity_graph, tgt_p30)
        tgt_first_icity["bait_hash"] = bait_p30
        icity_results[tgt_first_key] = tgt_first_icity

        # Reverse direction: computed on the bait cluster, with the target
        # cluster recorded in the "bait_hash" field.
        bait_first_key = f"{bait_p30}|{tgt_p30}"
        bait_first_icity = calc_icity.compute_icity_on_graph(icity_graph, bait_p30)
        bait_first_icity["bait_hash"] = tgt_p30
        icity_results[bait_first_key] = bait_first_icity


In [None]:
# Persist every collected icity result as indented JSON.
with open(OUTPUT_FILE, "w") as results_file:
    json.dump(icity_results, results_file, indent=2)

ggdb_logging.info(f"Wrote to file {OUTPUT_FILE}")