## Curate Clusters

In [None]:
import trenchripper.trenchripper as tr

import anndata
import warnings
import random
import ast
import re
import goatools
import goatools.base
import urllib.request

import numpy as np
import pandas as pd

from goatools.base import download_go_basic_obo
from goatools.obo_parser import GODag
from goatools.anno.gaf_reader import GafReader
from dna_features_viewer import BiopythonTranslator
from Bio.SeqFeature import SeqFeature,FeatureLocation
from Bio import SeqIO
from Bio.SeqUtils import nt_search
from Bio.Seq import Seq

from matplotlib import pyplot as plt

# Seed both the stdlib and NumPy global RNGs so any randomised step run from
# this notebook is repeatable between sessions.
random.seed(42)
np.random.seed(42)

# Print each distinct UserWarning only the first time it is raised instead of
# on every occurrence.
warnings.filterwarnings(action='once',category=UserWarning)

In [None]:
class sgRNA_Explorer(BiopythonTranslator):
    """Translator that colours genome features for sgRNA target-site plots.

    Feature colours, keyed on ``feature.type``:

    - ``CDS``: blue (``#1f77b4``)
    - ``terminator``: green (``#279e68``)
    - ``promoter``: purple (``#aa40fc``)
    - ``Subset-sgRNA``: red (``#ff6a6a``)
    - ``Cluster-sgRNA``: yellow (``#eee685``)
    - ``Dataset-sgRNA``: teal (``#20b2aa``)
    - anything else: light blue (``#aec7e8``)

    Parameters
    ----------
    ignored_features_types : list of str, optional
        Stored on the instance as ``ignored_features_types``; presumably read
        by ``BiopythonTranslator`` to skip features of these types (confirm
        against the dna_features_viewer docs). Defaults to ``["CDS"]``.
    """

    # Lookup table replacing the original if/elif chain.
    FEATURE_COLORS = {
        "CDS": "#1f77b4",
        "terminator": "#279e68",
        "promoter": "#aa40fc",
        "Subset-sgRNA": "#ff6a6a",
        "Cluster-sgRNA": "#eee685",
        "Dataset-sgRNA": "#20b2aa",
    }
    DEFAULT_COLOR = "#aec7e8"

    def __init__(self, ignored_features_types=None):
        # A None sentinel avoids sharing one mutable default list between
        # every instance created without an explicit argument.
        if ignored_features_types is None:
            ignored_features_types = ["CDS"]
        self.ignored_features_types = ignored_features_types
        super().__init__()

    def compute_feature_color(self, feature):
        """Return the hex colour string for ``feature`` based on its type."""
        return self.FEATURE_COLORS.get(feature.type, self.DEFAULT_COLOR)
        
def add_promoters_to_genbank(genome_record,promoter_df):
    """Append one zero-length "promoter" feature per row of ``promoter_df``.

    Parameters
    ----------
    genome_record : SeqRecord
        Record whose ``features`` list is extended. It is modified in place
        and the same object is returned.
    promoter_df : pandas.DataFrame
        Must provide the columns ``"Strand"``, ``"TSS"`` and ``"Name"``.
        Rows whose ``"Strand"`` is ``"forward"`` go on strand +1; every other
        value is placed on strand -1.

    Returns
    -------
    SeqRecord
        ``genome_record`` itself, with the promoter features appended.
    """
    promoter_features = []
    for _, promoter in promoter_df.iterrows():
        strand = 1 if promoter["Strand"] == "forward" else -1
        tss = promoter["TSS"]
        # The strand is set on the location: the ``strand`` keyword of
        # SeqFeature is deprecated in recent Biopython releases, while the
        # location-level strand works across versions.
        promoter_feature = SeqFeature(
            location=FeatureLocation(tss, tss, strand=strand),
            type="promoter",
        )
        promoter_feature.qualifiers["gene"] = promoter["Name"]
        promoter_features.append(promoter_feature)
    genome_record.features = genome_record.features + promoter_features
    return genome_record

def display_target_sites(genome_record,target_site_list,translator,view_pad=1000):
    """Plot one figure per target site showing its genomic neighbourhood.

    Parameters
    ----------
    genome_record : SeqRecord
        Annotated reference record that is sliced around each site.
    target_site_list : iterable
        Each entry is indexable as ``(start, end, strand)`` with ``strand``
        being ``"+"`` or ``"-"``.
    translator : BiopythonTranslator
        Converts the sliced record into a plottable graphic record.
    view_pad : int
        Number of bases shown on either side of the site.

    Notes
    -----
    The sgRNA is drawn as a 20-base feature starting at the site's start
    coordinate; ``target_site[1]`` only determines the right edge of the view.
    """
    strand_dict = {"+": 1, "-": -1}

    for target_site in target_site_list:
        # Clamp to 0: a negative slice start would wrap around to the end of
        # the record and produce an empty or wrong window.
        start_coord = max(target_site[0] - view_pad, 0)
        end_coord = target_site[1] + view_pad

        sub_genome_record = genome_record[start_coord:end_coord]

        # Equals view_pad whenever no clamping was needed.
        sgRNA_start = target_site[0] - start_coord
        sgRNA = SeqFeature(
            location=FeatureLocation(sgRNA_start, sgRNA_start + 20, strand=strand_dict[target_site[2]]),
            type="sgRNA",
        )
        sgRNA.qualifiers["gene"] = "sgRNA"

        sub_genome_record.features = sub_genome_record.features + [sgRNA]
        graphic_record = translator.translate_record(sub_genome_record)
        graphic_record.plot(figure_width=10, strand_in_label_threshold=7)
                
def display_target_sites_single_locus(genome_record,subset_target_site_dict,cluster_target_site_dict,dataset_target_site_dict,translator,view_pad=2000,outer_context_pad=20000):
    """Plot the locus around the first subset target site with all sgRNAs overlaid.

    The view is anchored on the first entry of ``subset_target_site_dict``.
    A window of ``outer_context_pad`` bases on either side of that site is
    sliced out of ``genome_record``, every target site from the three dicts is
    added as a feature (types ``"Subset-sgRNA"``, ``"Cluster-sgRNA"`` and
    ``"Dataset-sgRNA"``), and the plot is cropped to ``view_pad`` bases around
    the anchor.

    Parameters
    ----------
    genome_record : SeqRecord
        Annotated reference record.
    subset_target_site_dict, cluster_target_site_dict, dataset_target_site_dict : dict
        Map a target id to a site indexable as ``(start, end, strand)`` with
        ``strand`` being ``"+"`` or ``"-"``.
    translator : BiopythonTranslator
        Converts the sliced record into a plottable graphic record.
    view_pad : int
        Bases displayed on either side of the anchor site.
    outer_context_pad : int
        Bases sliced on either side of the anchor before cropping.

    Raises
    ------
    IndexError
        If ``subset_target_site_dict`` is empty (there is no site to anchor on).
    """
    strand_dict = {"+": 1, "-": -1}

    if not subset_target_site_dict:
        raise IndexError("subset_target_site_dict is empty; no target site to centre the view on")
    anchor_site = next(iter(subset_target_site_dict.values()))

    # Clamp to 0: a negative slice start would wrap around to the end of the
    # record and yield an empty window.
    outer_start_coord = max(anchor_site[0] - outer_context_pad, 0)
    outer_end_coord = anchor_site[1] + outer_context_pad

    # Anchor position expressed in the sliced record's coordinates.
    start_coord = anchor_site[0] - outer_start_coord
    end_coord = anchor_site[1] - outer_start_coord

    sub_genome_record = genome_record[outer_start_coord:outer_end_coord]

    sgRNA_features = []
    typed_site_dicts = (
        ("Subset-sgRNA", subset_target_site_dict),
        ("Cluster-sgRNA", cluster_target_site_dict),
        ("Dataset-sgRNA", dataset_target_site_dict),
    )
    for feature_type, target_site_dict in typed_site_dicts:
        for targetid, target_site in target_site_dict.items():
            location = FeatureLocation(
                target_site[0] - outer_start_coord,
                target_site[1] - outer_start_coord,
                strand=strand_dict[target_site[2]],
            )
            sgRNA = SeqFeature(location=location, type=feature_type)
            sgRNA.qualifiers["gene"] = str(targetid)
            sgRNA_features.append(sgRNA)
    sub_genome_record.features = sub_genome_record.features + sgRNA_features

    graphic_record = translator.translate_record(sub_genome_record)
    # Keep the crop window inside the sliced record so loci near either end of
    # the genome (or view_pad > outer_context_pad) are still plotted.
    crop_start = max(start_coord - view_pad, 0)
    crop_end = min(end_coord + view_pad, len(sub_genome_record))
    cropped_record = graphic_record.crop((crop_start, crop_end))
    cropped_record.plot(figure_width=10, strand_in_label_threshold=7)

def target_subset_to_target_dict(df,selected_df,targetid_subset_list):
    """Split target sites into subset / rest-of-cluster / rest-of-dataset dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset; needs the columns ``"TargetID"`` and ``"Target Sites"``
        (each ``"Target Sites"`` value is an iterable of sites).
    selected_df : pandas.DataFrame
        Rows of the cluster of interest; needs a ``"TargetID"`` column.
    targetid_subset_list : list
        TargetIDs of interest; only those present in ``selected_df`` are kept.

    Returns
    -------
    tuple of dict
        ``(subset, cluster, dataset)``, each mapping TargetID to a single
        target site, with keys in ascending order:

        - subset: TargetIDs from ``targetid_subset_list`` found in ``selected_df``
        - cluster: TargetIDs in ``selected_df`` that are not in the subset
        - dataset: TargetIDs in ``df`` that are not in ``selected_df``

    Notes
    -----
    When a TargetID has several target sites only the last one is kept, and a
    TargetID with no target sites is dropped.
    """
    def _sites_by_targetid(targetids):
        # First row per TargetID, keys ascending (the order the previous
        # groupby-based implementation produced).
        rows = (
            df[df["TargetID"].isin(list(targetids))]
            .dropna(subset=["TargetID"])
            .drop_duplicates(subset="TargetID", keep="first")
            .sort_values("TargetID")
        )
        # Unwrapping target sites: later sites overwrite earlier ones.
        return {
            targetid: site
            for targetid, sites in zip(rows["TargetID"], rows["Target Sites"])
            for site in sites
        }

    in_subset = set(selected_df[selected_df["TargetID"].isin(targetid_subset_list)]["TargetID"].unique().tolist())
    in_cluster = set(selected_df["TargetID"].unique().tolist())
    in_dataset = set(df["TargetID"].unique().tolist())

    subset_target_site_dict = _sites_by_targetid(in_subset)
    cluster_target_site_dict = _sites_by_targetid(in_cluster - in_subset)
    dataset_target_site_dict = _sites_by_targetid(in_dataset - in_cluster)

    return subset_target_site_dict,cluster_target_site_dict,dataset_target_site_dict

def gene_to_target_subset(selected_df,gene_name):
    """Return the sorted, unique TargetIDs of rows whose "Gene" equals ``gene_name``.

    Parameters
    ----------
    selected_df : pandas.DataFrame
        Needs the columns ``"Gene"`` and ``"TargetID"``.
    gene_name : str
        Gene to look up.

    Returns
    -------
    list
        Ascending TargetIDs; empty when the gene is absent.
    """
    # Taken directly from the column instead of reading the grouping key back
    # out of a groupby.apply result, which recent pandas versions deprecate.
    gene_targetids = selected_df.loc[selected_df["Gene"] == gene_name, "TargetID"]
    return gene_targetids.dropna().drop_duplicates().sort_values().tolist()

def gene_to_target_dict(df,selected_df,gene_name):
    """Build the (subset, cluster, dataset) target-site dicts for one gene.

    The subset consists of the TargetIDs that ``gene_to_target_subset`` finds
    for ``gene_name`` in ``selected_df``; the split itself is delegated to
    ``target_subset_to_target_dict``.
    """
    gene_targetids = gene_to_target_subset(selected_df, gene_name)
    return target_subset_to_target_dict(df, selected_df, gene_targetids)

def display_cluster_polar_effects(df,selected_df,min_gene_count=None,view_pad=4000):
    """Plot the target-site locus of every sufficiently represented gene.

    For each gene that occurs at least ``min_gene_count`` times in
    ``selected_df["Gene"]``, print the gene name and draw its locus with
    ``display_target_sites_single_locus``.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, passed on to ``gene_to_target_dict``.
    selected_df : pandas.DataFrame
        Rows of the cluster being inspected; needs a ``"Gene"`` column.
    min_gene_count : int, optional
        Minimum number of rows a gene needs to be shown. When omitted, the
        notebook-level ``min_gene_count`` is used if it exists, otherwise 1.
    view_pad : int
        Bases displayed on either side of the anchor site.

    Notes
    -----
    Reads the notebook-level globals ``genome_record`` and ``translator``.
    """
    # Resolved at call time: the former default ``min_gene_count=min_gene_count``
    # was evaluated when the function was defined, which raises NameError
    # unless the global had already been assigned.
    if min_gene_count is None:
        min_gene_count = globals().get("min_gene_count", 1)

    genes, counts = np.unique(selected_df["Gene"].tolist(), return_counts=True)
    genes_to_display = genes[counts >= min_gene_count].tolist()

    for gene in genes_to_display:
        print(gene)
        subset_sites, cluster_sites, dataset_sites = gene_to_target_dict(df, selected_df, gene)
        display_target_sites_single_locus(genome_record, subset_sites, cluster_sites, dataset_sites, translator, view_pad=view_pad)
        plt.show()

def search_offtargets(selected_df,genome_record,gene_name,view_pad=2000,df=None):
    """Plot a gene's target locus followed by each candidate off-target locus.

    For every distinct target sequence of ``gene_name`` in ``selected_df``,
    the first 10 characters of the sequence are searched with ``nt_search``
    on both strands of ``genome_record``. Hits that do not equal one of the
    gene's own listed target sites are treated as candidate off-targets and
    each one is plotted as its own locus.

    Parameters
    ----------
    selected_df : pandas.DataFrame
        Cluster rows; needs the columns ``"Target Sequence"``, ``"Gene"``,
        ``"TargetID"`` and ``"Target Sites"``.
    genome_record : SeqRecord
        Reference genome to search and plot.
    gene_name : str
        Gene whose sgRNAs are examined.
    view_pad : int
        Bases displayed on either side of each plotted site.
    df : pandas.DataFrame, optional
        Full dataset handed to ``gene_to_target_dict``. Defaults to the
        notebook-level ``clustree_pandas_df``.

    Notes
    -----
    Reads the notebook-level global ``translator``. Hits are compared to the
    gene's sites with ``not in``, so they are only excluded when the stored
    sites are ``(start, end, strand)`` tuples in the same coordinate
    convention as the hits built here - TODO confirm against the data.
    """
    if df is None:
        df = clustree_pandas_df

    genome_fwd = str(genome_record.seq)
    genome_rev = str(genome_record.seq.reverse_complement())
    record_len = len(genome_rev)

    # One row per distinct target sequence, ordered by sequence.
    targetseq_df = (
        selected_df.dropna(subset=["Target Sequence"])
        .drop_duplicates(subset="Target Sequence", keep="first")
        .sort_values("Target Sequence")
    )
    gene_df = targetseq_df[targetseq_df["Gene"] == gene_name]

    pam_adj_seq_dict = {row["TargetID"]: row["Target Sequence"][:10] for _, row in gene_df.iterrows()}
    gene_target_site_list = [site for sites in gene_df["Target Sites"].tolist() for site in sites]

    all_target_sites_dict = {}
    for targetid, pam_adj_seq in pam_adj_seq_dict.items():
        # nt_search returns the pattern followed by the match positions.
        fwd_hits = nt_search(genome_fwd, pam_adj_seq)[1:]
        rev_hits = nt_search(genome_rev, pam_adj_seq)[1:]
        for i, site in enumerate(fwd_hits):
            all_target_sites_dict[str(targetid) + "_Off_" + str(i)] = (site + 1, site + 20, "+")
        # Reverse-strand hits continue the numbering and are mapped back to
        # forward-strand coordinates.
        for i, site in enumerate(rev_hits, start=len(fwd_hits)):
            all_target_sites_dict[str(targetid) + "_Off_" + str(i)] = (record_len - site - 20, record_len - site - 1, "-")

    nongene_target_sites_dict = {key: val for key, val in all_target_sites_dict.items() if val not in gene_target_site_list}

    subset_target_site_dict, cluster_target_site_dict, dataset_target_site_dict = gene_to_target_dict(df, selected_df, gene_name)
    display_target_sites_single_locus(genome_record, subset_target_site_dict, cluster_target_site_dict, dataset_target_site_dict, translator, view_pad=view_pad)

    for key, val in nongene_target_sites_dict.items():
        try:
            display_target_sites_single_locus(genome_record, {key: val}, cluster_target_site_dict, dataset_target_site_dict, translator, view_pad=view_pad)
        except Exception:
            # Best effort: skip loci that cannot be drawn, but let
            # KeyboardInterrupt/SystemExit through (a bare except would not).
            print("Out of bounds")

def display_cluster_offtargets(selected_df,genome_record,min_gene_count=None):
    """Run ``search_offtargets`` for every sufficiently represented gene.

    Parameters
    ----------
    selected_df : pandas.DataFrame
        Rows of the cluster being inspected; needs a ``"Gene"`` column.
    genome_record : SeqRecord
        Reference genome passed on to ``search_offtargets``.
    min_gene_count : int, optional
        Minimum number of rows a gene needs to be shown. When omitted, the
        notebook-level ``min_gene_count`` is used if it exists, otherwise 1.
    """
    # Resolved at call time: the former default ``min_gene_count=min_gene_count``
    # was evaluated when the function was defined, which raises NameError
    # unless the global had already been assigned.
    if min_gene_count is None:
        min_gene_count = globals().get("min_gene_count", 1)

    genes, counts = np.unique(selected_df["Gene"].tolist(), return_counts=True)
    genes_to_display = genes[counts >= min_gene_count].tolist()
    for gene in genes_to_display:
        print(gene)
        search_offtargets(selected_df, genome_record, gene)
        plt.show()


def find_loc(search_str,reference):
    """Locate ``search_str`` and its reverse complement within ``reference``.

    Both patterns are passed to ``re.finditer``, so matches are
    non-overlapping and ``search_str`` is interpreted as a regular expression.

    Returns
    -------
    list of tuple
        ``(start, end, "+")`` for every match of ``search_str``, followed by
        ``(len(reference) - end, len(reference) - start, "-")`` for every
        match of its reverse complement.
    """
    reference_length = len(reference)
    revcomp_pattern = str(Seq(search_str).reverse_complement())

    forward_hits = [
        (match.start(), match.end(), "+")
        for match in re.finditer(search_str, reference)
    ]
    reverse_hits = [
        (reference_length - match.end(), reference_length - match.start(), "-")
        for match in re.finditer(revcomp_pattern, reference)
    ]
    return forward_hits + reverse_hits


def display_target_sites_single_locus_with_fitness(gene,genome_record,subset_target_site_dict,cluster_target_site_dict,dataset_target_site_dict,EcoWG1_data,translator,\
                                                   log_fc_min_diff=3.,view_pad=2000,outer_context_pad=10000,figsize=(12, 3)):
    """Plot a locus with a fitness track when the gene has an outlier guide.

    Guides from ``EcoWG1_data`` are located inside the window around the
    first subset target site. For the guides of ``gene`` found there, the gap
    between the lowest ``"T4"`` value and the next distinct value is computed;
    the figure is only drawn when that gap is at least ``log_fc_min_diff``.
    The figure has the annotated locus on top and a scatter of ``"T4"``
    against guide position below.

    Parameters
    ----------
    gene : str
        Value compared against ``EcoWG1_data["gene"]``.
    genome_record : SeqRecord
        Annotated reference record.
    subset_target_site_dict, cluster_target_site_dict, dataset_target_site_dict : dict
        Map a target id to a site indexable as ``(start, end, strand)``.
    EcoWG1_data : pandas.DataFrame
        Needs the columns ``"Target Sequence"``, ``"gene"`` and ``"T4"``.
        It is not modified.
    translator : BiopythonTranslator
        Converts the sliced record into a plottable graphic record.
    log_fc_min_diff : float
        Minimum gap required for the figure to be drawn.
    view_pad, outer_context_pad : int
        Displayed / sliced padding around the anchor site.
    figsize : tuple
        Passed to ``plt.subplots``.

    Raises
    ------
    IndexError
        If ``subset_target_site_dict`` is empty.
    """
    strand_dict = {"+": 1, "-": -1}

    if not subset_target_site_dict:
        raise IndexError("subset_target_site_dict is empty; no target site to centre the view on")
    anchor_site = next(iter(subset_target_site_dict.values()))

    # Clamp to 0: a negative slice start would wrap around to the end of the
    # record and yield an empty window.
    outer_start_coord = max(anchor_site[0] - outer_context_pad, 0)
    outer_end_coord = anchor_site[1] + outer_context_pad

    start_coord = anchor_site[0] - outer_start_coord
    end_coord = anchor_site[1] - outer_start_coord

    sub_genome_record = genome_record[outer_start_coord:outer_end_coord]
    sub_genome_record_str = str(sub_genome_record.seq)
    sub_genome_record_len = len(sub_genome_record_str)

    def _fitness_sites(target_sequence):
        # Midpoint of every match; "-" matches are reported by find_loc in
        # reverse-strand coordinates and are flipped back here.
        sites = []
        for start, end, strand in find_loc(target_sequence, sub_genome_record_str):
            midpoint = (start + end) // 2
            sites.append(sub_genome_record_len - midpoint if strand == "-" else midpoint)
        return sites

    # Kept in a local Series so the caller's DataFrame is not given a
    # "Fitness Sites" column as a side effect.
    fitness_sites = EcoWG1_data["Target Sequence"].apply(_fitness_sites)
    has_site = fitness_sites.apply(len) > 0
    EcoWG1_data_subset = EcoWG1_data[has_site]
    fitness_sites_subset = fitness_sites[has_site]

    target_gene_fitness = EcoWG1_data_subset[EcoWG1_data_subset["gene"] == gene]["T4"]
    if len(target_gene_fitness) == 0:
        # No guide of this gene lies in the window: nothing to compare.
        return
    logfc_diff = target_gene_fitness - np.min(target_gene_fitness)
    nonzero_diff = logfc_diff[~(logfc_diff == 0.)]
    if len(nonzero_diff) == 0:
        # All guides share one fitness value, so there is no outlier gap.
        return
    logfc_min_diff = np.min(nonzero_diff)

    if logfc_min_diff < log_fc_min_diff:
        return

    fitness_arr = np.array([
        [site, t4]
        for t4, sites in zip(EcoWG1_data_subset["T4"], fitness_sites_subset)
        for site in sites
    ])

    sgRNA_features = []
    typed_site_dicts = (
        ("Subset-sgRNA", subset_target_site_dict),
        ("Cluster-sgRNA", cluster_target_site_dict),
        ("Dataset-sgRNA", dataset_target_site_dict),
    )
    for feature_type, target_site_dict in typed_site_dicts:
        for targetid, target_site in target_site_dict.items():
            location = FeatureLocation(
                target_site[0] - outer_start_coord,
                target_site[1] - outer_start_coord,
                strand=strand_dict[target_site[2]],
            )
            sgRNA = SeqFeature(location=location, type=feature_type)
            sgRNA.qualifiers["gene"] = str(targetid)
            sgRNA_features.append(sgRNA)
    sub_genome_record.features = sub_genome_record.features + sgRNA_features

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize, sharex=True, gridspec_kw={"height_ratios": [4, 1]})
    graphic_record = translator.translate_record(sub_genome_record)
    # Keep the crop window inside the sliced record.
    crop_start = max(start_coord - view_pad, 0)
    crop_end = min(end_coord + view_pad, sub_genome_record_len)
    cropped_record = graphic_record.crop((crop_start, crop_end))
    cropped_record.plot(ax=ax1, strand_in_label_threshold=7, figure_width=10)
    ax2.scatter(fitness_arr[:, 0], fitness_arr[:, 1])
    ax2.set_ylim(-11, 1)
    ax2.set_ylabel("Log FC")
    plt.show()

def display_cluster_outliers(df,selected_df,EcoWG1_data,min_gene_count=None,log_fc_min_diff=None):
    """Show the fitness-annotated locus of every sufficiently represented gene.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, passed on to ``gene_to_target_dict``.
    selected_df : pandas.DataFrame
        Rows of the cluster being inspected; needs a ``"Gene"`` column.
    EcoWG1_data : pandas.DataFrame
        Fitness table passed on to
        ``display_target_sites_single_locus_with_fitness``.
    min_gene_count : int, optional
        Minimum number of rows a gene needs to be shown. When omitted, the
        notebook-level ``min_gene_count`` is used if it exists, otherwise 1.
    log_fc_min_diff : float, optional
        Outlier threshold passed to the plotting function. When omitted, the
        notebook-level ``log_fc_min_diff`` is used if it exists, otherwise 3.

    Notes
    -----
    Reads the notebook-level globals ``genome_record`` and ``translator``.
    """
    # Resolved at call time: the former defaults referenced notebook globals
    # in the signature, which raises NameError at definition time unless
    # those globals had already been assigned.
    if min_gene_count is None:
        min_gene_count = globals().get("min_gene_count", 1)
    if log_fc_min_diff is None:
        log_fc_min_diff = globals().get("log_fc_min_diff", 3.)

    genes, counts = np.unique(selected_df["Gene"].tolist(), return_counts=True)
    genes_to_display = genes[counts >= min_gene_count].tolist()

    for gene in genes_to_display:
        print(gene)
        subset_sites, cluster_sites, dataset_sites = gene_to_target_dict(df, selected_df, gene)
        display_target_sites_single_locus_with_fitness(gene, genome_record, subset_sites, cluster_sites, dataset_sites, EcoWG1_data,
                                                       translator, log_fc_min_diff=log_fc_min_diff, view_pad=3000, outer_context_pad=10000, figsize=(14, 3))

### Reload R-compatible AnnData and Clustree Pandas Dataframe

In [None]:
# Analysis parameters. The index after each list picks which of the candidate
# values is used; together they determine the results directory name below.
strong_effect_threshold = [1.,1.25,1.5][0]
n_neighbors = [5,10,15][1]
resolution_list = [0.2,0.5,1.,2.]

# Directory layout for this parameter combination.
rootpath = "/home/de64/group/de64/CRISPRi_Libraries/dev_notebooks/2023-02-26_lDE20_10Hour_Analysis"
z_score_thr_dir = rootpath + "/Z_Score_Thr_" + str(strong_effect_threshold) + "_N_Neighbors_" + str(n_neighbors)
clustree_path = z_score_thr_dir + "/Clustree"
umap_path = z_score_thr_dir + "/UMAP"
compare_path = z_score_thr_dir + "/Comparison"
selected_path = z_score_thr_dir + "/Selected_Clusters"

# Presumably creates the output directory without clearing an existing one
# (trenchripper helper; confirm its overwrite semantics).
tr.writedir(selected_path, overwrite=False)

# Reload the clustering table. "Target Sites" is stored as a string and is
# parsed back into Python objects; "Target Sequence RC" holds the reverse
# complement of each target sequence.
clustree_pandas_df = pd.read_pickle(z_score_thr_dir+"/Pandas_Dataframe.pkl")
clustree_pandas_df['Target Sites'] = clustree_pandas_df['Target Sites'].apply(lambda x: ast.literal_eval(x))
clustree_pandas_df["Target Sequence RC"] = clustree_pandas_df["Target Sequence"].apply(lambda x: str(Seq(x).reverse_complement()))

an_df_clustree_r_compat = anndata.read_h5ad(z_score_thr_dir+"/AnnData.h5ad")

# Get ontologies
obo_fname = download_go_basic_obo()

# Get the E. coli association file (EcoCyc). Needs network access; gaf_handle
# is not used again in the code shown here.
gaf_handle = goatools.base.http_get("http://current.geneontology.org/annotations/ecocyc.gaf.gz",fout=rootpath + "/ecocyc.gaf.gz")
gaf_fname = goatools.base.gunzip(rootpath + "/ecocyc.gaf.gz")

## Load the GO DAG and the annotations, then build gene symbol <-> ID lookups

obodag = GODag(obo_fname)
objanno = GafReader(gaf_fname)
ns2assoc = objanno.get_ns2assc()

# gene_to_id maps gene symbols (and, after the update, synonyms) to DB IDs;
# inv_gene_to_id maps DB IDs back to the primary symbol.
gene_to_id = {assoc.DB_Symbol:assoc.DB_ID for assoc in objanno.associations}
inv_gene_to_id = {assoc.DB_ID:assoc.DB_Symbol for assoc in objanno.associations}
synonym_dict = {synonym:assoc.DB_ID for assoc in objanno.associations for synonym in assoc.DB_Synonym}
# NOTE(review): a synonym that equals another gene's primary symbol overwrites
# that gene's entry here - confirm this precedence is intended.
gene_to_id.update(synonym_dict)

#### Make Reference SeqRecord

In [None]:
# One-time downloads of the reference files read below (left commented out).
# urllib.request.urlretrieve('http://regulondb.ccg.unam.mx/menu/download/datasets/files/U00096.3.gbk', rootpath + '/U00096.3.gbk')
# urllib.request.urlretrieve('http://regulondb.ccg.unam.mx/menu/download/datasets/files/PromoterSet.txt', rootpath + '/PromoterSet.txt')
# urllib.request.urlretrieve('https://gitlab.pasteur.fr/dbikard/ecowg1/-/raw/master/data/EcoWG1_data.csv?inline=false', rootpath + '/EcoWG1_data.csv')

# Annotated reference genome, promoter table (the first 40 lines of the file
# are skipped and column names are supplied here) and the EcoWG1 table.
genome_record = SeqIO.read(rootpath + '/U00096.3.gbk', "genbank")
promoter_df = pd.read_csv(rootpath + '/PromoterSet.txt',sep="\t",skiprows=40,names=["ID","Name","Strand","TSS","Sigma Factor","Sequence","Evidence","Additional Evidence","Confidence"])
EcoWG1_data = pd.read_csv(rootpath + '/EcoWG1_data.csv')
# The unnamed first CSV column is used as the guide's target sequence.
EcoWG1_data = EcoWG1_data.rename(columns={"Unnamed: 0":"Target Sequence","ori":"Strand"})
# Only promoters whose confidence is "Strong" are added to the record.
promoter_df = promoter_df[promoter_df["Confidence"]=="Strong"]
# add_promoters_to_genbank modifies genome_record in place and returns it, so
# genome_record_merged and genome_record are the same object.
genome_record_merged = add_promoters_to_genbank(genome_record,promoter_df)

# One row per distinct target sequence, joined to the EcoWG1 "T4" value
# (renamed "log FC"). The EcoWG1 sequence is matched against the reverse
# complement of the library's target sequence.
targetseq_df = clustree_pandas_df.groupby("Target Sequence").apply(lambda x: x.iloc[0].drop("Target Sequence"))
fitness_series = EcoWG1_data[["Target Sequence","T4"]].rename(columns={"Target Sequence":"Target Sequence RC","T4":"log FC"}).set_index("Target Sequence RC")["log FC"]
targetseq_df = pd.merge(targetseq_df,fitness_series,on="Target Sequence RC")
targetid_to_log_fc_dict = targetseq_df[["TargetID","log FC"]].set_index("TargetID")['log FC'].to_dict()

In [None]:
# Shared translator instance; the display helpers defined above read it as a
# notebook-level global.
translator = sgRNA_Explorer()

### Export Clusters

In [None]:
# Clustering level and the cluster labels at that level queued for export.
cluster_level = "L3"
cluster_labels = ["20","21","26","29","35","37","6","45","39","0","28","42","50","34","41"]
# Rows belonging to any of the listed clusters. The previous expression
# compared against `cluster_label`, which has not been assigned yet at this
# point in a top-to-bottom run.
selected_df = clustree_pandas_df[clustree_pandas_df[cluster_level].isin(cluster_labels)].copy()

### Export Cluster CSV

In [None]:
# Columns copied into the per-cluster export table, in output order.
# Each name is listed once; a repeated name would select that column twice.
columns_for_small_df = [
 'Kernel Trace: Delta time (s): Transformed: z score',
 'Kernel Trace: Septum Displacement Length Normalized: Transformed: z score',
 'Kernel Trace: Delta time (s)',
 'Kernel Trace: Septum Displacement',
 'Kernel Trace: Septum Displacement Length Normalized',
 'Kernel Trace: Length: Transformed: z score',
 'Kernel Trace: Width: Transformed: z score',
 'Kernel Trace: mCherry mean_intensity: Transformed: z score',
 'Kernel Trace: Length',
 'Kernel Trace: Width',
 'Kernel Trace: mCherry mean_intensity',
 'Kernel Trace: Instantaneous Growth Rate: Volume: Transformed: z score',
 'Kernel Trace: Instantaneous Growth Rate: Volume',
 'sgRNA',
 'Gene',
 'N Mismatch',
 'Category',
 'TargetID',
 'N Observations',
 'Feature Vector',
 'Kernel Trace: Delta time (hr)',
 'Integrated Z-scores',
 'Last Timepoint Z-scores',
 'Delta time (hr): Integrated Z-score',
 'Delta time (hr): Last Timepoint Z-score',
 'Septum Displacement Length Normalized: Integrated Z-score',
 'Septum Displacement Length Normalized: Last Timepoint Z-score',
 'Length: Integrated Z-score',
 'Length: Last Timepoint Z-score',
 'Width: Integrated Z-score',
 'Width: Last Timepoint Z-score',
 'mCherry mean_intensity: Integrated Z-score',
 'mCherry mean_intensity: Last Timepoint Z-score',
 'Instantaneous Growth Rate: Volume: Integrated Z-score',
 'Instantaneous Growth Rate: Volume: Last Timepoint Z-score',
 'Feature Max Magnitude Z-score',
 'Mean: Septum Displacement',
 'Mean: Septum Displacement Length Normalized',
 'Mean: Delta time (s)',
 'Mean: Length',
 'Mean: Width',
 'Mean: mCherry mean_intensity',
 'Mean: Instantaneous Growth Rate: Volume',
 'Mean: Delta time (hr)',
# GFP columns are left out of this export:
# 'Mean: GFP-Penta mean_intensity',
# 'Mean: GFP-Penta mean_intensity: Yeo-Johnson: z score',
 'L0',
 'L1',
 'L2',
 'L3',
 'Target Sequence',
 'N Target Sites',
 'Target Sites',
 'L0 Jackknife Jaccard',
 'L0 Jackknife Recall',
 'L0 Jackknife sgRNA Recall',
 'L1 Jackknife Jaccard',
 'L1 Jackknife Recall',
 'L1 Jackknife sgRNA Recall',
 'L2 Jackknife Jaccard',
 'L2 Jackknife Recall',
 'L2 Jackknife sgRNA Recall',
 'L3 Jackknife Jaccard',
 'L3 Jackknife Recall',
 'L3 Jackknife sgRNA Recall']

# Internal column name -> human-readable name used in the exported CSV.
rename_column_dict = {
    'Kernel Trace: Delta time (s): Transformed: z score': 'Interdivision Time (s) Z-score Timeseries',
    'Kernel Trace: Septum Displacement Length Normalized: Transformed: z score': 'Septum Displacement Length Normalized Z-score Timeseries',
    'Kernel Trace: Delta time (s)': 'Interdivision Time (s) Timeseries',
    'Kernel Trace: Septum Displacement': 'Septum Displacement (um) Timeseries',
    'Kernel Trace: Septum Displacement Length Normalized': 'Septum Displacement Length Normalized Timeseries',
    'Kernel Trace: Length: Transformed: z score': 'Length Z-score Timeseries',
    'Kernel Trace: Width: Transformed: z score': 'Width Z-score Timeseries',
    'Kernel Trace: mCherry mean_intensity: Transformed: z score': 'mKate2 Mean Intensity Z-score Timeseries',
    'Kernel Trace: Length': 'Length (um) Timeseries',
    'Kernel Trace: Width': 'Width (um) Timeseries',
    'Kernel Trace: mCherry mean_intensity': 'mKate2 Mean Intensity (AU) Timeseries',
    'Kernel Trace: Instantaneous Growth Rate: Volume: Transformed: z score': 'Volume Growth Rate Z-score Timeseries',
    'Kernel Trace: Instantaneous Growth Rate: Volume': 'Volume Growth Rate (1/hr) Timeseries',
    'Kernel Trace: Delta time (hr)': 'Interdivision Time (hr) Timeseries',
    'Delta time (hr): Integrated Z-score': 'Interdivision Time (hr) Integrated Z-score',
    'Delta time (hr): Last Timepoint Z-score': 'Interdivision Time (hr) Last Timepoint Z-score',
    'Septum Displacement Length Normalized: Integrated Z-score': 'Septum Displacement Length Normalized Integrated Z-score',
    'Septum Displacement Length Normalized: Last Timepoint Z-score': 'Septum Displacement Length Normalized Last Timepoint Z-score',
    'Length: Integrated Z-score': 'Length Integrated Z-score',
    'Length: Last Timepoint Z-score': 'Length Last Timepoint Z-score',
    'Width: Integrated Z-score': 'Width Integrated Z-score',
    'Width: Last Timepoint Z-score': 'Width Last Timepoint Z-score',
    'mCherry mean_intensity: Integrated Z-score': 'mKate2 Mean Intensity Integrated Z-score',
    'mCherry mean_intensity: Last Timepoint Z-score': 'mKate2 Mean Intensity Last Timepoint Z-score',
    'Instantaneous Growth Rate: Volume: Integrated Z-score': 'Volume Growth Rate Integrated Z-score',
    'Instantaneous Growth Rate: Volume: Last Timepoint Z-score': 'Volume Growth Rate Last Timepoint Z-score',
    'Mean: Septum Displacement': 'Mean Septum Displacement (um) Steady State',
    'Mean: Septum Displacement Length Normalized': 'Mean Septum Displacement Length Normalized Steady State',
    'Mean: Delta time (s)': 'Mean Interdivision Time (s) Steady State',
    'Mean: Length': 'Mean Length (um) Steady State',
    'Mean: Width': 'Mean Width (um) Steady State',
    'Mean: mCherry mean_intensity': 'Mean mKate2 Mean Intensity (AU) Steady State',
    'Mean: Instantaneous Growth Rate: Volume': 'Mean Volume Growth Rate (1/hr) Steady State',
    'Mean: Delta time (hr)': 'Mean Interdivision Time (hr) Steady State',
    'Mean: GFP-Penta mean_intensity': 'Mean GFP Mean Intensity (AU) (at $t_{f}$ for nucleoid snapshots)',
    'Mean: GFP-Penta mean_intensity: Yeo-Johnson: z score': 'Mean GFP Mean Intensity Z-score (at $t_{f}$ for nucleoid snapshots)',
}

# min_gene_count is only assigned in the "Choose Cluster to Assess" cell
# further down. Fall back to 1 (keep every gene) so this cell also runs in a
# fresh top-to-bottom pass instead of raising NameError.
if "min_gene_count" not in globals():
    min_gene_count = 1

# Write one annotated CSV per selected cluster.
for cluster_label in cluster_labels:
    selected_df = clustree_pandas_df[clustree_pandas_df[cluster_level]==cluster_label].copy()

    # A single rename is enough; applying the same mapping twice changed nothing.
    small_output_df = selected_df[columns_for_small_df].rename(columns=rename_column_dict)

    # Drop the timeseries and vector-valued columns from the CSV table.
    timeseries_columns = [name for name in small_output_df.columns if "Timeseries" in name]
    vector_columns = ["Feature Vector","Integrated Z-scores","Last Timepoint Z-scores","Feature Max Magnitude Z-score","Target Sites"]
    very_small_output_df = small_output_df.drop(columns=timeseries_columns + vector_columns)

    # Empty annotation columns. Each one is inserted at position 0, so the
    # final column order is Outlier, Off-target, Polar, Note.
    for annotation_column in ("Note", "Polar", "Off-target", "Outlier"):
        very_small_output_df.insert(0, annotation_column, "")

    # Sort rows by (Gene, TargetID) and restore the original index, which is
    # expected to be named "oDEPool7_id".
    very_small_output_df = very_small_output_df.reset_index(drop=False).set_index(["Gene","TargetID"]).sort_index().reset_index(drop=False).set_index("oDEPool7_id")

    # Keep only genes with at least min_gene_count rows in this cluster.
    gene_count = very_small_output_df.groupby("Gene").size()
    included_genes = gene_count[gene_count>=min_gene_count].index.tolist()
    very_small_output_df = very_small_output_df[very_small_output_df["Gene"].isin(included_genes)]

    very_small_output_df.to_csv(selected_path + "/" + cluster_level + "_" + cluster_label + ".csv")

## Displays to Check:
- Polar effects
- sgRNAs with an abnormal fitness defect for the gene target
    - Fitness values from Calvo-Villamañán et al. (2020)
- Potential off-target sites

### Choose Cluster to Assess

In [None]:
# Settings for the inspection cells below.
cluster_level = "L3"      # column of clustree_pandas_df holding the cluster labels
min_gene_count = 1        # a gene needs at least this many rows in the cluster to be displayed
log_fc_min_diff = 3.      # threshold handed to the fitness-outlier display
cluster_label = "39"      # cluster to inspect

# Rows of the chosen cluster (copied so later edits do not touch the full table).
selected_df = clustree_pandas_df[clustree_pandas_df[cluster_level]==cluster_label].copy()

### Display Polar Effects

In [None]:
# Plot the locus of every gene in the selected cluster.
# Optional filter to leave one gene out (unused):
# temp_filter = selected_df[selected_df["Gene"]!="dnaK"]
display_cluster_polar_effects(clustree_pandas_df,selected_df,view_pad=4000)

### Search for Fitness Outliers

In [None]:
# Plot loci with an EcoWG1 fitness track for the genes of the selected cluster.
display_cluster_outliers(clustree_pandas_df,selected_df,EcoWG1_data)

### Search For Off-Targets
- Find perfect matches to the first 10 bp of each sgRNA target sequence (called the PAM-adjacent sequence in the code)

In [None]:
# Plot the target locus and the candidate off-target loci for one gene.
search_offtargets(selected_df,genome_record,"mnmC",view_pad=4000)