In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. analysis_utils) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Module imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import torch
import pickle
import math
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from scipy.special import kl_div
from tqdm import tqdm
from Bio import SeqIO
from Bio.Seq import Seq
import re
from analysis_utils import convert_data, get_tail_info, create_data_frame
from analysis_utils import create_collection_orthologs, ortholog_tails, create_data_frame_ortho
from analysis_utils import get_pre_rec_data

# Set up

## 10 bp

In [None]:
# load generated sequence scores
c10 = # sequence base pair scores
c10h = # head base pair scores

In [None]:
# Read the three GenBank genome files and keep only the sequence of each record.
# The paths are placeholders; order is E. coli, B. subtilis, Salmonella.
genome_paths = (
    'path_to_Ecoli_genome',
    'path_to_Bsubtilis_genome',
    'path_to_salmonella_genome',
)
genome_seqs = []
for filename in genome_paths:
    record = SeqIO.read(filename, "genbank")
    genome_seqs.append(record.seq)
genome_coli, genome_sub, genome_nella = genome_seqs

In [None]:
# Read the two metadata CSVs of the 10 bp set: ht_meta feeds the head/tail
# lookups and tail extraction, seq_meta feeds the sequence lookup.
ht_meta, seq_meta = (
    pd.read_csv(csv_name)
    for csv_name in ("metadata_10bp_ht.csv", "metadata_10bp_seqs.csv")
)

In [None]:
def make_ind_dict(df_h,df_s):
    '''
    Build the lookup dictionaries used in the analysis from the metadata tables.

    Parameters
    ----------
    df_h : pandas.DataFrame
        Head/tail metadata; must provide the columns "index", "Intersection"
        and "coordinate".
    df_s : pandas.DataFrame
        Sequence metadata; must provide the columns "index", "Intersection",
        "head" and "tail".

    Returns
    -------
    ind_th : dict
        Maps each "index" of df_h to its (Intersection, coordinate) pair.
    th_ind : dict
        Inverse mapping: (Intersection, coordinate) -> "index" of df_h.
    s_ind : dict
        Maps (Intersection, head, tail) -> "index" of df_s.

    If a key occurs in several rows, the last row wins.
    '''
    ind_th={}
    th_ind={}
    s_ind={}

    # Iterate the needed columns directly: df.iloc[i][col] builds a whole row
    # Series per lookup (slow, and it coerces the row to one common dtype).
    for idx, intersection, coordinate in zip(
        df_h["index"], df_h["Intersection"], df_h["coordinate"]
    ):
        ind_th[idx]=(intersection,coordinate)
        th_ind[(intersection,coordinate)]=idx

    seq_rows = zip(df_s["Intersection"], df_s["head"], df_s["tail"], df_s["index"])
    # zip() has no len(), so the row count is handed to tqdm explicitly to keep
    # the same progress bar as before.
    for intersection, head, tail, idx in tqdm(
        seq_rows, total=len(df_s), desc="Processing tasks", unit="task"
    ):
        s_ind[(intersection,head,tail)]=idx

    return ind_th, th_ind, s_ind

In [None]:
# Build the lookup tables from the metadata.
# ind_th_fl receives the second return value of make_ind_dict, i.e. the
# (Intersection, coordinate) -> index mapping (the inverse of ind_th).
ind_th, ind_th_fl, s_ind=make_ind_dict(ht_meta,seq_meta)

In [None]:
# generates tail sequences
# For every row of ht_meta, cut a tail out of the row's organism genome and
# store it under the key (Intersection, coordinate).
TAIL_OFFSET = 10   # positions skipped after the coordinate (presumably the 10 bp head — confirm)
TAIL_LENGTH = 500  # number of positions kept for each tail
tails={}
genomes={"coli":genome_coli, "sub":genome_sub, "nella":genome_nella}
# Iterate the columns directly rather than building a row Series per .iloc[i] lookup.
for intersection, k, organism in zip(
    ht_meta["Intersection"], ht_meta["coordinate"], ht_meta["organism"]
):
    tail_start = k + TAIL_OFFSET
    tails[(intersection, k)] = genomes[organism][tail_start:tail_start + TAIL_LENGTH]

## Ortholog genes

In [None]:
# load aligned records
# For each gene (clpA, proRS) two files are read: one stored as aligned_records_*
# and one stored as aligned_aa_* (the "prot" files).
# NOTE(review): read_fasta is neither defined nor imported in the visible cells —
# presumably it is provided by analysis_utils; confirm and add it to the imports.
aligned_records_clp=read_fasta("picked_clpA.txt")
aligned_aa_clp=read_fasta("picked_clpA_prot.txt")
aligned_records_pro=read_fasta("picked_proRS.txt")
aligned_aa_pro=read_fasta("picked_prot_proRS.txt")

In [None]:
# load base-pair scores for analysis
clp_scores = # for clpA
pro_scores = # for proRS
clp_heads = # for clpA heads
pro_scores = # for proRS heads

# 10 bp analysis

## numerical

In [None]:
# Combine the head (c10h), tail (c10t) and full-sequence (c10) scores with the
# lookup tables and the sequence metadata into one collection.
# NOTE(review): c10t has to be loaded together with c10 and c10h in the set-up
# section before this cell can run.
collection_500=convert_data(c10h,c10t,c10,ind_th_fl,s_ind,seq_meta)

In [None]:
# Gather tail information for every key of collection_500, using the tail
# sequences extracted from the genomes in the set-up section.
tail_data=get_tail_info(list(collection_500.keys()),tails)

In [None]:
# Assemble the analysis table from the score collection and the tail information.
# NOTE(review): mbp=500 presumably corresponds to the 500-position tails — confirm
# against create_data_frame in analysis_utils.
dataFrame_500=create_data_frame(collection_500,tail_data,mbp=500)

In [None]:
# Keep only the rows whose "tail_id_200" value is strictly below 1.
# NOTE(review): this cell used to read dataFrame_500_mod, a frame that no visible
# cell creates (NameError on a fresh kernel). It now filters dataFrame_500 directly;
# if a modification step added columns used below (e.g. "head_sc_log"), restore
# that step before this cell.
df_500_filt=dataFrame_500[dataFrame_500["tail_id_200"]<1]

In [None]:
# Report the numerical results for the filtered 10 bp table.
# NOTE(review): print_results is neither defined nor imported in the visible
# cells — presumably it comes from analysis_utils; confirm and import it.
print_results(df_500_filt)

## graphical

In [None]:
# Density of the head score, one filled curve per "RCM_head" value.
plt.figure(figsize=(8, 6))
sns.set(style='white')
ax = sns.kdeplot(
    data=df_500_filt,
    x='head_sc_log',
    hue="RCM_head",
    palette="magma",
    common_norm=False,  # each hue group is normalised on its own
    fill=True,
)
ax.set(xlabel='Score of the head')
legend = ax.get_legend()
legend.set_title("Reverse Delta")
plt.show()

In [None]:
# Box plot of the head score, grouped by the "RCM_head" prediction.
plt.figure(figsize=(8, 5))
box_ax = sns.boxplot(
    data=df_500_filt,
    x='RCM_head',
    y='head_sc_log',
    palette="magma",
)
box_ax.set_xlabel('Reverse Delta prediction')
box_ax.set_ylabel('Score of the head')
plt.show()

In [None]:
# Thresholds at which the three prediction columns are evaluated.
a = [
    0, 0.05, 0.1, 0.15, 0.2, 0.25,
    0.3, 0.35, 0.4, 0.45, 0.5, 0.6,
]
# One (pre, ret) pair per prediction column, in the order
# reverse comparison, reverse delta, forward delta (labels used in the plots below).
(pre_1, ret_1), (pre_2, ret_2), (pre_3, ret_3) = (
    get_pre_rec_data(df_500_filt, a, column)
    for column in ("comparison_head_200", "RCM_head", "RCM_tail")
)

In [None]:
# Accuracy of each prediction method against the confidence threshold.
plt.figure(figsize=(8, 5))
for values, method, colour in (
    (pre_1, 'Reverse Comparison', '#dc267f'),
    (pre_2, 'Reverse Delta', '#788ef0'),
    (pre_3, 'Forward Delta', '#fe6100'),
):
    plt.plot(a, values, label=method, color=colour)
plt.xlabel('Confidence Threshold')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Number of test cases for each prediction method against the confidence threshold.
plt.figure(figsize=(8, 5))
for values, method, colour in (
    (ret_1, 'Reverse Comparison', '#dc267f'),
    (ret_2, 'Reverse Delta', '#788ef0'),
    (ret_3, 'Forward Delta', '#fe6100'),
):
    plt.plot(a, values, label=method, color=colour)
plt.xlabel('Confidence Threshold')
plt.ylabel('Number of Test cases')
plt.legend()
plt.show()

# Ortholog genes analysis

## numerical

In [None]:
# Build one collection per gene from that gene's own scores and head scores.
# The inputs used to be crossed (clpA scores stored as pro_collection and proRS
# scores as clp_collection), which paired each collection with the other gene's
# tails in create_data_frame_ortho below.
clp_collection=create_collection_orthologs(clp_scores, clp_heads)
pro_collection=create_collection_orthologs(pro_scores, pro_heads)

In [None]:
# Derive the tails of each ortholog set from its two aligned inputs; note the
# argument order: the aligned_aa_* records first, then the aligned_records_* ones.
clp_tails=ortholog_tails(aligned_aa_clp,aligned_records_clp)
pro_tails=ortholog_tails(aligned_aa_pro,aligned_records_pro)

In [None]:
# Analysis table for clpA: its score collection combined with its tails.
clp_df=create_data_frame_ortho(clp_collection,clp_tails)

In [None]:
# Analysis table for proRS: its score collection combined with its tails.
pro_df=create_data_frame_ortho(pro_collection,pro_tails)

In [None]:
# Stack the proRS and clpA tables into one frame; ignore_index=True renumbers
# the rows 0..n-1 instead of keeping each table's own index.
combined_df = pd.concat([pro_df, clp_df], ignore_index=True)

In [None]:
# Report the numerical results for the combined ortholog table.
# NOTE(review): print_results is neither defined nor imported in the visible
# cells — presumably it comes from analysis_utils; confirm and import it.
print_results(combined_df)

## graphical

In [None]:
# Head score against tail identity for the ortholog genes, coloured by "RCM_head".
sns.set(style='whitegrid')
plt.figure(figsize=(10,8))
ax = sns.scatterplot(
    data=combined_df,
    x='head_sc',
    y="tail_id_200",
    hue="RCM_head",
    palette='magma',
    s=100,
)
ax.set_xlabel("Score of the head", fontsize=12)
ax.set_ylabel("Tail identity", fontsize=12)
legend = ax.get_legend()
legend.set_title("Reverse Delta")
plt.show()

In [None]:
# Thresholds for the ortholog evaluation (this reassigns `a` and the
# pre_*/ret_* names used by the 10 bp analysis).
a = [
    0, 0.05, 0.1, 0.15, 0.2,
    0.25, 0.3, 0.35, 0.4,
]
# One (pre, ret) pair per prediction column, in the order
# reverse comparison, reverse delta, forward delta (labels used in the plots below).
(pre_1, ret_1), (pre_2, ret_2), (pre_3, ret_3) = (
    get_pre_rec_data(combined_df, a, column)
    for column in ("comparison_head_200", "RCM_head", "RCM_tail")
)

In [None]:
# Accuracy of each prediction method against the confidence threshold (ortholog genes).
plt.figure(figsize=(8, 5))
for values, method, colour in (
    (pre_1, 'Reverse Comparison', '#dc267f'),
    (pre_2, 'Reverse Delta', '#788ef0'),
    (pre_3, 'Forward Delta', '#fe6100'),
):
    plt.plot(a, values, label=method, color=colour)
plt.xlabel('Confidence Threshold')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Number of test cases for each prediction method against the confidence threshold (ortholog genes).
plt.figure(figsize=(8, 5))
for values, method, colour in (
    (ret_1, 'Reverse Comparison', '#dc267f'),
    (ret_2, 'Reverse Delta', '#788ef0'),
    (ret_3, 'Forward Delta', '#fe6100'),
):
    plt.plot(a, values, label=method, color=colour)
plt.xlabel('Confidence Threshold')
plt.ylabel('Number of Test cases')
plt.legend()
plt.show()