In [None]:
%load_ext autoreload
%autoreload 2

# Module imports

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
import torch
import pickle
import math
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from scipy.special import kl_div
from scipy.stats import mannwhitneyu
from tqdm import tqdm
from Bio import SeqIO
from Bio.Seq import Seq
import re
from analysis_utils import convert_data, get_overlap_info, create_data_frame
from analysis_utils import create_collection_orthologs, ortholog_overlap, create_data_frame_ortho
from analysis_utils import get_pre_rec_data, read_fasta, print_results

In [None]:
from generating_utils import remove_gaps

In [None]:
# Shared colour palette. The plotting cells further down currently repeat
# these hex codes as string literals instead of referencing the names.
color1='#68e7a9'  # same hex as the 'Default Method' curve
color2='#fe6100'  # same hex as the 'Forward Comparison' / 'Forward Delta' curves
color3='#dc267f'  # same hex as the 'Reverse Comparison' curves
color4='#788ef0'  # same hex as the 'Reverse Delta' curves

# Set up

## 10 bp

In [None]:
# load generated sequence scores
# c10  <- scores_10bp_seqs_500.pkl, c10h <- scores_10bp_heads.pkl
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load score files that were produced by a trusted run of this project.
with open("scores_10bp_seqs_500.pkl", "rb") as file:
    c10 = pickle.load(file)
with open("scores_10bp_heads.pkl", "rb") as file:
    c10h = pickle.load(file)

In [None]:
# Same two score files with the "_2" suffix (presumably a second batch — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; load trusted files only.
with open("scores_10bp_seqs_500_2.pkl", "rb") as file:
    c10_2 = pickle.load(file)
with open("scores_10bp_heads_2.pkl", "rb") as file:
    c10h_2 = pickle.load(file)

In [None]:
# Read three GenBank records and keep their sequences as
# genome_coli, genome_sub and genome_nella.
# NOTE: the 'path to ...' strings look like placeholders and have to be
# replaced with the real file locations before this cell can run.
filename='path to EColiK12.gbff'
record = SeqIO.read(filename, "genbank")
genome_coli=record.seq
filename='path to BacSub.gbff'
record = SeqIO.read(filename, "genbank")
genome_sub=record.seq
filename='path to AE006468.gb'
record = SeqIO.read(filename, "genbank")
genome_nella=record.seq

In [None]:
# load metadata
ht_meta=pd.read_csv("metadata_10bp_ht.csv")
seq_meta=pd.read_csv("metadata_10bp_seqs.csv")

In [None]:
ht_meta_2=pd.read_csv("metadata_2_10bp_ht.csv")
seq_meta_2=pd.read_csv("metadata_2_10bp_seqs.csv")

In [None]:
def make_ind_dict(df_h,df_s):
    '''
    Build the lookup dictionaries used in the analysis from the metadata tables.

    Parameters
    ----------
    df_h : pandas.DataFrame
        Head/tail metadata; the columns "index", "Intersection" and
        "coordinate" are read.
    df_s : pandas.DataFrame
        Sequence metadata; the columns "index", "Intersection", "head" and
        "tail" are read.

    Returns
    -------
    ind_th : dict
        index -> (Intersection, coordinate)
    th_ind : dict
        (Intersection, coordinate) -> index
    s_ind : dict
        (Intersection, head, tail) -> index

    If a key occurs in several rows, the last row wins.
    '''
    ind_th={}
    th_ind={}
    # Walk the columns in parallel instead of calling df.iloc[i][col] several
    # times per row: every .iloc[i] builds a fresh row Series, which makes the
    # row-wise version extremely slow on large metadata tables.
    head_rows=zip(df_h["index"].to_numpy(),
                  df_h["Intersection"].to_numpy(),
                  df_h["coordinate"].to_numpy())
    for idx, intersection, coordinate in head_rows:
        ind_th[idx]=(intersection,coordinate)
        th_ind[(intersection,coordinate)]=idx

    s_ind={}
    seq_rows=zip(df_s["index"].to_numpy(),
                 df_s["Intersection"].to_numpy(),
                 df_s["head"].to_numpy(),
                 df_s["tail"].to_numpy())
    # zip has no length, so the progress bar is given the row count explicitly.
    for idx, intersection, head, tail in tqdm(seq_rows, total=len(df_s),
                                              desc="Processing tasks", unit="task"):
        s_ind[(intersection,head,tail)]=idx

    return ind_th, th_ind, s_ind

In [None]:
ind_th, ind_th_fl, s_ind=make_ind_dict(ht_meta,seq_meta)

In [None]:
ind_th_2, ind_th_fl_2, s_ind_2=make_ind_dict(ht_meta_2,seq_meta_2)

In [None]:
# extract sequence pieces
# For every row of ht_meta, keyed by (Intersection, coordinate):
#   heads -> genome[k-990 : k+10]   (1000 positions ending 10 past the coordinate)
#   tails -> genome[k+10  : k+510]  (the 500 positions that follow)
# NOTE(review): for k < 990 the start index k-990 is negative and slicing then
# counts from the end of the sequence — confirm no coordinate is that small.
genomes={"coli":genome_coli, "sub":genome_sub, "nella":genome_nella}
tails={}
heads={}
for i in range(len(ht_meta)):
    row=ht_meta.iloc[i]
    k=row["coordinate"]
    key=(row["Intersection"],k)
    genome=genomes[row["organism"]]
    tails[key]=genome[k+10:k+510]
    heads[key]=genome[k-990:k+10]

In [None]:
# Same window extraction for the second metadata table (ht_meta_2):
#   heads_2 -> genome[k-990 : k+10],  tails_2 -> genome[k+10 : k+510]
# NOTE(review): for k < 990 the start index k-990 is negative and slicing then
# counts from the end of the sequence — confirm no coordinate is that small.
genomes={"coli":genome_coli, "sub":genome_sub, "nella":genome_nella}
tails_2={}
heads_2={}
for i in range(len(ht_meta_2)):
    row=ht_meta_2.iloc[i]
    k=row["coordinate"]
    key=(row["Intersection"],k)
    genome=genomes[row["organism"]]
    tails_2[key]=genome[k+10:k+510]
    heads_2[key]=genome[k-990:k+10]


## Ortholog genes

In [None]:
# load aligned records
aligned_records_clp=read_fasta("picked_clpA.txt")
aligned_aa_clp=read_fasta("picked_clpA_prot.txt")
aligned_records_pro=read_fasta("picked_proRS.txt")
aligned_aa_pro=read_fasta("picked_prot_proRS.txt")

In [None]:
# load base-pair scores for analysis
with open("scores_clpA_seq.pkl", "rb") as file:
    clp_scores = pickle.load(file)
    
with open("scores_clpA_head.pkl", "rb") as file:
    clp_heads = pickle.load(file)


with open("scores_proRS_seq.pkl", "rb") as file:
    pro_scores = pickle.load(file)
    
with open("scores_proRS_head.pkl", "rb") as file:
    pro_heads = pickle.load(file)

# 10 bp analysis

## numerical

In [None]:
collection_500=convert_data(c10h,c10,ind_th_fl,s_ind,seq_meta)

In [None]:
overlap_data=get_overlap_info(list(collection_500.keys()),tails,heads,500)

In [None]:
dataFrame_500=create_data_frame(collection_500,overlap_data,mbp=500)

In [None]:
df_500_filt=dataFrame_500[dataFrame_500["tail_id_200"]<1]

In [None]:
print_results(df_500_filt)

In [None]:
collection_500_2=convert_data(c10h_2,c10_2,ind_th_fl_2,s_ind_2,seq_meta_2)

In [None]:
overlap_data_2=get_overlap_info(list(collection_500_2.keys()),tails_2,heads_2,500)

In [None]:
dataFrame_500_2=create_data_frame(collection_500_2,overlap_data_2,mbp=500)

In [None]:
df_500_filt_2=dataFrame_500_2[dataFrame_500_2["tail_id_200"]<1]

In [None]:
print_results(df_500_filt_2)

In [None]:
df_comb_filt = pd.concat([df_500_filt, df_500_filt_2], ignore_index=True)

In [None]:
print_results(df_comb_filt)

In [None]:
# NOTE(review): c10hh and c10_long are not defined in the visible part of this
# notebook; on a fresh kernel this cell raises NameError unless they are loaded
# first — TODO add the cell that loads them.
collection_long=convert_data(c10hh,c10_long,ind_th_fl,s_ind,seq_meta)

In [None]:
dataFrame_long=create_data_frame(collection_long,overlap_data,mbp=800)

In [None]:
df_long_filt=dataFrame_long[dataFrame_long["tail_id_200"]<1]

In [None]:
print_results(df_long_filt)

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
# Row-by-row comparison of the 500 bp results (df_500_filt) with the long-context
# results (df_long_filt): for each prediction column count flips and, where the
# verdict is unchanged, whether the "<column>_n" value went down or not.
# NOTE: the next cell performs the same computation with a progress bar.
keys=["comparison_200","comparison_head","comparison_head_200","RCM_head","RCM_tail"]
# (verdict in df_500_filt, value dropped in df_long_filt) -> counter name
labels={
    (True,True):"confidence down correct",
    (True,False):"confidence up correct",
    (False,True):"confidence down wrong",
    (False,False):"confidence up wrong",
}
for k in keys:
    output={"now correct":0, "now wrong":0,
            "confidence up correct":0, "confidence down correct":0,
            "confidence up wrong":0, "confidence down wrong":0}
    for i in range(len(df_500_filt)):
        before=df_500_filt.iloc[i]
        after=df_long_filt.iloc[i]
        # both frames must list the same cases in the same order
        assert before["ID"]==after["ID"]
        if before[k]!=after[k]:
            # the verdict flipped between the two runs
            output["now wrong" if before[k] else "now correct"]+=1
            continue
        went_down=bool(before[k+"_n"]>after[k+"_n"])
        output[labels[(bool(before[k]),went_down)]]+=1
    print(k,output)

In [None]:
# Same comparison as the previous cell, with a progress bar: for each prediction
# column, count verdict flips between df_500_filt and df_long_filt and, for
# unchanged verdicts, whether the "<column>_n" value went down or not.
keys=["comparison_200","comparison_head","comparison_head_200","RCM_head","RCM_tail"]
for k in keys:
    output={"now correct":0, "now wrong":0,
            "confidence up correct":0, "confidence down correct":0,
            "confidence up wrong":0, "confidence down wrong":0}
    n_key=k+"_n"
    for i in tqdm(range(len(df_500_filt)), desc="Processing tasks", unit="task"):
        short_row=df_500_filt.iloc[i]
        long_row=df_long_filt.iloc[i]
        # both frames must list the same cases in the same order
        assert short_row["ID"]==long_row["ID"]
        b1,v1=short_row[k],short_row[n_key]
        b2,v2=long_row[k],long_row[n_key]
        if b1!=b2:
            bucket="now wrong" if b1 else "now correct"
        elif b1:
            bucket="confidence down correct" if v1>v2 else "confidence up correct"
        else:
            bucket="confidence down wrong" if v1>v2 else "confidence up wrong"
        output[bucket]+=1
    print(k,output)

In [None]:
def tail_with_move(collection,t_data):
    '''
    Score both candidates of every case on the tail window, skipping the
    overlapping start of the tail.

    For each ID the window is positions [1000 + t_data[ID][2], 1500) of
    collection[ID][0][0] (candidate 1) and collection[ID][1][0] (candidate 2).
    The score is the geometric mean of the window; the "_200" variants use
    only the first 200 values of that window.

    Returns a DataFrame with one row per ID and the columns
    ID, tail_id_200, tail_id, tail_overlap (copied from t_data[ID][0..2]),
    tail1_score, tail2_score, tail1_score_200, tail2_score_200,
    comparison_all and comparison_200 (True when candidate 1 scores higher).
    '''
    def geo_mean(values):
        # geometric mean via the mean of the logs
        return math.exp(np.mean(np.log(values)))

    ids=[]
    identity_200=[]
    identity=[]
    overlaps=[]
    first_all=[]
    second_all=[]
    first_200=[]
    second_200=[]
    wins_all=[]
    wins_200=[]

    for ID in tqdm(collection.keys(), total=len(collection.keys())):
        info=t_data[ID]
        ids.append(ID)
        identity_200.append(info[0])
        identity.append(info[1])
        overlaps.append(info[2])

        start=1000+info[2]
        stop=1000+500

        window=collection[ID][0][0][start:stop]
        score1=geo_mean(window)
        score1_200=geo_mean(window[:200])

        window=collection[ID][1][0][start:stop]
        score2=geo_mean(window)
        score2_200=geo_mean(window[:200])

        first_all.append(score1)
        first_200.append(score1_200)
        second_all.append(score2)
        second_200.append(score2_200)
        wins_all.append(score1>score2)
        wins_200.append(score1_200>score2_200)

    return pd.DataFrame({
        "ID":ids,
        "tail_id_200":identity_200, "tail_id":identity, "tail_overlap":overlaps,
        "tail1_score":first_all, "tail2_score":second_all,
        "tail1_score_200":first_200, "tail2_score_200":second_200,
        "comparison_all":wins_all, "comparison_200":wins_200,
    })

In [None]:
# NOTE(review): tail_data is not defined in the visible part of this notebook;
# on a fresh kernel this cell raises NameError — TODO confirm which object is
# meant (tail_with_move reads t_data[ID][0], [1] and [2]).
df_tail_move=tail_with_move(collection_500,tail_data)
# keep only rows whose tail_id_200 is below 1
df_tail_move=df_tail_move[df_tail_move["tail_id_200"]<1]
print_results(df_tail_move)

In [None]:
len(df_500_filt[df_500_filt["tail_id"]>0.3])

In [None]:
print_results(df_500_filt[df_500_filt["tail_id"]>0.3])

In [None]:
print(len(df_500_filt[df_500_filt["head_sc"]<0.3]))
print_results(df_500_filt[df_500_filt["head_sc"]<0.3])

In [None]:
df_500_filt[(df_500_filt["comparison_head_200"]==False)&(df_500_filt["head_overlap"]>20)]

## graphical

In [None]:
def get_pre_rec_data_general(df_data, a, k1,k2):
    '''
    Sweep thresholds over column k1 and report how often column k2 is True
    among the rows that pass each threshold.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Table holding both columns.
    a : iterable
        Thresholds; a row is kept when df_data[k1] > threshold (strict).
    k1 : str
        Name of the column that is thresholded.
    k2 : str
        Name of the column whose True values are counted.

    Returns
    -------
    pre : list of float
        Fraction of kept rows with k2 == True, one entry per threshold.
        NaN when no row passes the threshold.
    ret : list of int
        Number of kept rows, one entry per threshold.
    '''
    pre=[]
    ret=[]
    for threshold in a:
        kept=df_data[df_data[k1]>threshold]
        n_kept=len(kept)
        if n_kept:
            pre.append(kept[k2].to_list().count(True)/n_kept)
        else:
            # No row survives this threshold: the fraction is undefined.
            # NaN keeps both lists aligned with `a` (and is skipped when
            # plotted) instead of aborting with ZeroDivisionError.
            pre.append(float("nan"))
        ret.append(n_kept)
    return pre, ret

### Head score (context)

In [None]:
plt.figure(figsize=(8, 6))
sns.set(style='white')
ax=sns.kdeplot(data=df_500_filt, x='head_sc', hue="RCM_head", palette="magma",common_norm=False, fill=True)
ax.set_xlabel('Score of the head')
legend = ax.get_legend()
legend.set_title("Reverse Delta")
plt.show()

In [None]:
plt.figure(figsize=(8, 5))
sns.boxplot(data=df_500_filt, x='RCM_head', y='head_sc', palette="magma")
plt.xlabel('Correctness of Reverse Delta prediction')
plt.ylabel('Score of the head')
plt.show()

### Confidence

In [None]:
plt.figure(figsize=(8, 5))
sns.boxplot(data=df_500_filt, x='comparison_head_200', y='comparison_head_200_n', palette="magma")
plt.xlabel('Correctness of Reverse Comparison Prediction')
plt.ylabel('Confidence of the model')
plt.show()

In [None]:
plt.figure(figsize=(8, 5))
sns.boxplot(data=df_500_filt, x='comparison_200', y='comparison_200_n', palette="magma")
plt.xlabel('Correctness of Forward Comparison Prediction')
plt.ylabel('Confidence of the model')
plt.show()

In [None]:
a=[0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.6]
pre_1,ret_1=get_pre_rec_data_general(df_500_filt,a,"comparison_head_200_n","comparison_head_200")
pre_2,ret_2=get_pre_rec_data_general(df_500_filt,a,"RCM_head_n","RCM_head")
pre_3,ret_3=get_pre_rec_data_general(df_500_filt,a,"RCM_tail_n","RCM_tail")

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(a, pre_1, label='Reverse Comparison', color='#dc267f')
plt.plot(a, pre_2, label='Reverse Delta', color='#788ef0')
plt.plot(a, pre_3, label='Forward Delta', color='#fe6100')
plt.xlabel('Confidence Threshold')  # Replace with your x-axis label
plt.ylabel('Accuracy')  # Replace with your y-axis label
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(a, np.array(ret_1)/len(df_500_filt), label='Reverse Comparison', color='#dc267f')
plt.plot(a, np.array(ret_2)/len(df_500_filt), label='Reverse Delta', color='#788ef0')
plt.plot(a, np.array(ret_3)/len(df_500_filt), label='Forward Delta', color='#fe6100')
plt.xlabel('Confidence Threshold')  # Replace with your x-axis label
plt.ylabel('Retention rate')  # Replace with your y-axis label
plt.legend()
plt.show()

In [None]:
# Number of cases that remain above each confidence threshold (raw counts;
# the previous cell shows the same curves divided by len(df_500_filt)).
plt.figure(figsize=(8, 5))
plt.plot(a, ret_1, label='Reverse Comparison', color='#dc267f')
plt.plot(a, ret_2, label='Reverse Delta', color='#788ef0')
plt.plot(a, ret_3, label='Forward Delta', color='#fe6100')
plt.xlabel('Confidence Threshold')
# ret_* hold counts, not rates, so the axis is labelled as a count.
plt.ylabel('Number of Test cases')
plt.legend()
plt.show()

### Identity

In [None]:
a=[0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6]
pre_i1,ret_i1=get_pre_rec_data_general(df_500_filt,a,"tail_id_200","comparison_head_200")
pre_i2,ret_i2=get_pre_rec_data_general(df_500_filt,a,"tail_id_200","RCM_head")
pre_i3,ret_i3=get_pre_rec_data_general(df_500_filt,a,"tail_id_200","comparison_200")
pre_i4,ret_i4=get_pre_rec_data_general(df_500_filt,a,"tail_id_200","comparison_all")

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(a, pre_i4, label='Default Method', color='#68e7a9', linewidth=2)
plt.plot(a, pre_i3, label='Forward Comparison', color='#fe6100',linewidth=2)
plt.plot(a, pre_i1, label='Reverse Comparison', color='#dc267f',linewidth=2)
plt.plot(a, pre_i2, label='Reverse Delta', color='#788ef0',linewidth=2)
plt.xlabel('Tail Identity Threshold')  # Replace with your x-axis label
plt.ylabel('Accuracy')  # Replace with your y-axis label
# plt.xlim(0, 0.6)
plt.legend()
plt.show()

In [None]:
# Number of cases that remain above each tail-identity threshold.
# ret_i1..ret_i4 all threshold the same column, so one curve is enough.
plt.figure(figsize=(8, 5))
plt.plot(a, ret_i1, color='#dc267f')
plt.xlabel('Tail Identity Threshold')
# ret_i1 holds raw counts, not rates, so the axis is labelled as a count.
plt.ylabel('Number of Test cases')
# plt.legend()
plt.show()

# Ortholog genes analysis

## numerical

In [None]:
clp_collection=create_collection_orthologs(clp_scores, clp_heads)
pro_collection=create_collection_orthologs(pro_scores, pro_heads)

In [None]:
clp_tails=ortholog_overlap(aligned_aa_clp,aligned_records_clp)
pro_tails=ortholog_overlap(aligned_aa_pro,aligned_records_pro)

In [None]:
clp_df=create_data_frame_ortho(clp_collection,clp_tails)

In [None]:
print_results(clp_df)

In [None]:
pro_df=create_data_frame_ortho(pro_collection,pro_tails)

In [None]:
print_results(pro_df)

In [None]:
combined_df = pd.concat([pro_df, clp_df], ignore_index=True)

In [None]:
print_results(combined_df)

In [None]:
# meta:    running number -> (i, j) for all 20 x 20 index pairs, row by row
# meta_fl: (i, j) -> running number (the inverse lookup)
pairs=[(i,j) for i in range(20) for j in range(20)]
meta=dict(enumerate(pairs))
meta_fl={pair:number for number,pair in meta.items()}
counter=len(pairs)

In [None]:
c_l=[10,50,100,200,300,400,500,600,700,800,900,1000]

In [None]:
def create_collections(sc,cl,coord_ind,length=20,rem=400):
    '''
    Pair every "real" score sequence with its constructed counterparts,
    separately for each context length.

    sc[n] holds the sequences belonging to cl[n]; coord_ind maps an index
    pair (l, ll) to the position of its sequence inside sc[n]. For l != ll the
    real sequence is the one stored under (l, l), the constructed one under
    (l, ll). A pair is kept only when both sequences have length rem + cl[n].

    structure of the output:
    context: {(ind1,ind2):[real,constructed]}
    '''
    col={}
    for pos,context in enumerate(cl):
        expected=rem+context
        pairs={}
        col[context]=pairs
        for l in range(length):
            for ll in range(length):
                if l==ll:
                    continue
                real=sc[pos][coord_ind[(l,l)]]
                con=sc[pos][coord_ind[(l,ll)]]
                # drop pairs whose sequences do not have the expected length
                if len(real)==expected and len(con)==expected:
                    pairs[(l,ll)]=[real,con]

    return col

In [None]:
def create_collection_full(sc,coord_ind,length=20):
    '''
    Pair every "real" score sequence with its constructed counterparts.

    coord_ind maps an index pair (l, ll) to the position of its sequence in
    sc. For l != ll the real sequence is the one stored under (l, l), the
    constructed one under (l, ll). No length filtering is applied.

    structure of the output:
    {(ind1,ind2):[real,constructed]}
    '''
    return {
        (l,ll):[sc[coord_ind[(l,l)]],sc[coord_ind[(l,ll)]]]
        for l in range(length)
        for ll in range(length)
        if l!=ll
    }

In [None]:
def get_results_FW(cc,coll):
    '''
    Compare real and constructed sequences on the part that follows the
    context (forward direction).

    Parameters
    ----------
    cc : int
        Context length; positions [cc:] of each sequence are the tail.
    coll : dict
        {id: [real, constructed]} with per-position score sequences.

    Scores are geometric means. Each comparison is stored as a boolean
    (real > constructed) under its name and as the absolute score difference
    under name + "_n", for:
      "full"      - the whole sequence
      "tail"      - positions [cc:]
      "tail_<s>"  - positions [cc:cc+s] for s in 50, 100, 150, 200, 250

    Returns
    -------
    dict
        {"overview": {name: (fraction True, count True, total)},
         "full": per-id result lists}
        For an empty coll the fraction is NaN and both counts are 0.
    '''
    def geo_mean(values):
        # geometric mean via the mean of the logs
        return math.exp(np.mean(np.log(values)))

    def record(name,real_part,con_part):
        r=geo_mean(real_part)
        c=geo_mean(con_part)
        res[name].append(r>c)
        res[name+"_n"].append(abs(r-c))

    sc=[50,100,150,200,250]
    names=["full","tail"]+["tail_"+str(s) for s in sc]

    res={"id":[]}
    for name in names:
        res[name]=[]
        res[name+"_n"]=[]

    for i in coll.keys():
        real,con=coll[i][0],coll[i][1]
        res["id"].append(i)
        record("full",real,con)
        record("tail",real[cc:],con[cc:])
        for s in sc:
            record("tail_"+str(s),real[cc:cc+s],con[cc:cc+s])

    total=len(res["id"])
    res_overview={}
    for name in names:
        hits=res[name].count(True)
        # An empty collection (e.g. every pair was filtered out upstream) has
        # no defined accuracy; report NaN instead of dividing by zero.
        res_overview[name]=(hits/total if total else float("nan"),hits,total)

    return {"overview":res_overview,"full":res}

In [None]:
def get_results_RC(cc,coll):
    '''
    Compare real and constructed sequences on the part that precedes the
    last cc positions (reverse direction).

    Parameters
    ----------
    cc : int
        Context length; the last cc positions of each sequence are excluded.
    coll : dict
        {id: [real, constructed]} with per-position score sequences.

    Scores are geometric means. Each comparison is stored as a boolean
    (real > constructed) under its name and as the absolute score difference
    under name + "_n", for:
      "head"      - positions [:-cc]
      "head_<s>"  - positions [-cc-s:-cc] for s in 50, 100, 150, 200, 250

    Returns
    -------
    dict
        {"overview": {name: (fraction True, count True, total)},
         "full": per-id result lists}
        For an empty coll the fraction is NaN and both counts are 0.
    '''
    def geo_mean(values):
        # geometric mean via the mean of the logs
        return math.exp(np.mean(np.log(values)))

    def record(name,real_part,con_part):
        r=geo_mean(real_part)
        c=geo_mean(con_part)
        res[name].append(r>c)
        res[name+"_n"].append(abs(r-c))

    sc=[50,100,150,200,250]
    names=["head"]+["head_"+str(s) for s in sc]

    res={"id":[]}
    for name in names:
        res[name]=[]
        res[name+"_n"]=[]

    for i in coll.keys():
        real,con=coll[i][0],coll[i][1]
        res["id"].append(i)
        record("head",real[:-cc],con[:-cc])
        for s in sc:
            record("head_"+str(s),real[-cc-s:-cc],con[-cc-s:-cc])

    total=len(res["id"])
    res_overview={}
    for name in names:
        hits=res[name].count(True)
        # An empty collection (e.g. every pair was filtered out upstream) has
        # no defined accuracy; report NaN instead of dividing by zero.
        res_overview[name]=(hits/total if total else float("nan"),hits,total)

    return {"overview":res_overview,"full":res}

In [None]:
def get_results_tag(scores,cl,tag,coord_ind):
    '''
    Evaluate every context length in cl.

    The collections are built with create_collections; each one is then
    evaluated with get_results_FW when tag == "FW" and with get_results_RC
    for any other tag.

    results structure
    context:{overview:(%,#,total),"full":dict df style}
    '''
    per_context=create_collections(scores,cl,coord_ind)
    evaluate=get_results_FW if tag=="FW" else get_results_RC
    return {c:evaluate(c,per_context[c]) for c in cl}

In [None]:
def get_results_full(scores,tag,coord_ind):
    '''
    Evaluate the collection built by create_collection_full, using fixed
    400-position windows.

    tag == "RC": "head" is positions [:400], "head_<s>" is [400-s:400].
    any other tag: "full" is the whole sequence, "tail" is positions [-400:],
    "tail_<s>" is [-400:-400+s].
    s runs over 50, 100, 150, 200, 250.

    Scores are geometric means. Each comparison is stored as a boolean
    (real > constructed) under its name and as the absolute score difference
    under name + "_n".

    Returns {"overview": {name: (fraction True, count True, total)},
             "full": per-id result lists}.
    '''
    coll=create_collection_full(scores,coord_ind)
    sc=[50,100,150,200,250]

    # name -> slice to score; None means "use the whole sequence"
    if tag=="RC":
        windows={"head":slice(None,400)}
        for s in sc:
            windows["head_"+str(s)]=slice(400-s,400)
    else:
        windows={"full":None,"tail":slice(-400,None)}
        for s in sc:
            windows["tail_"+str(s)]=slice(-400,-400+s)

    res={"id":[]}
    for name in windows:
        res[name]=[]
        res[name+"_n"]=[]

    for i in coll.keys():
        res["id"].append(i)
        real_seq,con_seq=coll[i][0],coll[i][1]
        for name,window in windows.items():
            real_part=real_seq if window is None else real_seq[window]
            con_part=con_seq if window is None else con_seq[window]
            r=math.exp(np.mean(np.log(real_part)))
            c=math.exp(np.mean(np.log(con_part)))
            res[name].append(r>c)
            res[name+"_n"].append(abs(r-c))

    total=len(res["id"])
    res_overview={}
    for name in windows:
        hits=res[name].count(True)
        res_overview[name]=(hits/total,hits,total)

    return {"overview":res_overview,"full":res}
        

In [None]:
# Evaluate the constructed-sequence scores for each context length in c_l.
# NOTE(review): clp_FW_con, clp_RC_con, pro_FW_con and pro_RC_con are not defined
# in the visible part of this notebook; on a fresh kernel this cell raises
# NameError — TODO add the cell that loads them.
# The two proRS calls use fewer context lengths: c_l[:-1] drops the last
# entry of c_l and c_l[:-2] drops the last two.
results_clp_FW=get_results_tag(clp_FW_con,c_l,"FW",meta_fl)
results_clp_RC=get_results_tag(clp_RC_con,c_l,"RC",meta_fl)
results_pro_FW=get_results_tag(pro_FW_con,c_l[:-1],"FW",meta_fl)
results_pro_RC=get_results_tag(pro_RC_con,c_l[:-2],"RC",meta_fl)

In [None]:
# Same evaluation for the two longer context lengths 1100 and 1200.
# NOTE(review): clp_FW_long and clp_RC_long are not defined in the visible part
# of this notebook — TODO add the cell that loads them.
results_clp_FW_long=get_results_tag(clp_FW_long,[1100,1200],"FW",meta_fl)
results_clp_RC_long=get_results_tag(clp_RC_long,[1100,1200],"RC",meta_fl)

In [None]:
# Evaluation with get_results_full; only element [0] of each score object is used.
# NOTE(review): clp_FW_full, clp_RC_full, pro_FW_full and pro_RC_full are not
# defined in the visible part of this notebook — TODO add the cell that loads them.
results_clp_FW_full=get_results_full(clp_FW_full[0],"FW",meta_fl)
results_clp_RC_full=get_results_full(clp_RC_full[0],"RC",meta_fl)
results_pro_FW_full=get_results_full(pro_FW_full[0],"FW",meta_fl)
results_pro_RC_full=get_results_full(pro_RC_full[0],"RC",meta_fl)

## graphical

In [None]:
sns.set(style='whitegrid')
plt.figure(figsize=(10,8))
ax=sns.scatterplot(data=combined_df,x='head_sc', y="tail_id_200",palette='magma',hue="RCM_head",s=100)
ax.set_xlabel("Score of the head", fontsize=12)
ax.set_ylabel("Tail identity", fontsize=12)
legend = ax.get_legend()
legend.set_title("Reverse Delta")
plt.show()

In [None]:
a=[0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4]
pre_1,ret_1=get_pre_rec_data(combined_df,a,"comparison_head_200")
pre_2,ret_2=get_pre_rec_data(combined_df,a,"RCM_head")
pre_3,ret_3=get_pre_rec_data(combined_df,a,"RCM_tail")

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(a, pre_1, label='Reverse Comparison', color='#dc267f')
plt.plot(a, pre_2, label='Reverse Delta', color='#788ef0')
plt.plot(a, pre_3, label='Forward Delta', color='#fe6100')
plt.xlabel('Confidence Threshold')  # Replace with your x-axis label
plt.ylabel('Accuracy')  # Replace with your y-axis label
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(a, ret_1, label='Reverse Comparison', color='#dc267f')
plt.plot(a, ret_2, label='Reverse Delta', color='#788ef0')
plt.plot(a, ret_3, label='Forward Delta', color='#fe6100')
plt.xlabel('Confidence Threshold')  # Replace with your x-axis label
plt.ylabel('Number of Test cases')  # Replace with your y-axis label
plt.legend()
plt.show()