In [None]:
# Autoreload mode 2 reloads imported modules before each cell is executed, so
# edits to the local helper modules imported in the next cell take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
import numpy as np
import random
import torch
import pickle
import math
from evo import Evo, positional_entropies
from evo.scoring import prepare_batch, score_sequences
from tqdm import tqdm
from generating_utils import perposition_scores, remove_gaps, complement_5_strand
from analysis_utils import read_fasta

In [None]:
'''
This notebook generates the per-position score datasets (saved as pickle files) that are used in the analysis

'''

# Set up

In [None]:
# Load the three reference genomes. Each file is parsed as GenBank and only the
# record's sequence is kept; the sequences are later collected in the `genomes`
# dict under the keys "coli", "sub" and "nella".
# NOTE: the path strings are placeholders as written and must point at the real
# files. Bio.SeqIO.read expects exactly one record per file (see Biopython docs).
# NOTE(review): "nella" is presumably Salmonella (accession AE006468) - confirm.
filename='path to EColiK12.gbff'
record = SeqIO.read(filename, "genbank")
genome_coli=record.seq
filename='path to BacSub.gbff'
record = SeqIO.read(filename, "genbank")
genome_sub=record.seq
filename='path to AE006468.gb'
record = SeqIO.read(filename, "genbank")
genome_nella=record.seq

In [None]:
# Device string handed to model.to() and to every scoring call below.
# NOTE(review): assumes a CUDA GPU is available at index 0.
device = 'cuda:0'
# Seed the torch, NumPy and Python random number generators with the same value.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)


# Instantiate Evo by checkpoint name and pull out the model and tokenizer that
# are passed to the scoring functions defined later in this notebook.
evo_model = Evo('evo-1-131k-base')

model, tokenizer = evo_model.model, evo_model.tokenizer
model.to(device) 
# Presumably a torch.nn.Module, in which case eval() switches it to inference
# mode - TODO confirm against the evo package.
model.eval()
# `dnp` is not read anywhere else in the visible notebook. Because the cell ends
# with this assignment, the return value of model.eval() is not echoed.
dnp=True

In [None]:
'''
* the code that was used to generate metadata is found in gen_metadata script
* metadata is available in the data file
'''

In [None]:
# Metadata for the first set.
# `ht_meta` supplies the "Intersection", "coordinate" and "organism" columns
# that are used below to cut the head/tail windows out of the genomes.
# `sequence_meta` is later passed to generate_dataset, which reads its
# "Intersection", "head" and "tail" columns.
# NOTE: the path strings are placeholders as written.
ht_meta=pd.read_csv("path to metadata_10bp_ht.csv")
sequence_meta=pd.read_csv("path to metadata_10bp_seqs.csv")

In [None]:
# Metadata for the second set; it is used exactly like the first set:
# `ht_meta_2` ("Intersection", "coordinate", "organism") drives the head/tail
# window extraction, `sequence_meta_2` ("Intersection", "head", "tail") is later
# passed to generate_dataset.
# NOTE: the path strings are placeholders as written.
ht_meta_2=pd.read_csv("path to metadata_2_10bp_ht.csv")
sequence_meta_2=pd.read_csv("path to metadata_2_10bp_seqs.csv")

In [None]:
# For every row of ht_meta, cut two windows out of the row's genome and store
# them under the key (Intersection, coordinate):
#   head = genome[k-990 : k+10]   (the 1000 positions ending at k+9)
#   tail = genome[k+10  : k+510]  (the 500 positions that follow)
# NOTE(review): for k < 990 the head start index is negative, so the slice would
# not be the intended window - presumably the metadata excludes such
# coordinates; confirm.
genomes = {"coli": genome_coli, "sub": genome_sub, "nella": genome_nella}
heads = {}
tails = {}
for i in range(len(ht_meta)):
    meta_row = ht_meta.iloc[i]
    k = meta_row["coordinate"]
    window_key = (meta_row["Intersection"], k)
    organism_genome = genomes[meta_row["organism"]]
    tails[window_key] = organism_genome[k + 10:k + 510]
    heads[window_key] = organism_genome[k - 990:k + 10]

In [None]:
# Same window extraction as for the first set, driven by ht_meta_2: for every
# row, store under the key (Intersection, coordinate)
#   head = genome[k-990 : k+10]   (the 1000 positions ending at k+9)
#   tail = genome[k+10  : k+510]  (the 500 positions that follow)
# NOTE(review): for k < 990 the head start index is negative, so the slice would
# not be the intended window - presumably the metadata excludes such
# coordinates; confirm.
genomes = {"coli": genome_coli, "sub": genome_sub, "nella": genome_nella}
heads_2 = {}
tails_2 = {}
for i in range(len(ht_meta_2)):
    meta_row = ht_meta_2.iloc[i]
    k = meta_row["coordinate"]
    window_key = (meta_row["Intersection"], k)
    organism_genome = genomes[meta_row["organism"]]
    tails_2[window_key] = organism_genome[k + 10:k + 510]
    heads_2[window_key] = organism_genome[k - 990:k + 10]

# Functions

In [None]:
def generate_dataset(seq_meta, h_dict, t_dict, model, tokenizer, device):
    """Score every head+tail pairing listed in ``seq_meta`` on both strands.

    For each row, the head stored under ``(Intersection, head)`` in ``h_dict``
    and the tail stored under ``(Intersection, tail)`` in ``t_dict`` are
    converted to ``str`` and concatenated. The concatenation and its
    ``complement_5_strand`` counterpart are each scored with
    ``perposition_scores``.

    Parameters:
        seq_meta: table supporting ``len()`` and ``.iloc[i][column]`` with the
            columns "Intersection", "head" and "tail".
        h_dict: mapping ``(Intersection, head) -> head sequence``.
        t_dict: mapping ``(Intersection, tail) -> tail sequence``.
        model, tokenizer, device: forwarded unchanged to ``perposition_scores``.

    Returns:
        A list with one ``[forward_scores, reverse_scores]`` pair per row, in
        row order. ``reverse_scores`` is the score sequence of the
        complementary strand read back to front.
    """
    def score_one(sequence):
        # perposition_scores takes a batch; a one-element batch is sent and
        # its single result unpacked.
        return perposition_scores([sequence], model, tokenizer, device)[0]

    strand_scores = []
    for row_idx in tqdm(range(len(seq_meta)), desc="Processing tasks", unit="task"):
        row = seq_meta.iloc[row_idx]
        junction = row["Intersection"]
        head_part = str(h_dict[(junction, row["head"])])
        tail_part = str(t_dict[(junction, row["tail"])])
        forward = head_part + tail_part
        reverse = complement_5_strand(forward)
        # Presumably the [::-1] re-aligns the reverse-strand scores with the
        # forward coordinates - assumes complement_5_strand reverses the
        # sequence; confirm in generating_utils.
        strand_scores.append([score_one(forward), score_one(reverse)[::-1]])
    return strand_scores

In [None]:
def generate_head_data(h_meta, h_dict, model, tokenizer, device):
    """Score every head sequence listed in ``h_meta`` on the forward strand.

    Parameters:
        h_meta: table supporting ``len()`` and ``.iloc[i][column]`` with the
            columns "Intersection" and "coordinate".
        h_dict: mapping ``(Intersection, coordinate) -> head sequence``; the
            value is converted with ``str()`` before scoring.
        model, tokenizer, device: forwarded unchanged to ``perposition_scores``.

    Returns:
        A list with one entry per row of ``h_meta``, in row order. Each entry
        is the first element of the ``perposition_scores`` result for that head.

    Raises:
        KeyError: if a row's ``(Intersection, coordinate)`` pair is not a key
            of ``h_dict``.
    """
    head_scores = []
    for row_idx in tqdm(range(len(h_meta)), desc="Processing tasks", unit="task"):
        # Rows must come from the h_meta argument. The previous body read a
        # name `seq_meta` that is not a parameter of this function, so it
        # raised NameError on a fresh kernel, or silently used an unrelated
        # table if such a global happened to exist.
        row = h_meta.iloc[row_idx]
        head_seq = str(h_dict[(row["Intersection"], row["coordinate"])])
        # perposition_scores takes a batch; a one-element batch is sent and
        # its single result unpacked.
        head_scores.append(perposition_scores([head_seq], model, tokenizer, device)[0])
    return head_scores

In [None]:
def generate_ortholog_data(aligned,alignedAA, model, tokenizer,device):
    """Score native and head/tail-swapped sequences for every pair of records.

    Every aligned sequence is split at one shared column into a head and a
    tail, and gaps are stripped from each part with ``remove_gaps``. For each
    unordered pair of records (a, b) with a before b, four sequences are built:
    the two originals (head_a+tail_a, head_b+tail_b) and the two swaps
    (head_a+tail_b, head_b+tail_a). Each is scored with ``perposition_scores``
    directly and via its ``complement_5_strand`` counterpart.

    Parameters:
        aligned: indexable collection of records with a "sequence" entry.
        alignedAA: indexable collection of records with a "sequence" entry;
            only the length of the first one is used, to place the split.
        model, tokenizer, device: forwarded unchanged to ``perposition_scores``.

    Returns:
        A list with one entry per pair, ordered (0,1), (0,2), ..., (1,2), ...
        Each entry is ``[[fw_a, rc_a], [fw_b, rc_b]]`` where ``fw_x`` is
        ``[scores(original_x), scores(swap starting with head_x)]`` and
        ``rc_x`` holds the scores of the complementary strands of the same two
        sequences, each read back to front.
    """
    # Split column: three times half the length of the first alignedAA entry.
    # Presumably this maps an amino-acid midpoint onto a codon-aligned
    # nucleotide alignment - TODO confirm with the alignment files.
    split_at = 3 * (len(alignedAA[0]["sequence"]) // 2)

    def forward_scores(sequence):
        return perposition_scores([sequence], model, tokenizer, device)[0]

    def reverse_scores(complement_sequence):
        # Presumably the [::-1] puts the scores back in forward orientation -
        # assumes complement_5_strand reverses the sequence; confirm.
        return perposition_scores([complement_sequence], model, tokenizer, device)[0][::-1]

    pair_scores = []
    n_records = len(aligned)
    for first in range(n_records):
        for second in range(first + 1, n_records):
            aln_a = aligned[first]["sequence"]
            aln_b = aligned[second]["sequence"]
            head_a = remove_gaps(aln_a[:split_at])
            head_b = remove_gaps(aln_b[:split_at])
            tail_a = remove_gaps(aln_a[split_at:])
            tail_b = remove_gaps(aln_b[split_at:])

            native_a = head_a + tail_a
            native_b = head_b + tail_b
            swap_ab = head_a + tail_b
            swap_ba = head_b + tail_a
            # All complementary strands are derived before any scoring starts.
            rc_native_a, rc_native_b, rc_swap_ab, rc_swap_ba = [
                complement_5_strand(s) for s in (native_a, native_b, swap_ab, swap_ba)
            ]

            fw_a = [forward_scores(native_a), forward_scores(swap_ab)]
            fw_b = [forward_scores(native_b), forward_scores(swap_ba)]
            rc_a = [reverse_scores(rc_native_a), reverse_scores(rc_swap_ab)]
            rc_b = [reverse_scores(rc_native_b), reverse_scores(rc_swap_ba)]

            pair_scores.append([[fw_a, rc_a], [fw_b, rc_b]])

    return pair_scores
            

In [None]:
def generate_ortholog_heads(aligned,alignedAA, model, tokenizer,device):
    """Score the gap-free head part of every aligned record.

    The head is everything before the split column, which is three times half
    the length of the first ``alignedAA`` sequence.

    Parameters:
        aligned: indexable collection of records with a "sequence" entry.
        alignedAA: indexable collection of records with a "sequence" entry;
            only the length of the first one is used.
        model, tokenizer, device: forwarded unchanged to ``perposition_scores``.

    Returns:
        A list with one entry per record of ``aligned``, in order. Each entry
        is the first element of the ``perposition_scores`` result for that
        record's head.
    """
    split_at = 3 * (len(alignedAA[0]["sequence"]) // 2)
    return [
        perposition_scores(
            [remove_gaps(aligned[idx]["sequence"][:split_at])],
            model,
            tokenizer,
            device,
        )[0]
        for idx in range(len(aligned))
    ]

# Application

In [None]:
# Score every head+tail pairing of the first metadata set on both strands.
c10=generate_dataset(sequence_meta, heads, tails, model, tokenizer, device)

In [None]:
# Persist the pair scores of the first metadata set.
with open("scores_10bp_seqs_500.pkl", "wb") as file:
    pickle.dump(c10, file)

In [None]:
# Score the head windows of the first metadata set on their own.
c10h=generate_head_data(ht_meta, heads, model, tokenizer, device)

In [None]:
# Persist the head-only scores of the first metadata set.
with open("scores_10bp_heads.pkl", "wb") as file:
    pickle.dump(c10h, file)

In [None]:
# Score every head+tail pairing of the second metadata set on both strands.
c10_2=generate_dataset(sequence_meta_2, heads_2, tails_2, model, tokenizer, device)

In [None]:
# Persist the pair scores of the second metadata set.
with open("scores_10bp_seqs_500_2.pkl", "wb") as file:
    pickle.dump(c10_2, file)

In [None]:
# Score the head windows of the second metadata set on their own.
c10h_2=generate_head_data(ht_meta_2, heads_2, model, tokenizer, device)

In [None]:
# Persist the head-only scores of the second metadata set.
with open("scores_10bp_heads_2.pkl", "wb") as file:
    pickle.dump(c10h_2, file)

In [None]:
# clpA orthologs: read the two alignment files, then compute the pairwise
# scores and the head-only scores. `aligned_aa` is only used inside the two
# functions to derive the head/tail split column.
# NOTE(review): presumably the first file is the nucleotide alignment and the
# second the matching protein alignment - confirm.
aligned_records=read_fasta("picked_clpA.txt")
aligned_aa=read_fasta("picked_clpA_prot.txt")
sc_clp=generate_ortholog_data(aligned_records,aligned_aa, model, tokenizer,device)
head_clp=generate_ortholog_heads(aligned_records,aligned_aa, model, tokenizer,device)

In [None]:
# proRS orthologs: read the two alignment files, then compute the pairwise
# scores and the head-only scores. `aligned_aa` is only used inside the two
# functions to derive the head/tail split column.
# `aligned_records` and `aligned_aa` are rebound here, replacing the clpA data.
# NOTE(review): presumably the first file is the nucleotide alignment and the
# second the matching protein alignment - confirm.
aligned_records=read_fasta("picked_proRS.txt")
aligned_aa=read_fasta("picked_prot_proRS.txt")
sc_pro=generate_ortholog_data(aligned_records,aligned_aa, model, tokenizer,device)
head_pro=generate_ortholog_heads(aligned_records,aligned_aa, model, tokenizer,device)

In [None]:
# Save the ortholog results, each to its own pickle file: the pairwise scores
# from generate_ortholog_data first, then the head-only scores from
# generate_ortholog_heads.
with open("scores_clpA_seq.pkl", "wb") as file:
    pickle.dump(sc_clp, file)
with open("scores_proRS_seq.pkl", "wb") as file:
    pickle.dump(sc_pro, file)
    
with open("scores_proRS_head.pkl", "wb") as file:
    pickle.dump(head_pro, file)    
with open("scores_clpA_head.pkl", "wb") as file:
    pickle.dump(head_clp, file)