In [14]:
import pandas as pd
from glob import glob
from Bio import SeqIO
import seaborn as sns
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Show full cell contents when DataFrames are displayed (no column-width truncation).
pd.options.display.max_colwidth = None

In [15]:
# glob() yields matches in arbitrary filesystem order; sorting keeps the order in
# which the CSVs are later concatenated (and so the exported row order) reproducible.
orf1_paths = sorted(glob('/home/tobamo/analize/2023-12-01_tobamo_analysis/results/testB_results/rdrp_orf1/*.csv'))

In [17]:
# Read every CSV listed in orf1_paths, tag its rows with the source file's base
# name truncated at the first '.', and stack everything into one frame with a
# fresh 0..n-1 index.
df = pd.concat(
    [
        pd.read_csv(csv_path).assign(
            filename=csv_path.rsplit("/", 1)[-1].partition(".")[0]
        )
        for csv_path in orf1_paths
    ],
    ignore_index=True,
)

In [18]:
def get_sequence_keys(virga_path, tobamo_path):
    """Split the record IDs of one FASTA file by membership in a second FASTA file.

    Parameters
    ----------
    virga_path : str
        Path to the FASTA file holding the complete set of records
        (presumably all virga-like references -- confirm against the data).
    tobamo_path : str
        Path to the FASTA file whose record IDs form the tobamo group.

    Returns
    -------
    og_keys : list
        IDs present in ``virga_path`` but absent from ``tobamo_path``, in the
        order they occur in ``virga_path``.
    tobamo_keys : list
        Every ID in ``tobamo_path``, in file order.

    Notes
    -----
    ``SeqIO.to_dict`` is kept on purpose: it rejects files with duplicate
    record IDs instead of silently merging them.
    """
    all_keys = list(SeqIO.to_dict(SeqIO.parse(virga_path, "fasta")))
    tobamo_keys = list(SeqIO.to_dict(SeqIO.parse(tobamo_path, "fasta")))

    # Filter in file order against a set: a plain set difference would return
    # the IDs in an order that varies between interpreter runs.
    tobamo_lookup = set(tobamo_keys)
    og_keys = [key for key in all_keys if key not in tobamo_lookup]
    return og_keys, tobamo_keys

# One entry per reference set: (full FASTA, tobamo-only FASTA, variable-name prefix)
files = [
    ('/home/tobamo/analize/2023-12-01_tobamo_analysis/data/refs/virga_rdrp_orf1.txt', '/home/tobamo/analize/2023-12-01_tobamo_analysis/data/refs/tobamo_rdrp_orf1.txt', 'rdrp_orf1'),
    ('/home/tobamo/analize/2023-12-01_tobamo_analysis/data/refs/virga_rdrp_orf2.txt', '/home/tobamo/analize/2023-12-01_tobamo_analysis/data/refs/tobamo_rdrp_orf2.txt', 'rdrp_orf2'),
    ('/home/tobamo/analize/2023-12-01_tobamo_analysis/data/refs/cp.fasta', '/home/tobamo/analize/2023-12-01_tobamo_analysis/data/refs/cp_tobamo.fasta', 'cp'),
    ('/home/tobamo/analize/2023-12-01_tobamo_analysis/data/refs/mp.fasta', '/home/tobamo/analize/2023-12-01_tobamo_analysis/data/refs/mp_tobamo.fasta', 'mp'),
]

# For each reference set, publish notebook-level lists named after its prefix:
#   <prefix>_virga and <prefix>_og -> first list returned by get_sequence_keys
#   <prefix>_tobamo                -> second list returned by get_sequence_keys
# Note that <prefix>_virga and <prefix>_og are bound to the very same list object.
for file, tobamo_file, prefix in files:
    og_keys, tobamo_keys = get_sequence_keys(file, tobamo_file)
    og_var = f'{prefix}_virga'
    tobamo_var = f'{prefix}_tobamo'

    globals().update({
        og_var: og_keys,
        f'{prefix}_og': og_keys,
        tobamo_var: tobamo_keys,
    })

In [19]:
# classify one ORF/reference pair by the group membership of both names
def determine_type_v2(row, og_list, tobamo_list):
    """Return the pair-type label for a single row.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'orf_name'`` and ``'ref_name'``.
    og_list : container
        Names counted as "og"; only ``in`` membership tests are performed.
    tobamo_list : container
        Names counted as "tobamo"; only ``in`` membership tests are performed.

    Returns
    -------
    str
        ``'tobamo-tobamo'`` when both names are in ``tobamo_list``; otherwise
        ``'og-og'`` when both are in ``og_list``; otherwise ``'og-tobamo'``.
        The last label is also the fallback for names found in neither container.
    """
    pair = (row['orf_name'], row['ref_name'])

    # The tobamo check runs first, so it wins if a name sits in both containers.
    if all(name in tobamo_list for name in pair):
        return 'tobamo-tobamo'

    if all(name in og_list for name in pair):
        return 'og-og'

    return 'og-tobamo'

In [20]:
# Label every ORF/reference pair. The ID lists are passed as sets so each
# membership test inside the row-wise apply is O(1) instead of a list scan;
# determine_type_v2 only uses `in`, so the resulting labels are unchanged.
df['pair_type'] = df.apply(
    determine_type_v2,
    og_list=set(rdrp_orf1_og),
    tobamo_list=set(rdrp_orf1_tobamo),
    axis=1,
)

# Drop og-og pairs. The explicit copy() turns the filtered result into an
# independent frame, so adding the 'len' column below does not trigger
# SettingWithCopyWarning.
df = df[df.pair_type != 'og-og'].copy()  # 5021000 (original note; presumably the remaining row count -- confirm)

# 'len' is parsed from the file name: take the second-to-last '-'-separated
# field, keep the part before its first '_', and convert it to int.
df['len'] = [int(el.split('-')[-2].split('_')[0]) for el in df['filename']]

df.to_csv('/home/tobamo/analize/project-tobamo/analysis/fragmented_orf/results/combined_testB_data.csv', index=False)