In [1]:
import pandas as pd
from utils import *
from Bio import SeqIO
import re

In [64]:
ref_path = '/home/tobamo/analize/2022-01-03_diamond_1000plant/baza/identical_protein_groups_tobamovirus_2.fasta'


replacements = {
    'RNA-dependent_RNA_polymerase': 'rdrp',
    'readthrough': 'rdrp',
    'coat': 'coat',
    'movement': 'movement',
    'transport': 'movement',
    'MP': 'movement'
}

data = []
with open(ref_path, 'r') as fin:
    for record in SeqIO.parse(fin, 'fasta'):
        description = record.id
        
        protein = next((v for k, v in replacements.items() if k in description), 'other')
        
        data.append({
            'ref_name':record.id,
            'seq_len': len(str(record.seq)),
            'protein': protein,
            # 'seq': str(record.seq)
        })

    ref_df = pd.DataFrame(data)
    
    def get_len_type(seq_len):
        # List of conditions with corresponding labels
        conditions = [
            (1116 <= seq_len <= 1160, 'orf1'),
            (seq_len > 1160, 'rdrp'),
            (124 <= seq_len <= 208, 'coat'),
            (334 <= seq_len <= 501, 'orf2'),
            (216 <= seq_len <= 311, 'movement'),
        ]
        # Find and return the first condition that is True
        for condition, label in conditions:
            if condition:
                return label
        # Default to 'other' if no condition is met
        return 'other'

    ref_df['guess_protein_type_based_on_len'] = ref_df['seq_len'].apply(get_len_type)

pd.set_option('max_colwidth', 800)
ref_df.head(15)

Unnamed: 0,ref_name,seq_len,protein,guess_protein_type_based_on_len
0,AAB29319.1_transport_protein_[Tobamovirus],267,movement,movement
1,QNH88584.1_putative_coat_protein_[Hoya_tobamovirus_2],136,coat,coat
2,QNH88582.1_putative_coat_protein_[Hoya_tobamovirus_2],179,coat,coat
3,QED42912.1_putative_MP_[Mycosphaerella_tobamovirus_B],177,movement,coat
4,QNH88585.1_putative_movement_protein_[Hoya_tobamovirus_2],296,movement,movement
5,"QNH88583.1_putative_movement_protein,_partial_[Hoya_tobamovirus_2]",102,movement,other
6,QNH88581.1_putative_movement_protein_[Hoya_tobamovirus_2],296,movement,movement
7,QNH88586.1_putative_coat_protein_[Hoya_tobamovirus_2],159,coat,coat
8,QED42913.1_putative_CP_[Mycosphaerella_tobamovirus_B],332,other,other
9,"QED43014.1_RdRp,_partial_[Leucocoprinus_tobamovirus_B]",346,other,orf2


In [2]:
raw = pd.read_csv('/home/tobamo/analize/project-tobamo/results/megan6_results_combined_add_nr_taxa.csv')
getorf_path = '/home/tobamo/analize/project-tobamo/analysis/whole_orf/results/getorf_output_find1_min_360_max_3510.fasta'

In [77]:
df = raw[['SRR', 'qseqid', 'tpdb2_pident', 'tpdb2_length','tpdb2_qstart', 'tpdb2_qend', 'tpdb2_sstart', 'tpdb2_send','tpdb2_evalue', 'tpdb2_sseqid']].dropna(subset='tpdb2_pident')
df['orientation'] = df.apply(lambda row: 'FORWARD' if row['tpdb2_qstart'] < row['tpdb2_qend'] else 'REVERSE', axis=1)
df["reading_frame"] = df['tpdb2_qstart'].astype(int) % 3

In [94]:
def parse_fasta_to_df(fasta_path):
    # Step 1: Extract data from the FASTA file
    data = []
    with open(fasta_path, "r") as file:
        for record in SeqIO.parse(file, "fasta"):
            description = record.description
            seq = str(record.seq)
            data.append({
                'orf_name': description.split()[0].replace('=', '_'),
                'orf_start': int(description.split('[')[1].split(' - ')[0]),
                'orf_end': int(description.split(' - ')[1].split(']')[0]),
                'orientation': 'REVERSE' if "(REVERSE SENSE)" in description else 'FORWARD',
                'seq': seq
            })
    
    # Step 2: Create DataFrame from extracted data
    df = pd.DataFrame(data)

    # Step 3: Add additional information
    def remove_last_number(s):
        return re.sub(r"_(\d+)$", "", s)

    df["qseqid"] = df["orf_name"].apply(lambda x: remove_last_number(x))
    # df["SRR"] = df["qseqid"].str.extract(r"_([A-Za-z0-9]+)$")
    df["seq_len"] = df["seq"].apply(len)
    df["reading_frame"] = df['orf_start'] % 3

    # Order of the columns to maintain consistency
    columns = ["qseqid", "orf_name", "seq_len", "orf_start", "orf_end", "orientation", "reading_frame", "seq" ]
    df = df[columns]
    
    return df

getorf = parse_fasta_to_df(getorf_path)

In [95]:
df_sub = df[df.qseqid == 'NODE_46_length_7255_cov_4.852187_SRR6660821']
go_sub = getorf[getorf.qseqid == 'NODE_46_length_7255_cov_4.852187_SRR6660821']

In [96]:
getorf[getorf.qseqid == 'NODE_1426_length_3115_cov_410.357472_SRR4435987']

Unnamed: 0,qseqid,orf_name,seq_len,orf_start,orf_end,orientation,reading_frame,seq
520,NODE_1426_length_3115_cov_410.357472_SRR4435987,NODE_1426_length_3115_cov_410.357472_SRR4435987_1,973,196,3114,FORWARD,1,MSSDTLDRFMLSLGDNTSQEVVSRLLRDKTTSTYNIVDQTCSDLIKGKLHRPVGEHSEVIHTKLDPKMQKIFEDTYPEFNFKFVGDVSVGHPFLNVSRSLETNLILKTFLKIDQAKAIMSRYEFYDDYVTDVGGNSVNNILKNLEHVHSCEPVLANYDAYRNANRLEKLMHMHITNDVRSNNVKSLLIDDKVRCRRKVQDCKRTSLFMIAIHSIYDINIRDLADAMDNKKSMVCYGTFMFNPDILLENHGYLERIDVQYKIQGDKITFVHVNDTSFGYSHSIKTYLPYITTPVVYSRKGTCYVKELLFNRNDIQYFKMMRIESTPRNDIFYGITFPSWANKTRILYYDWFYAGAGNNNIRMTLKPRTLIVDTNFYNDVTNYAFGLSDTKFQPSVIYDYAKSTSSRVVHDAKNIKEFRFKDPDFLYSVSFAIYMYVYQKKYEHGKIIQALISEQKILREGTELTIFQKIMYHVRNLLGLDHPVGAKTFAKIPYEELIMMFQNNIISVSKNDVEFVKIYNGVIGSKSLFKKRDTLNIRYFAPDYDDYCVMDKLDNEIMNVCDLVFNESEGMVKKVSNTDSLTEDPVNKFVGAGIATEFHIGNTVPCTLHSHKAISNKADGLCLYYALVASEEKAIIDQMIETLMKNDMSYLPIKVQTELKQTLMNKQPGNIDMVEHFCHTFNSYVIIHIKNEKGCNAIGFGNRFKAAEMIHLLFTPNPINKYGIGHYERIDTTEGKVIGMSLPDIVERMNVQPAVLKESVLKHDYKITRQRFSILSYSFFKTQTNIFEVIENMPFYTD...


In [97]:
df[df.qseqid == 'NODE_1426_length_3115_cov_410.357472_SRR4435987']

Unnamed: 0,SRR,qseqid,tpdb2_pident,tpdb2_length,tpdb2_qstart,tpdb2_qend,tpdb2_sstart,tpdb2_send,tpdb2_evalue,tpdb2_sseqid,orientation,reading_frame
15181,SRR4435987,NODE_1426_length_3115_cov_410.357472_SRR4435987,19.7,502.0,244.0,1626.0,5.0,487.0,1.1e-20,AZT88674.1_replicase_[Acidomyces_richmondensis_tobamo-like_virus_1],FORWARD,1
15182,SRR4435987,NODE_1426_length_3115_cov_410.357472_SRR4435987,19.7,502.0,244.0,1626.0,5.0,487.0,1.1e-20,AZT88673.1_RNA-dependent_RNA_polymerase_[Acidomyces_richmondensis_tobamo-like_virus_1],FORWARD,1


#### FILTER GETORF based on reading frame and orientation

In [121]:
len(getorf)

1737

In [129]:
getorf.columns

Index(['qseqid', 'orf_name', 'seq_len', 'orf_start', 'orf_end', 'orientation',
       'reading_frame', 'seq'],
      dtype='object')

In [148]:
def filter_getorf_output(getorf_df, tpdb2_diamond):
    filtered_df = getorf_df.merge(tpdb2_diamond, on=['qseqid', 'reading_frame', 'orientation'], how='inner')
    aggregated_df = filtered_df.groupby(['qseqid', 'orf_name', 'seq_len', 'orf_start', 'orf_end', 'orientation',
       'reading_frame', 'SRR', 'seq'], as_index=False).agg({'tpdb2_sseqid': lambda x: ','.join(x)})
    return aggregated_df

filtered_getorf = filter_getorf_output(getorf, df)
filtered_getorf.to_csv('filtered_getorf.csv', index=False)