In [133]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict

In [17]:
# Input files used by the cells below.
# ko_list: text file read line by line and split on tabs into locus / KO fields.
# pangenome_faa / pangenome_fa: files parsed as FASTA ('>' header lines);
# presumably the protein and nucleotide versions of the same pangenome
# reference — TODO confirm.
ko_list = 'user_ko.txt'
pangenome_faa = '../pan_genome_reference.faa'
pangenome_fa = '../pan_genome_reference.fa'

In [118]:
def _read_fasta(path, keep_seq=True):
    """Parse a FASTA file into a list of ``{'locus', 'name', 'seq'}`` dicts.

    The first whitespace-delimited token of each ``>`` header becomes
    ``'locus'``; the remainder of the header becomes ``'name'`` (an empty
    string when the header carries no description, instead of raising
    ``IndexError``).

    Parameters
    ----------
    path : str
        Path of the FASTA file to read.
    keep_seq : bool, default True
        When False, sequence lines are skipped and ``'seq'`` is left as ``''``;
        useful when only the header information is needed.

    Returns
    -------
    list of dict
        One dict per header, in file order. Lines that appear before the first
        header, or after a bare ``>`` header with no ID, are ignored.
    """
    records = []
    chunks_by_record = []
    chunks = None  # sequence pieces of the record currently being read

    with open(path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if text.startswith('>'):
                header_parts = text[1:].split(maxsplit=1)
                if not header_parts:
                    # Bare '>' with no locus ID: nothing to key the record on.
                    chunks = None
                    continue
                chunks = []
                chunks_by_record.append(chunks)
                records.append({
                    'locus': header_parts[0],
                    'name': header_parts[1] if len(header_parts) > 1 else '',
                    'seq': '',
                })
            elif keep_seq and chunks is not None:
                chunks.append(text)

    # Join once per record rather than concatenating strings line by line.
    for record, pieces in zip(records, chunks_by_record):
        record['seq'] = ''.join(pieces)
    return records


# Records (locus, name, seq) from the .faa reference.
pgref = _read_fasta(pangenome_faa)

# locus -> header description taken from the .fa reference (last one wins on
# duplicate loci, as before).
pgref2 = {
    record['locus']: record['name']
    for record in _read_fasta(pangenome_fa, keep_seq=False)
}

# Prefer the name found in the .fa headers; when a locus is absent from the
# .fa file keep the name parsed from the .faa header instead of raising KeyError.
for record in pgref:
    record['name'] = pgref2.get(record['locus'], record['name'])

In [119]:
# One row per parsed reference record. Columns are listed explicitly so the
# frame still has a 'locus' column (required by the merge on 'locus') even
# when pgref is empty.
df = pd.DataFrame(pgref, columns=['locus', 'name', 'seq'])

In [120]:
# Parse the KO assignment list: each useful line is "<locus>\t<KO>".
# Lines without a tab, or with an empty KO field, carry no assignment and are
# skipped so they surface as NaN (not as '') after the left merge.
kos = []
with open(ko_list, 'r') as f:
    for line in f:  # stream the file instead of materialising every line
        fields = line.split('\t')
        if len(fields) < 2:
            continue
        ko_id = fields[1].strip()
        if not ko_id:
            continue
        kos.append({
            'locus': fields[0].strip(),
            'KO': ko_id,
        })

# Explicit columns keep 'locus' / 'KO' present even if no assignment was found.
kodf = pd.DataFrame(kos, columns=['locus', 'KO'])

In [121]:
# Left-join the KO assignments onto the reference table by locus; every row of
# df is kept, and rows without a match in kodf get NaN in 'KO'.
pgref_df = df.merge(kodf, how='left', on='locus')

In [122]:
# NOTE(review): self-assignment — this statement leaves pgref_df unchanged.
pgref_df = pgref_df

In [123]:
# Write the KO-annotated reference table as a tab-separated file with a header
# row and without the index column.
pgref_df.to_csv(
    'longread_pangenome_reference_faa_v4_KO.tsv',
    sep='\t',
    header=True,
    index=False,
)

In [124]:
# Path of the raw text file parsed below into locus / family / KO records.
# Presumably saved from KEGG Mapper output — TODO confirm provenance.
ko_map_raw = 'KEGG_mapper_raw.txt'

In [188]:
# Parse the raw mapping text into one record per locus occurrence.
#
# Line types, checked in this order:
#   "Protein families: <family>"  -> sets the current family
#   "ko<...> <description>"       -> sets the current top-level ID + description
#   "K<5 digits>..."              -> sets the current KO
#   anything else                 -> comma-separated locus IDs tagged with the
#                                    current family / top-level ID / KO
#
# A KO header must be 'K' followed by exactly five digits (the KO identifier
# format), so a locus tag that merely starts with 'K' is not mistaken for one.
_KO_HEADER = re.compile(r'K\d{5}\b')

kmaps = []
with open(ko_map_raw, 'r') as f:
    current_fam = None
    current_ko = None
    current_kot = None
    current_ko_desc = None

    for line in f:
        line = line.strip()
        if not line:
            # Blank lines would otherwise fall through to the locus branch and
            # create a record with an empty locus.
            continue

        if line.startswith("Protein families:"):
            # Split once so a family name containing ':' is not truncated.
            current_fam = line.split(":", 1)[1].strip()

        elif line.startswith("ko"):
            parts = line.split()
            current_kot = parts[0]
            current_ko_desc = " ".join(parts[1:])

        elif _KO_HEADER.match(line):
            current_ko = line

        else:
            # Split the line by comma to handle multiple locus IDs; drop empty
            # pieces left by trailing or doubled commas.
            for locus in line.split(','):
                locus = locus.strip()
                if not locus:
                    continue
                kmaps.append({
                    'locus': locus,
                    'family': current_fam,
                    'KO_top': current_kot,
                    'KO_mid': current_ko_desc,
                    'KO': current_ko,
                })

In [189]:
# One row per parsed locus record. Columns are listed explicitly so that
# 'locus' and 'KO_mid' exist (they are used by later cells) even when kmaps is empty.
kmaps_df = pd.DataFrame(
    kmaps,
    columns=['locus', 'family', 'KO_top', 'KO_mid', 'KO'],
)

In [192]:
# Remove the "(<number>)" suffix from the description and trim the whitespace
# it leaves behind. The vectorised .str methods pass missing values through
# (re.sub would raise TypeError on None), and the raw-string pattern avoids
# the invalid "\(" escape of a plain string literal.
kmaps_df['KO_mid'] = (
    kmaps_df['KO_mid']
    .str.replace(r'\([0-9]+\)', '', regex=True)
    .str.strip()
)

In [193]:
# Save the cleaned mapping table as a tab-separated file with a header row and
# without the index column.
kmaps_df.to_csv(
    'kegg_mapper_fixed.tsv',
    sep='\t',
    index=False,
    header=True,
)

In [194]:
# Left-join the mapping table onto the KO-annotated reference by locus. Both
# inputs have a 'KO' column, so the result carries them as 'KO_x' (from
# pgref_df) and 'KO_y' (from kmaps_df).
new_df = pgref_df.merge(kmaps_df, how='left', on='locus')


In [195]:
# Display the merged table (the bare last expression of a cell is rendered as its output).
new_df

Unnamed: 0,locus,name,seq,KO_x,family,KO_top,KO_mid,KO_y
0,DMPBEC_00005,Uncharacterized protein BB_0001,MKYSAILLICSVNLFCFQNKLTTSRWEFPKEDLIKKKIKIGIIYHN...,,,,,
1,DMPBEC_00010,Glycoside hydrolase family 3 N-terminal domain...,MDFLKTFSFLFFSFFCLNLIAIESLPEIDYEYFNKDKSDLVDLIKF...,K01207,,,,
2,DMPBEC_00015,group_1482,MNVKVDKIFSEMILEKLNSGEICPNNFESIKYFPCDSHENIFNISD...,,metabolism,ko01000,Enzymes,K00963
3,DMPBEC_00020,group_3841,MLKQYSLNMKNFKKAFDEMIFSPSGFRKIFAKSKNEDSTENEINNE...,,,,,
4,DMPBEC_00025,trpS,LKRKVMLTGDRPTGALHLGHYVGSVVNRLKFQEEYETYFIIADLHT...,K01867,metabolism,ko01000,Enzymes,K01867
...,...,...,...,...,...,...,...,...
4245,JOOONJ_05885,group_704,MDKLKIIFSLIKLYSLSFDKSLNIEFSDISSVISLIENILCNETSS...,,,,,
4246,JOOONJ_05900,group_799,VILIFLTYLVPSALWSEFSSRVLRKYIFANYKLNYIYQFQNQKRFK...,,,,,
4247,JOOONJ_06150,group_1312,MTRKMFVVYAILALTSCCKNYESNVELKKQIEEFLNTKEIAENVVK...,,,,,
4248,JOOONJ_06260,group_2487,MKKISSAIFMVAFFAFINCKSNVGESSNTKDPTNEFYQSVIKLGNG...,,,,,


In [196]:
def merge_ko(ko_x, ko_y):
    """Combine two KO annotations for the same locus into a single value.

    Parameters
    ----------
    ko_x, ko_y
        The two candidate annotations; either may be missing (``pd.isna``).

    Returns
    -------
    ``np.nan`` when both are missing; the one present value when only one is
    missing; the shared value when both agree; otherwise the string
    ``"<ko_x>,<ko_y>"``. In that last (conflicting) case the pair is also
    printed so disagreements are visible in the cell output.
    """
    x_missing = pd.isna(ko_x)
    y_missing = pd.isna(ko_y)

    if x_missing and y_missing:
        return np.nan
    if x_missing:
        return ko_y
    if y_missing or ko_x == ko_y:
        return ko_x

    # Both present and different: report the conflict and keep both.
    print(ko_x, ko_y)
    return f"{ko_x},{ko_y}"

In [197]:
# Collapse the two KO columns produced by the merge into a single 'KO' column
# (row by row, via merge_ko), then drop the originals.
new_df['KO'] = new_df.apply(
    lambda row: merge_ko(row['KO_x'], row['KO_y']),
    axis='columns',
)
new_df = new_df.drop(['KO_x', 'KO_y'], axis='columns')

K15580 K02035
K15580 K02035
K15580 K02035
K15581 K02033
K15582 K02034
K15583 K02031
K10823 K02032
K25152 K01990
K02760 K02759
K02760 K02759
K15580 K02035
K15580 K02035
K15580 K02035


In [198]:
# Write the final merged table as a tab-separated file without the index column.
new_df.to_csv(
    'longread_pangenome_reference_faa_v4_keggMapped_v2.tsv',
    sep='\t',
    index=False,
)