In [1]:
# imports
import os
import sys
import json

import eagledb
import eagle
from eagledb.scheme import GenomeInfo
from eagle.lib.seqs import SeqsDict

# Overwrite the first module search path entry with the absolute path of the
# parent directory.
# NOTE(review): this runs after the eagledb / eagle imports above, so it cannot
# influence how those packages were resolved - confirm whether it was meant to
# precede them or only serves imports made in later cells.
sys.path[0] = os.path.abspath("../")

In [2]:
# constants

# Base locations and the delimiter used for the metadata file.
SOURCES_DIR = "source"
PREPARED_DIR = "prepared"
FNA_META_SEP = ","

# Files located under SOURCES_DIR.
FNA_PATH = os.path.join(SOURCES_DIR, "CAM_P_0001000.nt.fa")
FNA_META_PATH = os.path.join(SOURCES_DIR, "fna_meta.csv")
RNA_18S_PATH = os.path.join(SOURCES_DIR, "18s.fasta")

# Files located under PREPARED_DIR.
PREPARED_FNA_PATH = os.path.join(PREPARED_DIR, "transcriptomes.fna")
PREPARED_18S_PATH = os.path.join(PREPARED_DIR, "18s.fasta")
TRANSCRIPTOMES_PATH = os.path.join(PREPARED_DIR, "transcriptomes.json")

In [None]:
# data preparation
def prepare_data(fna_path=FNA_PATH,
                 rna_18s_path=RNA_18S_PATH,
                 fna_meta_path=FNA_META_PATH,
                 fna_meta_sep=FNA_META_SEP,
                 transcriptomes_path=TRANSCRIPTOMES_PATH,
                 prepared_fna_path=PREPARED_FNA_PATH,
                 prepared_18s_path=PREPARED_18S_PATH):
    """Build the prepared transcriptome / 18S rRNA files from the source data.

    Steps performed:
      1. Load the sample mapping with ``get_samples_dict``.
      2. Load the 18S rRNA sequences (FASTA) and derive their name mapping
         with ``convert_rna_names``.
      3. Run ``convert_fna_names`` on the transcriptome FASTA, passing
         ``prepared_fna_path`` as its output path.
      4. Dump one ``GenomeInfo`` JSON record per sample to
         ``transcriptomes_path``.
      5. Rename the 18S sequences to the sample names, keep only those
         samples and dump them to ``prepared_18s_path``.

    Args:
        fna_path: input transcriptome FASTA file.
        rna_18s_path: input 18S rRNA FASTA file.
        fna_meta_path: metadata file handed to ``get_samples_dict``.
        fna_meta_sep: field separator of the metadata file.
        transcriptomes_path: output JSON file with the ``GenomeInfo`` records.
        prepared_fna_path: output path handed to ``convert_fna_names``.
        prepared_18s_path: output FASTA file for the renamed 18S sequences.

    Returns:
        None. Results are written to the output paths and progress is printed.
    """
    # Make sure every output location exists up front; otherwise the first
    # write fails on a fresh checkout where the output directory is missing.
    for out_path in (prepared_fna_path, transcriptomes_path, prepared_18s_path):
        out_dir = os.path.dirname(out_path)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    samples_dict = get_samples_dict(fna_meta_path=fna_meta_path, sep=fna_meta_sep)

    rna_seqs = SeqsDict.load_from_file(rna_18s_path, seqs_format="fasta", low_memory=False)
    print("18S rRNA sequences read")
    rna_names_conv = convert_rna_names(rna_seqs, samples_dict)
    print("\n")
    fna_names_conv = convert_fna_names(in_fna_path=fna_path,
                                       sample_names=rna_names_conv,
                                       out_fna_path=prepared_fna_path)
    print("Transcriptome sequences read")

    # One JSON record per sample; all records point at the same prepared FASTA
    # and differ by the list of sequence ids that belong to the sample.
    transcriptomes = [
        GenomeInfo(genome_id=sample_name,
                   org_name=samples_dict[sample_name],
                   fna_path=prepared_fna_path,
                   fna_id_list=list(fna_names_conv[sample_name])).get_json()
        for sample_name in fna_names_conv
    ]
    with open(transcriptomes_path, "w") as transcriptomes_f:
        json.dump(transcriptomes, transcriptomes_f, indent=2)
    print("%s transcriptomes prepared" % len(transcriptomes))

    # The mapping passed to rename_seqs is keyed by rna_names_conv[sample] with
    # the sample name as value (presumably old name -> new name; verify against
    # SeqsDict.rename_seqs). Only samples present in fna_names_conv are kept.
    rna_seqs.rename_seqs({rna_names_conv[t_name]: t_name for t_name in fna_names_conv})
    rna_seqs.get_sample(list(fna_names_conv), low_memory=False).dump(prepared_18s_path, seqs_format="fasta")

# Run the preparation with its default arguments (the paths from the constants cell).
prepare_data()