In [2]:
import json
import matplotlib.pyplot as plt
import os
import sys
import typing
import pandas as pd
from pathlib import Path
 

In [3]:
# Put the directory two levels above the working directory on the import path
# so the project-level imports in the following cell can be resolved
# (presumably the repository root when the notebook runs from its own folder).
sys.path.append(str(Path(os.getcwd()).parent.parent))

In [4]:
from input_testing_data.generate_input_testing_data_for_modules import generate_testing_data_for_ecoli_and_bacillus
from modules.main import run_modules
from modules.shared_functions_and_vars import write_fasta

In [5]:
def run_single_method_for_orf_sequence(optimization_method: str,
                                       is_ecoli_optimized: bool,
                                       orf_sequence: typing.Optional[str] = None,
                                       orf_sequence_file: typing.Optional[str] = None,
                                       output_path: typing.Optional[str] = None,
                                       optimization_cub_index: str = "CAI"):
    """Run one optimization method on a single ORF and return the pipeline result.

    The user input is built by ``generate_testing_data_for_ecoli_and_bacillus``
    with ``clusters_count=1`` and ``tuning_param=0.5`` and then handed to
    ``run_modules``.

    Args:
        optimization_method: Name of the optimization method, forwarded as-is.
        is_ecoli_optimized: Forwarded as-is; presumably selects whether E. coli
            (True) or B. subtilis (False) is the optimized organism - confirm
            against the input generator.
        orf_sequence: The ORF sequence itself, forwarded as ``sequence``.
        orf_sequence_file: Path to a file holding the ORF, forwarded as
            ``sequence_file_path``.
        output_path: Sub-directory of ``results`` for this run. When ``None``
            the bare ``results`` directory is used.
        optimization_cub_index: Codon usage bias index name (default ``"CAI"``).

    Returns:
        Whatever ``run_modules`` returns for the generated input.
    """
    # os.path.join raises TypeError on None, and None is the declared default,
    # so fall back to the top-level results directory in that case.
    if output_path is None:
        results_dir = "results"
    else:
        results_dir = os.path.join("results", output_path)

    default_user_inp_raw = generate_testing_data_for_ecoli_and_bacillus(
        optimization_method=optimization_method,
        optimization_cub_index=optimization_cub_index,
        clusters_count=1,
        tuning_param=0.5,
        is_ecoli_optimized=is_ecoli_optimized,
        sequence=orf_sequence,
        sequence_file_path=orf_sequence_file,
        output_path=results_dir,
    )
    return run_modules(default_user_inp_raw)

In [6]:
def run_all_methods(orf_sequence = None, orf_sequence_file = None, output_path = None):
    """Run every enabled optimization method on one ORF.

    Each enabled method is executed for both CUB indices ("CAI", "tAI") and
    for both values of ``is_ecoli_optimized`` (True, then False), by calling
    ``run_single_method_for_orf_sequence``. Return values are discarded.

    Args:
        orf_sequence: ORF sequence, forwarded unchanged.
        orf_sequence_file: Path to a file with the ORF, forwarded unchanged.
        output_path: Output sub-directory name, forwarded unchanged.
    """
    # The "zscore_single_aa_ratio", "zscore_single_aa_diff" and
    # "zscore_single_aa_weakest_link" variants are intentionally disabled.
    enabled_methods = (
        "single_codon_ratio",
        "single_codon_diff",
        "single_codon_weakest_link",
        "zscore_bulk_aa_ratio",
        "zscore_bulk_aa_diff",
        "zscore_bulk_aa_weakest_link",
    )
    cub_indices = ("CAI", "tAI")
    ecoli_optimized_options = (True, False)

    # Same iteration order as three nested loops: method, then index, then direction.
    run_configurations = [
        (method, cub_index, ecoli_optimized)
        for method in enabled_methods
        for cub_index in cub_indices
        for ecoli_optimized in ecoli_optimized_options
    ]

    for method, cub_index, ecoli_optimized in run_configurations:
        run_single_method_for_orf_sequence(
            optimization_method=method,
            optimization_cub_index=cub_index,
            is_ecoli_optimized=ecoli_optimized,
            orf_sequence=orf_sequence,
            orf_sequence_file=orf_sequence_file,
            output_path=output_path,
        )

In [7]:
# Directory named "example_data" located next to the notebook's working directory.
base_path = str(Path(os.getcwd()).parent.resolve() / "example_data")

Generating variations for mCherry:

In [8]:
# Run every enabled method/index/direction combination on the original mCherry ORF;
# "mcherry" is passed as the output sub-directory name.
macherry_file_path = os.path.join(base_path, "mCherry_original.fasta")
run_all_methods(
    output_path="mcherry",
    orf_sequence_file=macherry_file_path,
)

##########################
# USER INPUT INFORMATION #
##########################

Information about Escherichia coli:
Organism is optimized
Number of genes: 4310
Found 90 ribosomal proteins in input genome.
CAI will be calculated from a reference set of estimated expression dictionary.
Expression levels were found for 3215
Calculate CAI weights from a reference set of 964 highly expressed genes from estimated expression dictionary.
name=Escherichia coli, cai_std=0.08137084728004947, cai_avg=0.6553953430191558

Information about Bacillus subtilis:
Organism is deoptimized
Number of genes: 4311
Found 98 ribosomal proteins in input genome.
CAI will be calculated from a reference set of estimated expression dictionary.
Expression levels were found for 1863
Calculate CAI weights from a reference set of 559 highly expressed genes from estimated expression dictionary.
name=Bacillus subtilis, cai_std=0.039239181903582866, cai_avg=0.7451526432571732
Escherichia coli has weight of 1.0
Bacillus su

In [9]:
def generate_sequences_fasta_file(root_dir, output_name: str = "mcherry_variants") -> None:
    """Collect the final sequence of every run under ``root_dir`` into one FASTA file.

    Walks ``root_dir`` recursively; for each directory containing a
    ``run_summary.json`` file, reads ``["evaluation"]["final_sequence"]`` from
    it and records it under a name derived from that directory's name. All
    collected records are then written with ``write_fasta`` to
    ``<root_dir>/<output_name>``.

    Args:
        root_dir: Directory tree to scan for run summaries.
        output_name: Name passed to ``write_fasta`` (joined onto ``root_dir``).
            Defaults to ``"mcherry_variants"``, the previously hard-coded value.
    """
    summary_filename = "run_summary.json"

    sequences = []
    sequences_names = []
    for current_dir, subdirs, files in os.walk(root_dir):
        # Sorting in place makes os.walk descend in a stable order, so the
        # record order in the FASTA file does not depend on the filesystem.
        subdirs.sort()

        if summary_filename not in files:
            continue

        summary_path = os.path.join(current_dir, summary_filename)
        with open(summary_path, "r") as summary_file:
            results_json = json.load(summary_file)

        sequences.append(results_json["evaluation"]["final_sequence"])
        # The last five characters of the directory name are dropped -
        # presumably a fixed-length run suffix; TODO confirm against the
        # code that names the run directories.
        sequences_names.append(Path(current_dir).name[:-5])

    write_fasta(os.path.join(root_dir, output_name), sequences, sequences_names)

In [10]:
# Use the same relative "results/mcherry" location that the mCherry runs are
# given as their output path, instead of a machine-specific absolute path.
# Assumes the kernel's working directory is the notebook's own directory.
generate_sequences_fasta_file(os.path.join("results", "mcherry"))