In [1]:
import os
import sys
from pathlib import Path
import urllib.request

In [2]:
sys.path.append(str(Path(os.path.dirname(os.getcwd())).parent))
from analysis.input_testing_data.generate_input_testing_data_for_modules import generate_testing_data
from modules.main import run_modules

## Dask Configuration

In [3]:
from dask_jobqueue import PBSCluster
from pathlib import Path

# Define the working directory path
working_directory = str(Path.cwd())

# Launch a scheduler and workers on HPC via PBS
cluster = PBSCluster(
     cores=4,
     memory="8GB",
     processes=1,
     queue="tamirQ",
     walltime="05:30:00",
     scheduler_options={"dashboard_address": ":12435"},
     # Additional custom options
     log_directory="dask-logs",
     #worker_extra_args=["--lifetime", "25m", "--lifetime-stagger", "4m"],  # for walltime="00:30:00"
     job_script_prologue=[f"cd {working_directory}"]
)

Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
State start
  Scheduler at: tcp://132.66.112.146:44039
  dashboard at:  http://132.66.112.146:12435/status


In [4]:
cluster.adapt(minimum=30, maximum=60)
print(cluster.job_script())

Adaptive scaling started: minimum=30 maximum=60
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB


#!/usr/bin/env bash

#PBS -N dask-worker
#PBS -q tamirQ
#PBS -l select=1:ncpus=4:mem=7630MB
#PBS -l walltime=05:30:00
#PBS -e dask-logs/
#PBS -o dask-logs/
cd /tamir2/moranb/microbiome/Igem_TAU_2021/analysis/orf_model_analysis
/tamir2/moranb/microbiome/Igem_TAU_2021/venv/bin/python -m distributed.cli.dask_worker tcp://132.66.112.146:44039 --nthreads 4 --memory-limit 7.45GiB --name dummy-name --nanny --death-timeout 60



In [5]:
from dask.distributed import Client, progress, wait, get_client, get_worker
client = Client(cluster)

Receive client connection: Client-f54935cc-37c7-11f0-93fd-b4969140d316
Starting established connection to tcp://132.66.112.146:49700


In [6]:
import dask.bag as db
import dask.dataframe as dd
from dask import delayed, compute, persist
import json
from collections import defaultdict
import matplotlib
import numpy as np
import pandas as pd
import re

## Download genomes

In [7]:
from Bio import Entrez

In [8]:
# Configure your email for NCBI Entrez
Entrez.email = "bentulila@mail.tau.ac.il"

def download_refseq_genome(organism_name, output_dir="downloads"):
    print(f"Searching for RefSeq genome for: {organism_name}")

    # Search the assembly database
    search_handle = Entrez.esearch(
        db="assembly",
        term=f"{organism_name}[Organism] AND reference_genome[filter]",
        retmax=1,
        sort="relevance"
    )
    search_results = Entrez.read(search_handle)
    search_handle.close()

    if not search_results["IdList"]:
        print("No reference genome found.")
        return

    assembly_id = search_results["IdList"][0]

    # Fetch summary to get FTP path
    summary_handle = Entrez.esummary(db="assembly", id=assembly_id, report="full")
    summary = Entrez.read(summary_handle)
    summary_handle.close()

    ftp_path = summary['DocumentSummarySet']['DocumentSummary'][0]['FtpPath_RefSeq']
    if not ftp_path:
        print("No RefSeq FTP path found.")
        return

    print("Found RefSeq FTP path:")
    print(ftp_path)

    # Get base name of directory
    base_name = os.path.basename(ftp_path)

    # Files to download (Genomic sequence and annotation)
    files_to_download = [
        # f"{base_name}_genomic.fna.gz",  # DNA sequence
        # f"{base_name}_genomic.gff.gz",  # Annotations (GFF)
        f"{base_name}_genomic.gbff.gz"  # Annotations + sequences (GenBank)
    ]

    os.makedirs(output_dir, exist_ok=True)

    for file_name in files_to_download:
        file_url = f"{ftp_path}/{file_name}"
        out_path = os.path.join(output_dir, organism_name)
        print(f"Downloading: {file_url}")
        try:
            urllib.request.urlretrieve(file_url, f"{out_path}.gbff.gz")
            print(f"Saved to: {out_path}")
        except Exception as e:
            print(f"Failed to download {file_name}: {e}")

In [9]:
# base_output_dir = "/Users/shimka/Documents/Moran's Thesis/arabidopsis_data/genomes"
# genomes_gb_dir_path = os.path.join(base_output_dir, "genomes_gb")
# output_dir = os.path.join(base_output_dir, "refseq_genomes")

# genomes_gb_directory = Path(genomes_gb_dir_path)
# organisms_list = [str(f)[:-5].split("/")[-1].strip() for f in genomes_gb_directory.iterdir() if f.is_file()]
# organisms_list = [o.replace("_", " ") for o in organisms_list if not o.startswith(".")]

In [10]:
# for o in organisms_list:
#     download_refseq_genome(o, output_dir=output_dir)

In [11]:
# import gzip
# import shutil

# def decompress_gz(input_path, output_dir):
#     input_path = Path(input_path)
#     output_dir = Path(output_dir)
#     output_dir.mkdir(parents=True, exist_ok=True)

#     if not input_path.suffix == '.gz':
#         raise ValueError("Input file must have a .gz extension")

#     output_path = output_dir / input_path.with_suffix('').name

#     with gzip.open(input_path, 'rb') as f_in, open(output_path, 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)

#     print(f"Decompressed to: {output_path}")

# arabidopsis_gbff_dir = "/Users/shimka/PycharmProjects/Igem_TAU_2021/analysis/example_data/arabidopsis_microbiome"
# for organism_file in Path(output_dir).iterdir():
#     if ".DS_Store" in str(organism_file):
#         continue
#     try:
#         decompress_gz(organism_file, arabidopsis_gbff_dir)
#     except Exception as e:
#         print("*****************")
#         print(organism_file)
#         print(e)
#         print("*****************")

## Arabidopsis analysis

In [3]:
data_directory_path = os.path.join(Path(os.getcwd()).parent.resolve(), "example_data")
genomes_gb_path = os.path.join(data_directory_path, "arabidopsis_microbiome")
zorA_file_path = os.path.join(data_directory_path, "zorA_anti_phage_defense.fasta")

In [4]:
genomes_gb_directory = Path(genomes_gb_path)
organisms_list = [str(f) for f in genomes_gb_directory.iterdir() if f.is_file()]
len(organisms_list)

34

In [5]:
run_configurations = []
for wanted_organism in organisms_list:
    for unwanted_organism in organisms_list:
        if wanted_organism == unwanted_organism:
            continue
        for optimization_method in [
            "single_codon_diff", 
            "single_codon_ratio",
            "zscore_bulk_aa_diff",
            "zscore_bulk_aa_ratio",
        ]:
            run_configurations.append((wanted_organism, unwanted_organism, optimization_method)) 

inputs = [(
    run_configration, 
    generate_testing_data(
        orf_optimization_method = run_configration[2],
        orf_optimization_cub_index = "CAI",
        wanted_hosts = [run_configration[0]],
        unwanted_hosts = [run_configration[1]], 
        genome_path = genomes_gb_path,
        sequence_file_path = zorA_file_path,
        output_path = os.path.join("arabidopsis","pairwise"),
    )) for run_configration in run_configurations]

In [15]:
def run_modules_for_configuration(input):
    # configuration = input[0]
    # user_input = input[1]
    # base_output_path = user_input.get("output_path")
    # configuration_string = f"wanted_{configuration[0]}_unwanted_{configuration[1]}_{configuration[2]}"
    # new_output_path = os.path.join(base_output_path, configuration_string)
    # user_input["output_path"] = new_output_path
    result = run_modules(user_input, should_run_output_module=False)
    return result
    

base_output_path = "/tamir2/moranb/microbiome/Igem_TAU_2021/analysis/results/arabidopsis"
output_path = os.path.join(base_output_path, "pairwise")
inputs_db = db.from_sequence(inputs)
results_db = inputs_db.map(run_modules_for_configuration)

Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB


In [16]:
# results_db.map(json.dumps).to_textfiles(os.path.join(output_path, '*.json'))

Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB


In [6]:
base_output_path = "/tamir2/moranb/microbiome/Igem_TAU_2021/analysis/results/arabidopsis/pairwise-local"
path = Path(base_output_path)
path.mkdir(parents=True, exist_ok=True) 

for first_organism in organisms_list:
    for second_organism in organisms_list:
        if first_organism == second_organism:
            continue
        for optimization_method in [
            "single_codon_diff", 
            "single_codon_ratio",
            "zscore_bulk_aa_diff",
            "zscore_bulk_aa_ratio",
        ]:
            first_organism_name = first_organism.split("/")[-1][:-5].replace(" ", "_").lower()
            second_organism_name = second_organism.split("/")[-1][:-5].replace(" ", "_").lower()
            configuration_string = f"wanted_{first_organism_name}_unwanted_{second_organism_name}_{optimization_method}"
            result_file_path = os.path.join(base_output_path, configuration_string)
            if Path(result_file_path).exists():
                print(f"skipping file: {configuration_string}") 
                continue
            input_dict = generate_testing_data(
                orf_optimization_method = optimization_method,
                orf_optimization_cub_index = "CAI",
                wanted_hosts = [first_organism],
                unwanted_hosts = [second_organism], 
                genome_path = genomes_gb_path,
                sequence_file_path = zorA_file_path,
                output_path = os.path.join("arabidopsis", "pairwise", "local"),
            )
            result = run_modules(input_dict, should_run_output_module=False)
            with open(result_file_path, "w") as f:      
                json.dump(result, f, indent=4) 

skipping file: wanted_agromyces_allii_unwanted_arthrobacter_ginsengisoli_single_codon_diff
skipping file: wanted_agromyces_allii_unwanted_arthrobacter_ginsengisoli_single_codon_ratio
skipping file: wanted_agromyces_allii_unwanted_arthrobacter_ginsengisoli_zscore_bulk_aa_diff
skipping file: wanted_agromyces_allii_unwanted_arthrobacter_ginsengisoli_zscore_bulk_aa_ratio
skipping file: wanted_agromyces_allii_unwanted_arthrobacter_luteolus_single_codon_diff
skipping file: wanted_agromyces_allii_unwanted_arthrobacter_luteolus_single_codon_ratio
skipping file: wanted_agromyces_allii_unwanted_arthrobacter_luteolus_zscore_bulk_aa_diff
skipping file: wanted_agromyces_allii_unwanted_arthrobacter_luteolus_zscore_bulk_aa_ratio
skipping file: wanted_agromyces_allii_unwanted_arthrobacter_parietis_single_codon_diff
skipping file: wanted_agromyces_allii_unwanted_arthrobacter_parietis_single_codon_ratio
skipping file: wanted_agromyces_allii_unwanted_arthrobacter_parietis_zscore_bulk_aa_diff
skipping fil


##########################
# User Input #
##########################
Sequence to be optimized given in the following file: /tamir2/moranb/microbiome/Igem_TAU_2021/analysis/example_data/zorA_anti_phage_defense.fasta
Open reading frame sequence for optimization is:
ATGACCGATAGCCTTAATCTTTCCAGTTTATGGCCAGACTTGAGCAGTCTAAATGTAATGCCACCTCAAACTCCTGAGCAATTGAGTGCTTTGTTTGTGGTTATTTTATGGGGTGTCGCTCTATTTTTCTTAATTTGGTCAGTTGCTGCATTTATTCGCGCAAGACAACGAGTAGTTTGGCTTAACAAATCTTTGGATGATGCGGAGAAATCGACTCTGTCAGGTGTTCGAAATGATTTAATTTCTAGAGCTGAACAAAAAAAAGATTCAGTAGGGCATTTATGGCTCGAGTTTGATGAAACATTGCTTGAAGTAAAAGGTGCTGATGACGTTGTTCGATTGCATAACACATTTGATGCTGACTATTTTTTTAATAGTTCTAGTTTAGCTGGTGGTAGTACTGAAAACAGAATGATTGCGGTGGTTCCAGGCTTTTTAACGGCTCTGGGTGTTATTGGTACTTTTGTCGGTTTACAGCTAGGACTTTCAGATCTCAATATCGCGGGCAACGTTGATGTCAATGAAATGAAAAATGGCGTTGCTGGAGTTATAAATGGTGCAAAAATCGCATTTATGACATCGGTTTGGGGCGTGCTACTCAGTGTTGCATTTAATTTCATTGAGAAGATACTTGAGCAAATTATTAGAAAGAAAATTAAGTCTTTGCAAAACCGTATTGACAGGATGTTCCCTCGATTAAGTGCGGAATATCAACTGCAATCTATTGCAAATAAC

skipping file: wanted_pedococcus_dokdonensis_unwanted_paenibacillus_oryzisoli_single_codon_diff
skipping file: wanted_pedococcus_dokdonensis_unwanted_paenibacillus_oryzisoli_single_codon_ratio
skipping file: wanted_pedococcus_dokdonensis_unwanted_paenibacillus_oryzisoli_zscore_bulk_aa_diff
skipping file: wanted_pedococcus_dokdonensis_unwanted_paenibacillus_oryzisoli_zscore_bulk_aa_ratio
skipping file: wanted_pedococcus_dokdonensis_unwanted_pedococcus_aerophilus_single_codon_diff
skipping file: wanted_pedococcus_dokdonensis_unwanted_pedococcus_aerophilus_single_codon_ratio
skipping file: wanted_pedococcus_dokdonensis_unwanted_pedococcus_aerophilus_zscore_bulk_aa_diff
skipping file: wanted_pedococcus_dokdonensis_unwanted_pedococcus_aerophilus_zscore_bulk_aa_ratio
skipping file: wanted_pedococcus_dokdonensis_unwanted_pedococcus_bigeumensis_single_codon_diff
skipping file: wanted_pedococcus_dokdonensis_unwanted_pedococcus_bigeumensis_single_codon_ratio
skipping file: wanted_pedococcus_dokd

------------------------------------------
Parsing information for Pseudarthrobacter phenanthrenivorans:
------------------------------------------
Organism is defined as wanted
Found 65 ribosomal proteins in input genome.
Estimated expression dictionary does not have enough expression levels. CAI will be calculated from a reference set of ribosomal proteins or the entire genome.
CAI will be calculated from a reference set of ribosomal proteins.


{'genome_path': '/tamir2/moranb/microbiome/Igem_TAU_2021/analysis/example_data/arabidopsis_microbiome/Pseudarthrobacter phenanthrenivorans.gbff', 'optimized': True, 'expression_csv': None, 'optimization_priority': 50}


cai_std=0.09675833361567142, cai_avg=0.5479525972134006
------------------------------------------
Parsing information for Arthrobacter pascens:
------------------------------------------
Organism is defined as unwanted
Found 68 ribosomal proteins in input genome.
Estimated expression dictionary does not have enough expression levels. CAI will be calculated from a reference set of ribosomal proteins or the entire genome.
CAI will be calculated from a reference set of ribosomal proteins.


{'genome_path': '/tamir2/moranb/microbiome/Igem_TAU_2021/analysis/example_data/arabidopsis_microbiome/Arthrobacter pascens.gbff', 'optimized': False, 'expression_csv': None, 'optimization_priority': 50}


cai_std=0.08927798179680937, cai_avg=0.5974081071241611
-----------------------------
Normalized Prioritization Weights
-----------------------------
Pseudarthrobacter phenanthrenivorans : 1.0
Arthrobacter pascens : 1.0
Total input processing time: 9.464453935623169

##########################
# Initiation #
##########################
Running initiation optimization InitiationOptimizationMethod.original on 15 codons at the start of the ORF
Taking 15 codons from the start of the original ORF sequence:
Optimized sequence is: ATGACCGATAGCCTTAATCTTTCCAGTTTATGGCCAGACTTGAGCAGTCTAAATGTAATGCCACCTCAAACTCCTGAGCAATTGAGTGCTTTGTTTGTGGTTATTTTATGGGGTGTCGCTCTATTTTTCTTAATTTGGTCAGTTGCTGCATTTATTCGCGCAAGACAACGAGTAGTTTGGCTTAACAAATCTTTGGATGATGCGGAGAAATCGACTCTGTCAGGTGTTCGAAATGATTTAATTTCTAGAGCTGAACAAAAAAAAGATTCAGTAGGGCATTTATGGCTCGAGTTTGATGAAACATTGCTTGAAGTAAAAGGTGCTGATGACGTTGTTCGATTGCATAACACATTTGATGCTGACTATTTTTTTAATAGTTCTAGTTTAGCTGGTGGTAGTACTGAAAACAGAATGATTGCGGTGGTTCCAGGCTTTTTAACGGCTCTGGGTGTTATTGGTACTTTTGTCGGT

NameError: name 'json' is not defined

### Liyam's Legacy Analysis code 

In [None]:
final_tested_org = not_really_small_genome
# func_options = ['single_codon_global', 'single_codon_local', 'zscore_hill_climbing_average', 'zscore_hill_climbing_weakest_link']
func_options = ['single_codon_global', 'zscore_hill_climbing_average']

spearman_dict = {}
for translation_function in func_options:
    opt_scores = []
    msa_scores = []

    tic = time.time()
    for org1 in final_tested_org:
        for org2 in final_tested_org:
            # if org1.split(' ')[0] == org2.split(' ')[0]:
            #     continue
            if org1 == org2:
                continue
            org_dict[org1]['optimized'] = True
            org_dict[org1]['tai_profile'] = {}
            org_dict[org1]['tai_std'] = {}
            org_dict[org1]['tai_avg'] = {}
            org_dict[org2]['optimized'] = False
            org_dict[org2]['tai_profile'] = {}
            org_dict[org2]['tai_std'] = {}
            org_dict[org2]['tai_avg'] = {}
            software_dict = {
                'sequence': cds,
                'tuning_param': 0.5,
                'organisms': {},
                }
            software_dict['organisms'][org1] = org_dict[org1]
            software_dict['organisms'][org2] = org_dict[org2]
            inner_tic = time.time()
            final_cds, optimization_index, weakest_score = run_orf_optimization(software_dict)
            print('TIME: ', time.time()-inner_tic)
            # alignment_score = pairwise2.align.globalxx(
            #     ribosomal_dict[org1], ribosomal_dict[org2], score_only=True)
            # aln_scores.append(alignment_score)
            msa_score = diff_letters(ribosomal_msa_dict[org1], ribosomal_msa_dict[org2])
            msa_scores.append(msa_score)
            opt_scores.append(optimization_index)
    spearman_dict[translation_function] = spearmanr(msa_scores, opt_scores, nan_policy='omit')
    print(spearmanr(msa_scores, opt_scores, nan_policy='omit'))
    trans_func_name = translation_function.replace('_', ' ')
    plt.scatter(msa_scores, opt_scores, s=0.1)

            # print(org1, org2, msa_scores, optimization_index)
    toc = time.time()
    print(toc-tic)

print(spearman_dict)
plt.legend(['single codon optimization', 'hill climbing optimization'], loc ="upper right")
plt.title(f'Evolutionary distance and model performance')
plt.xlabel('# Different aligned characters')
plt.ylabel('Optimization score')
plt.show()
