In [22]:
import os

# Dask Configuration

In [23]:
from dask_jobqueue import PBSCluster
from pathlib import Path

# Define the working directory path
working_directory = str(Path.cwd())

# Launch a scheduler and workers on HPC via PBS
cluster = PBSCluster(
     cores=4,
     memory="8GB",
     processes=1,
     queue="tamirQ",
     walltime="02:30:00",
     scheduler_options={"dashboard_address": ":12435"},  # FIXME - this is the port we need to define...
     # Additional custom options
     log_directory="dask-logs",
     #worker_extra_args=["--lifetime", "25m", "--lifetime-stagger", "4m"],  # for walltime="00:30:00"
     job_script_prologue=[f"cd {working_directory}"]
)

Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Perhaps you already have a cluster running?
Hosting the HTTP server on port 34927 instead


In [24]:
cluster

0,1
Dashboard: http://132.66.112.146:34927/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://132.66.112.146:45986,Workers: 0
Dashboard: http://132.66.112.146:34927/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [25]:
cluster.adapt(minimum=30, maximum=60)
print(cluster.job_script())

Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB


#!/usr/bin/env bash

#PBS -N dask-worker
#PBS -q tamirQ
#PBS -l select=1:ncpus=4:mem=7630MB
#PBS -l walltime=02:30:00
#PBS -e dask-logs/
#PBS -o dask-logs/
cd /tamir2/moranb/microbiome/Igem_TAU_2021
/tamir2/moranb/microbiome/Igem_TAU_2021/venv/bin/python -m distributed.cli.dask_worker tcp://132.66.112.146:45986 --nthreads 4 --memory-limit 7.45GiB --name dummy-name --nanny --death-timeout 60



In [26]:
from dask.distributed import Client, progress, wait, get_client, get_worker
client = Client(cluster)

In [27]:
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: http://132.66.112.146:34927/status,

0,1
Dashboard: http://132.66.112.146:34927/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://132.66.112.146:45986,Workers: 0
Dashboard: http://132.66.112.146:34927/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [28]:
import dask.bag as db
import dask.dataframe as dd
from dask import delayed, compute, persist
import json
import matplotlib
import numpy as np
import pandas as pd

# Analysis for endogenous genes

In [29]:
from Bio import SeqIO
from analysis.orf_model_analysis.input_testing_data.generate_input_testing_data_for_modules import generate_testing_data
from analysis.orf_model_analysis.input_testing_data.generate_input_testing_data_for_modules import generate_testing_data_for_ecoli_and_bacillus

from modules.main import run_modules

In [30]:
output_path = "/tamir2/moranb/microbiome/Igem_TAU_2021/analysis/results/endogenous_genes"

In [31]:
def convert_to_json_result(x):
    gene_name = x[0]
    result = run_modules(x[1], should_run_output_module=False)
    return {
        "initial_optimization_score": result["orf"].get("initial_sequence_optimization_score"),
        "final_optimization_score": result["orf"].get("final_sequence_optimization_score"),
        "average_distance_score": result["final_evaluation"].get("average_distance_score"),
        "average_distance_non_normalized_score": result["final_evaluation"].get("average_distance_non_normalized_score"),
        "weakest_link_score": result["final_evaluation"].get("weakest_link_score"),
        "ratio_score": result["final_evaluation"].get("ratio_score"),
        "gene_name": gene_name,
    }

In [32]:
def test_partial(x):
    gene_name = x[0]
    result = run_modules(x[1], should_run_output_module=False)
    return result

## Bacillus and E.coli

In [41]:
organism = "Escherichia-coli"
optimization_cub_index = "CAI"
is_ecoli_optimized = True

# organism = "Bacillus-subtilis"
fasta_file_path = f"/tamir2/moranb/microbiome/Igem_TAU_2021/analysis/example_data/{organism}.fasta"
with open(fasta_file_path, "r") as fasta_handle:
    genome_dict = SeqIO.to_dict(SeqIO.parse(fasta_handle, "fasta"))

for optimization_method in [
    "single_codon_diff", 
    "single_codon_ratio", 
    "zscore_bulk_aa_diff",
    "zscore_single_aa_diff",
    "zscore_bulk_aa_ratio",
    "zscore_single_aa_ratio",
]:
# optimization_method = "single_codon_ratio"
    configuration = f"e_coli_optimized_{is_ecoli_optimized}_bacillus_optimized_{not is_ecoli_optimized}"
    configuration_output_path = os.path.join(output_path, configuration, organism)
    
    inputs = [(gene_name, 
               generate_testing_data_for_ecoli_and_bacillus(
                   optimization_method=optimization_method,
                   optimization_cub_index=optimization_cub_index,
                   is_ecoli_optimized=is_ecoli_optimized,
                   tuning_param=0.5, 
                   sequence=str(orf_sequence.seq),
                   output_path=os.path.join(configuration_output_path,gene_name)),
                   evaluation_score="weakest_link",
              ) for gene_name, orf_sequence in genome_dict.items()]

    inputs = inputs[:100]
    inputs_series = pd.Series(inputs)
    inputs_db = db.from_sequence(inputs)

    results_db = inputs_db.map(convert_to_json_result)
    results_db.map(json.dumps).to_textfiles(os.path.join(configuration_output_path, 'results/*.json'))

    results_dir = F"{configuration_output_path}/{optimization_method}-debug/"
    os.rename(F"{configuration_output_path}/results", results_dir)

Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, ini

FileExistsError: [Errno 17] File exists: '/tamir2/moranb/microbiome/Igem_TAU_2021/analysis/results/endogenous_genes/e_coli_optimized_True_bacillus_optimized_False/Bacillus-subtilis/results' -> '/tamir2/moranb/microbiome/Igem_TAU_2021/analysis/results/endogenous_genes/e_coli_optimized_True_bacillus_optimized_False/Bacillus-subtilis/single_codon_ratio-debug/'

Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
