In [3]:
import os

# Dask Configuration

In [2]:
from dask_jobqueue import PBSCluster
from pathlib import Path

# Worker jobs `cd` into the directory the notebook was started from.
working_directory = str(Path.cwd())

# Per-job settings handed to dask-jobqueue when it builds the PBS script.
pbs_cluster_options = dict(
    cores=4,
    memory="8GB",
    processes=1,
    queue="tamirQ",
    walltime="01:30:00",
    # FIXME - this is the port we need to define...
    scheduler_options={"dashboard_address": ":12435"},
    # Additional custom options
    log_directory="dask-logs",
    # worker_extra_args=["--lifetime", "25m", "--lifetime-stagger", "4m"],  # for walltime="00:30:00"
    job_script_prologue=[f"cd {working_directory}"],
)

# Launch a scheduler and workers on HPC via PBS
cluster = PBSCluster(**pbs_cluster_options)

In [3]:
# Display the cluster summary (dashboard URL, worker/thread/memory totals).
cluster

0,1
Dashboard: http://132.66.112.146:12435/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://132.66.112.146:39144,Workers: 0
Dashboard: http://132.66.112.146:12435/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [4]:
# Turn on adaptive scaling with bounds minimum=30 / maximum=60.
# NOTE(review): confirm these bounds fit the queue's per-user job limits.
cluster.adapt(minimum=30, maximum=60)
# Print the PBS submission script generated for a worker job, for inspection.
print(cluster.job_script())

#!/usr/bin/env bash

#PBS -N dask-worker
#PBS -q tamirQ
#PBS -l select=1:ncpus=4:mem=7630MB
#PBS -l walltime=01:30:00
#PBS -e dask-logs/
#PBS -o dask-logs/
cd /tamir2/moranb/microbiome/Igem_TAU_2021
/tamir2/moranb/microbiome/Igem_TAU_2021/venv/bin/python -m distributed.cli.dask_worker tcp://132.66.112.146:39144 --nthreads 4 --memory-limit 7.45GiB --name dummy-name --nanny --death-timeout 60



In [5]:
from dask.distributed import Client, progress, wait, get_client, get_worker
# Connect a distributed client to the scheduler owned by `cluster`.
client = Client(cluster)

In [6]:
# Display the client summary (connection method, dashboard URL, workers).
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: http://132.66.112.146:12435/status,

0,1
Dashboard: http://132.66.112.146:12435/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://132.66.112.146:39144,Workers: 0
Dashboard: http://132.66.112.146:12435/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [6]:
import dask.bag as db
import dask.dataframe as dd
from dask import delayed, compute, persist
import json
import matplotlib
import numpy as np
import pandas as pd

# Analysis for endogenous genes

In [7]:
from Bio import SeqIO
from analysis.orf_model_analysis.input_testing_data.generate_input_testing_data_for_modules import generate_testing_data
from modules.main import run_modules

In [8]:
# Root directory under which per-configuration result folders are placed.
# NOTE(review): absolute, machine-specific path; consider deriving it from the
# project directory so the notebook runs from other checkouts.
output_path = "/tamir2/moranb/microbiome/Igem_TAU_2021/analysis/results/endogenous_genes"

## Paenibacillus-prosopidis

In [9]:
organism = "Paenibacillus-prosopidis"
fasta_file_path = "/tamir2/moranb/microbiome/Igem_TAU_2021/analysis/example_data/arabidopsis_microbiome/Paenibacillus-prosopidis.fasta"

# Parse the FASTA file and index its records by record id.
with open(fasta_file_path, "r") as fasta_handle:
    fasta_records = SeqIO.parse(fasta_handle, "fasta")
    # The parser is lazy, so the dict must be built while the handle is open.
    genome_dict = SeqIO.to_dict(fasta_records)


### Two wanted hosts and two unwanted hosts

In [10]:
optimization_cub_index = "CAI"
optimization_method = "zscore_bulk_aa_diff"

wanted_hosts = ["Paenibacillus-prosopidis.gb", "Arthrobacter-luteolus.gb"]
unwanted_hosts = ["Yonghaparkia-alkaliphila.gb", "Brevibacterium-frigoritolerans.gb"]

# Host labels are the file names with the trailing 3 characters (".gb") removed.
wanted_label = "_".join(host[:-3] for host in wanted_hosts)
unwanted_label = "_".join(host[:-3] for host in unwanted_hosts)

# Results for this host combination live under <output_path>/<configuration>/<organism>.
configuration = f"wanted_{wanted_label}_unwanted_{unwanted_label}"
configuration_output_path = os.path.join(output_path, configuration, organism)

In [23]:
# Build one (gene_name, module_input) pair per record of the parsed genome.
inputs = []
for gene_name, orf_sequence in genome_dict.items():
    gene_input = generate_testing_data(
        optimization_method=optimization_method,
        optimization_cub_index=optimization_cub_index,
        wanted_hosts=wanted_hosts,
        unwanted_hosts=unwanted_hosts,
        tuning_param=0.5,
        sequence=str(orf_sequence.seq),
        # Each gene gets its own output sub-directory.
        output_path=os.path.join(configuration_output_path, gene_name),
    )
    inputs.append((gene_name, gene_input))

In [24]:
# pandas view of the (gene_name, module_input) pairs; not used by the cells shown here.
inputs_series = pd.Series(inputs)
# Dask bag over the same pairs, so per-gene work can be mapped across the cluster.
inputs_db = db.from_sequence(inputs)

An example of running the code on a few inputs and saving the results to a file:

In [14]:
# %%time
# partial_inputs = inputs_db.take(10)
# results_db = db.from_sequence(partial_inputs).map(lambda x: {x[0]: run_modules(x[1], should_run_output_module=False)})
# results_db.map(json.dumps).to_textfiles(os.path.join(configuration_output_path, 'results/*.json')) 

An example of reading and plotting the results:

In [15]:
# df = dd.read_json([F"{configuration_output_path}/results/{i}.json" for i in range(3)], orient='index')
# df[["weakest_link_score", "average_distance_score"]].compute().hist()

In [None]:
%%time
results_db = inputs_db.map(lambda x: {x[0]: run_modules(x[1], should_run_output_module=False)})
results_db.map(json.dumps).to_textfiles(os.path.join(configuration_output_path, 'results/*.json')) 

Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, ini

In [1]:
def parse_res(file_path):
    """Rewrite a JSON-lines file in place as a single-line JSON array.

    Every non-blank line of ``file_path`` is treated as one record. The file
    is overwritten with ``[rec0,rec1,...,recN]``: records joined by commas,
    with no trailing newline.

    Parameters
    ----------
    file_path : str or path-like
        File to convert. It is read completely and then overwritten.

    Notes
    -----
    * A file holding a single record becomes ``[rec]`` and a file without
      records becomes ``[]``, so the result is always a closed array.
    * Blank lines are skipped rather than turned into empty array elements.
    * The conversion is NOT idempotent: a converted file is a single line, so
      calling this again wraps it a second time (``[[...]]``). Run it once
      per file.
    """
    with open(file_path, "r") as res:
        # Drop only the line terminator; keep the record text itself untouched.
        records = [line.strip("\n") for line in res if line.strip()]

    with open(file_path, "w") as res:
        res.write("[" + ",".join(records) + "]")

In [11]:
# Convert every *.json file of this configuration's results directory in place.
results_dir = os.path.join(configuration_output_path, 'results')

result_files = []
for entry in os.listdir(results_dir):
    result_files.append(F"{results_dir}/{entry}")

for file in result_files:
    if not file.endswith(".json"):
        continue
    parse_res(file)

In [19]:
# results_dir = F"{configuration_output_path}/results/"
# df = dd.read_json([F"{results_dir}/{file}" for file in os.listdir(results_dir)], orient='records', lines=True)
# df[["weakest_link_score", "average_distance_score"]].compute().hist()

In [13]:
# Same in-place conversion, applied to the zscore_bulk_aa_ratio results directory.
results_dir = "analysis/results/endogenous_genes/wanted_Paenibacillus-prosopidis_Arthrobacter-luteolus_unwanted_Yonghaparkia-alkaliphila_Brevibacterium-frigoritolerans/Paenibacillus-prosopidis/zscore_bulk_aa_ratio/"

result_files = []
for entry in os.listdir(results_dir):
    result_files.append(F"{results_dir}/{entry}")

for file in result_files:
    if not file.endswith(".json"):
        continue
    parse_res(file)

In [None]:
def fix_res(file_path):
    """Join the lines of a result file with commas and close it with ``]``.

    The file is rewritten in place as ``line0,line1,...,lineN]``, with the
    line terminators removed. No opening ``[`` is added.
    NOTE(review): presumably the first line already starts with ``[``;
    confirm against the files this is meant to repair.

    Parameters
    ----------
    file_path : str or path-like
        File to repair. It is read completely and then overwritten.

    Notes
    -----
    * The new content is built before the file is reopened for writing, so a
      failure while preparing it cannot leave the file truncated.
    * An empty file is left untouched.
    * The earlier draft called ``str.replace(",\\n,", ",")`` on one line and
      discarded the result. A line with its terminator stripped cannot
      contain that pattern, so the call is omitted here.
    """
    with open(file_path, "r") as res:
        content = res.readlines()

    if not content:
        return

    cleaned_lines = [line.strip("\n") for line in content]
    repaired = ",".join(cleaned_lines) + "]"

    with open(file_path, "w") as res:
        res.write(repaired)