In [30]:
import os
import typing

# Dask Configuration

In [3]:
from dask_jobqueue import PBSCluster
from pathlib import Path

# Directory the notebook was started from; every worker job changes into it
# before launching (see `job_script_prologue` below).
working_directory = str(Path.cwd())

# Start the Dask scheduler and describe the PBS job used for each worker.
cluster = PBSCluster(
    # PBS submission settings.
    queue="tamirQ",
    walltime="02:30:00",
    # Resources requested per worker job.
    cores=4,
    memory="8GB",
    processes=1,
    # FIXME: the dashboard port is hard-coded here and still has to be chosen explicitly.
    scheduler_options={"dashboard_address": ":12435"},
    # Additional custom options.
    log_directory="dask-logs",
    # Alternative kept for reference when running with walltime="00:30:00":
    # worker_extra_args=["--lifetime", "25m", "--lifetime-stagger", "4m"],
    job_script_prologue=[f"cd {working_directory}"],
)

In [4]:
cluster

0,1
Dashboard: http://132.66.112.146:12435/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://132.66.112.146:44941,Workers: 0
Dashboard: http://132.66.112.146:12435/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [5]:
# Turn on adaptive scaling, bounded by minimum=30 and maximum=60
# (presumably counted in workers rather than PBS jobs — confirm against the Dask adapt API).
cluster.adapt(minimum=30, maximum=60)
# Print the submission script that will be used for each worker job, as a sanity check.
print(cluster.job_script())

#!/usr/bin/env bash

#PBS -N dask-worker
#PBS -q tamirQ
#PBS -l select=1:ncpus=4:mem=7630MB
#PBS -l walltime=02:30:00
#PBS -e dask-logs/
#PBS -o dask-logs/
cd /tamir2/moranb/microbiome/Igem_TAU_2021
/tamir2/moranb/microbiome/Igem_TAU_2021/venv/bin/python -m distributed.cli.dask_worker tcp://132.66.112.146:44941 --nthreads 4 --memory-limit 7.45GiB --name dummy-name --nanny --death-timeout 60



In [6]:
from dask.distributed import Client, progress, wait, get_client, get_worker
client = Client(cluster)

In [7]:
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: http://132.66.112.146:12435/status,

0,1
Dashboard: http://132.66.112.146:12435/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://132.66.112.146:44941,Workers: 0
Dashboard: http://132.66.112.146:12435/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [8]:
import dask.bag as db
import dask.dataframe as dd
from dask import delayed, compute, persist
from pandas.core.dtypes.common import pandas_dtype as dtype
import json
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
from pathlib import Path



# Analysis for Homo sapiens genes

In [9]:
import operator
from Bio import SeqIO
from analysis.input_testing_data.generate_input_testing_data_for_modules import generate_testing_data
from analysis.input_testing_data.generate_input_testing_data_for_modules import generate_testing_data_for_ecoli_and_bacillus

from modules.main import run_modules
from modules.user_IO.input_functions import extract_gene_data

In [10]:
# Root directory holding the per-configuration JSON results of the homo sapiens run.
output_path = "/tamir2/moranb/microbiome/Igem_TAU_2021/analysis/results/homo_sapiens"
# Codon-usage-bias index identifier used for the optimization runs.
optimization_cub_index = "CAI"

# Column-name templates, one per optimization variant; "{score}" is filled in
# with a score name. The zscore_single variants are currently disabled.
variant_names = [
    "{score}_single_diff",
    # "{score}_zscore_single_diff",
    "{score}_zscore_bulk_diff",
    "{score}_single_ratio",
    # "{score}_zscore_single_ratio",
    "{score}_zscore_bulk_ratio",
]

# Concrete column names for the "average_distance_score" score.
average_distance_variant_names = [
    name_template.format(score="average_distance_score")
    for name_template in variant_names
]

In [11]:
def get_configuration(is_ecoli_optimized):
    """Build the run-configuration name for the given optimization direction.

    Args:
        is_ecoli_optimized: Truthy when E. coli is the optimized organism; the
            bacillus flag in the name is always the logical negation.

    Returns:
        A string of the form
        ``e_coli_optimized_<flag>_bacillus_optimized_<not flag>``.
    """
    ecoli_flag = is_ecoli_optimized
    bacillus_flag = not is_ecoli_optimized
    return "e_coli_optimized_{}_bacillus_optimized_{}".format(ecoli_flag, bacillus_flag)

In [31]:
def get_orf_summary(
    summary: typing.Dict[str, typing.Any],
    evaluation_method: str = "average_distance_score",
) -> typing.Optional[typing.Dict[str, typing.Any]]:
    """Pick the ORF summary that corresponds to the final evaluation.

    Args:
        summary: Run summary with the keys ``"evaluation"``, ``"orf"`` and
            (when there is more than one evaluation) ``"final_evaluation"``.
        evaluation_method: Score key used to match an evaluation entry against
            the final evaluation.

    Returns:
        ``summary["orf"]`` itself when there is exactly one evaluation.
        Otherwise ``summary["orf"][i]`` for the first evaluation ``i`` whose
        score equals the final evaluation's score, or ``None`` when no
        evaluation matches.

    Raises:
        KeyError: If a required key is missing from ``summary``.
    """
    evaluations = summary["evaluation"]
    if len(evaluations) == 1:
        # Single-evaluation runs store one ORF summary directly, not a list.
        return summary["orf"]

    final_evaluation = summary["final_evaluation"]
    for position, evaluation_summary in enumerate(evaluations):
        if evaluation_summary[evaluation_method] == final_evaluation[evaluation_method]:
            return summary["orf"][position]

    # No evaluation reproduced the final score; callers must handle None.
    return None

def get_total_run_time(summary: typing.Dict[str, typing.Any]) -> float:
    """Return the total ORF run time recorded in a run summary.

    With exactly one evaluation, ``summary["orf"]`` is a single mapping and its
    ``"run_time"`` is returned as is. Otherwise ``summary["orf"]`` is iterated
    and the ``"run_time"`` of every entry is added up, starting from 0.
    """
    if len(summary["evaluation"]) != 1:
        accumulated = 0
        for orf_entry in summary["orf"]:
            accumulated = accumulated + orf_entry["run_time"]
        return accumulated
    return summary["orf"]["run_time"]

In [50]:
def convert_json(result):
    """Return the parsed result record unchanged.

    NOTE(review): the unconditional ``return result`` on the first line
    short-circuits the function, so none of the flattening logic below it
    ever runs. It looks like a debugging bypass — confirm whether the
    flattening should be re-enabled or removed.
    """
    return result
    # ---- Everything below is currently unreachable ----
    orf = get_orf_summary(result)
    # NOTE(review): the `"orf" not in result` test comes after get_orf_summary
    # has already read result["evaluation"] / result["orf"], so a missing key
    # would raise before this guard is reached.
    if "orf" not in result or orf is None: # presumably the record describes a failed run
        return {"error": result}
    run_time = get_total_run_time(result)
    # Falls back to 1 when the ORF summary has no "iterations_count" entry.
    iterations_count = orf.get("iterations_count", 1)
    return {
        "initial_optimization_score": orf.get("initial_sequence_optimization_score"),
        "final_optimization_score": orf.get("final_sequence_optimization_score"),
        "average_distance_score": result["final_evaluation"].get("average_distance_score"),
        "average_distance_non_normalized_score": result["final_evaluation"].get("average_distance_non_normalized_score"),
        "weakest_link_score": result["final_evaluation"].get("weakest_link_score"),
        "ratio_score": result["final_evaluation"].get("ratio_score"),
        # NOTE(review): `gene_name` is not a parameter or local of this function
        # and is not assigned anywhere visible in this notebook; re-enabling
        # this code as is would raise NameError unless a global provides it.
        "gene_name": gene_name,
        "orf_optimization_cub_index": result["module_input"].get("orf_optimization_cub_index"),
        "evaluation_score": result["module_input"].get("evaluation_score"),
        "run_time": run_time,
        "iterations_count": iterations_count,
    }

In [44]:
def json_map(x):
    try:
        result = json.loads(x)
        return convert_json(result)
    except Exception as e:
        return {"error_message": e}

In [40]:
def get_df_for_run_configuration(configuration, optimization_method, is_debug=False):
    """Load all JSON result records for one configuration / optimization method.

    Every ``*.json`` file found recursively under
    ``<output_path>/<configuration>/<optimization_method>[-debug]`` is read
    line by line and each line is passed through ``json_map``.

    Args:
        configuration: Run-configuration directory name (see ``get_configuration``).
        optimization_method: Name of the optimization-method sub-directory.
        is_debug: When true, read from the ``-debug`` variant of the directory.

    Returns:
        A ``(valid_records, error_records)`` tuple of lists. A record counts as
        an error when it contains the key ``"error_message"``. Records are
        returned as plain lists, not as a DataFrame.
    """
    debug_suffix = "-debug" if is_debug else ""
    results_dir = Path(output_path) / configuration / f"{optimization_method}{debug_suffix}"
    file_names = [str(json_file) for json_file in results_dir.rglob("*.json")]

    bag = db.read_text(file_names).map(json_map)
    error_records = bag.filter(lambda record: "error_message" in record)
    valid_records = bag.filter(lambda record: "error_message" not in record)

    # Compute both filtered views in one call so Dask can share the read/parse
    # tasks between them instead of scheduling the whole pipeline twice.
    valid, errors = compute(valid_records, error_records)
    return valid, errors

In [14]:
def get_scores_df_by_configuration_df(configuration, is_debug=False):
    """Load the four active optimization variants and merge them on ``gene_name``.

    The bulk z-score ratio/diff results are inner-merged with the suffixes
    ``_zscore_bulk_ratio`` / ``_zscore_bulk_diff``, the single-codon ratio/diff
    results with ``_single_ratio`` / ``_single_diff``, and the two merged frames
    are then inner-merged with each other. The ``zscore_single_aa_*`` variants
    are not loaded.

    NOTE(review): ``dd.merge`` presumably needs DataFrame-like inputs; confirm
    that ``get_df_for_run_configuration`` returns something mergeable here.

    Returns:
        ``(merged_df, errors)`` where ``errors`` lists the error records of the
        four loads in the order bulk ratio, bulk diff, single ratio, single diff.
    """
    def load(optimization_method):
        return get_df_for_run_configuration(configuration, optimization_method, is_debug)

    def merge_on_gene(left, right, **merge_options):
        return dd.merge(left, right, on='gene_name', how='inner', **merge_options)

    bulk_ratio, bulk_ratio_errors = load("zscore_bulk_aa_ratio")
    bulk_diff, bulk_diff_errors = load("zscore_bulk_aa_diff")
    bulk_merged = merge_on_gene(
        bulk_ratio, bulk_diff, suffixes=('_zscore_bulk_ratio', '_zscore_bulk_diff')
    )

    codon_ratio, codon_ratio_errors = load("single_codon_ratio")
    codon_diff, codon_diff_errors = load("single_codon_diff")
    single_merged = merge_on_gene(
        codon_ratio, codon_diff, suffixes=('_single_ratio', '_single_diff')
    )

    all_merged = merge_on_gene(bulk_merged, single_merged)
    collected_errors = [
        bulk_ratio_errors,
        bulk_diff_errors,
        codon_ratio_errors,
        codon_diff_errors,
    ]
    return all_merged, collected_errors

In [15]:
def get_scores_df_by_configuration(configuration, is_debug=False):
    """Load the four active optimization variants without merging them.

    Calls ``get_df_for_run_configuration`` once per method, in the order
    ``zscore_bulk_aa_ratio``, ``zscore_bulk_aa_diff``, ``single_codon_ratio``,
    ``single_codon_diff``. The ``zscore_single_aa_*`` variants are not loaded.

    Returns:
        ``(valid, errors)``: two lists with one entry per method, in the order
        above, holding the valid results and the error records respectively.
    """
    optimization_methods = (
        "zscore_bulk_aa_ratio",
        "zscore_bulk_aa_diff",
        "single_codon_ratio",
        "single_codon_diff",
    )
    valid_per_method = []
    errors_per_method = []
    for optimization_method in optimization_methods:
        valid, errors = get_df_for_run_configuration(configuration, optimization_method, is_debug)
        valid_per_method.append(valid)
        errors_per_method.append(errors)
    return valid_per_method, errors_per_method

In [16]:
def add_median_to_histogram(value, histogram, row, col, upper_limit, text_offset=1):
    """Mark the median of ``value`` on one panel of a histogram grid.

    Draws a dashed green vertical line at the median and writes a
    ``median=<value>`` label next to it.

    Args:
        value: Object exposing ``.median()`` (e.g. a pandas Series).
        histogram: Grid of axes indexable as ``histogram[row][col]``.
        row: Row index of the target panel.
        col: Column index of the target panel.
        upper_limit: Y coordinate at which the label is placed.
        text_offset: Distance the label is shifted to the left of the median
            line, in x-axis data units. Defaults to 1, the previously
            hard-coded value.
    """
    median_value = value.median()
    target_axis = histogram[row][col]
    target_axis.axvline(median_value, color="green", linestyle="dashed", label="median")
    target_axis.text(median_value - text_offset, upper_limit, f"median={median_value}")

In [17]:
def add_median_to_histogram_non_normalized(value, histogram, row, col, upper_limit):
    """Mark the median of ``value`` on one histogram panel, with a 0.1 label offset.

    Draws a dashed green vertical line at the median on ``histogram[row][col]``
    and writes a ``median=<value>`` label at x = median - 0.1, y = ``upper_limit``.
    """
    panel = histogram[row][col]
    median_value = value.median()
    label_x = median_value-0.1
    panel.axvline(median_value, color="green", linestyle="dashed", label="median")
    panel.text(label_x, upper_limit, f"median={median_value}")

In [18]:
def plot_comparative_graphs(result):
    """Scatter-plot every pair of average-distance variants against each other.

    One panel is drawn per unordered pair of columns listed in
    ``average_distance_variant_names`` and annotated with the Pearson
    correlation coefficient of the two columns. The subplot grid is two panels
    wide and has as many rows as the number of pairs requires, so no blank rows
    are produced when variants are added or removed.

    Args:
        result: DataFrame-like object supporting ``result.plot.scatter`` and
            column access for every name in ``average_distance_variant_names``.
    """
    # Unordered pairs (first before second), in the same order the panels are filled.
    variant_pairs = [
        (first_variant, second_variant)
        for position, first_variant in enumerate(average_distance_variant_names)
        for second_variant in average_distance_variant_names[position + 1:]
    ]
    if not variant_pairs:
        # Fewer than two variants: nothing to compare.
        return

    n_cols = 2
    n_rows = -(-len(variant_pairs) // n_cols)  # ceiling division
    # squeeze=False keeps `axes` two-dimensional even for a single row.
    _, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4.5 * n_rows), squeeze=False)
    panels = axes.ravel()

    for panel, (first_variant, second_variant) in zip(panels, variant_pairs):
        scatter = result.plot.scatter(x=first_variant, y=second_variant, ax=panel)
        r, _p_value = stats.pearsonr(result[first_variant], result[second_variant])
        scatter.annotate('r = {:.3f}'.format(r), xy=(0.7, 0.9), xycoords='axes fraction')

    # Hide the spare panel left over when the number of pairs is odd.
    for spare_panel in panels[len(variant_pairs):]:
        spare_panel.set_visible(False)

#### Escherichia coli as the wanted host

In [19]:
configuration = get_configuration(is_ecoli_optimized=True)

In [None]:
df, df_errors = get_df_for_run_configuration(configuration, "zscore_bulk_aa_ratio", False)

Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, ini

In [49]:
df_errors[0]

{'error_message': KeyError('evaluation')}

In [None]:
result, errors = get_scores_df_by_configuration(configuration)

In [20]:
# Find and print the first record whose "iterations_count" entry is missing
# or cannot be converted to int, then stop scanning.
# `result` is expected to be an iterable of record lists (one list per
# optimization method), as returned by get_scores_df_by_configuration.
error=False
for results_list in result:
    for record in results_list:
        try:
            int(record["iterations_count"])
        except Exception:
            # First offending record: show it and leave the inner loop.
            print(record)
            error=True
            break
    # The flag propagates the inner `break` to the outer loop.
    if error:
        break

{'initial_optimization_score': 0.907692690961394, 'final_optimization_score': 1.2662076514775864, 'average_distance_score': 0.7752914068450383, 'average_distance_non_normalized_score': 0.12009975295583997, 'weakest_link_score': 0.7752914068450383, 'ratio_score': 1.0436351158145694, 'gene_name': 'lcl|NC_000001.11_cds_NP_689578.2_2421', 'orf_optimization_cub_index': 'CAI', 'evaluation_score': 'average_distance', 'run_time': 50.58537229895592}


In [22]:
def get_df_for_run_configuration_debug(configuration, optimization_method, is_debug=False, index=None):
    """Debug variant of the result loader that parses lines with plain ``json.loads``.

    Args:
        configuration: Run-configuration directory name.
        optimization_method: Name of the optimization-method sub-directory.
        is_debug: When true, read from the ``-debug`` variant of the directory.
        index: When given, only the file at this position in the discovered
            file list is read (its path is printed first). When ``None``, at
            most the first 2500 discovered files are read.

    Returns:
        ``(valid_df, error_records)``: the records without an
        ``"error_message"`` key as a computed DataFrame, and the records with
        that key as a list.
    """
    suffix = '-debug' if is_debug else ''
    results_dir = os.path.join(output_path, configuration, optimization_method + suffix)
    file_names = list(map(str, Path(results_dir).rglob("*.json")))

    if index is None:
        selection = file_names[:2500]
    else:
        print(file_names[index])
        selection = file_names[index]
    bag = db.read_text(selection).map(json.loads)

    def is_error(record):
        return "error_message" in record.keys()

    error_records = bag.filter(is_error)
    valid_records = bag.filter(lambda record: not is_error(record))

    return valid_records.to_dataframe().compute(), error_records.compute()

In [29]:
# df_single_codon_diff, df_single_codon_diff_error = get_df_for_run_configuration_debug(configuration, "single_codon_ratio", False)
# merged_df, error = get_scores_df_by_configuration_df(configuration, False)
df_single_codon_ratio, df_single_codon_ratio_error = get_df_for_run_configuration_debug(configuration, "zscore_bulk_aa_diff", False)

Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, ini

In [31]:
df_single_codon_ratio_error

[{'error_message': 'Traceback (most recent call last):\n  File "/tamir2/moranb/microbiome/Igem_TAU_2021/modules/main.py", line 55, in run_modules\n    clustered_orf_results.append(run_orf_optimization(\n  File "/tamir2/moranb/microbiome/Igem_TAU_2021/modules/main.py", line 175, in run_orf_optimization\n    cds_nt_final_cai = ORF.ORFModule.run_module(\n  File "/tamir2/moranb/microbiome/Igem_TAU_2021/modules/ORF/orf_main.py", line 47, in run_module\n    return ORFModule.optimize_sequence_by_zscore(\n  File "/tamir2/moranb/microbiome/Igem_TAU_2021/modules/ORF/orf_main.py", line 83, in optimize_sequence_by_zscore\n    result = partial_zscore_optimization_method(sequence=target_gene)\n  File "/tamir2/moranb/microbiome/Igem_TAU_2021/modules/ORF/zscore_optimization_method.py", line 211, in optimize_sequence_by_zscore_bulk_aa\n    aa_to_selected_codon = _get_optimal_codon_per_aa(\n  File "/tamir2/moranb/microbiome/Igem_TAU_2021/modules/ORF/zscore_optimization_method.py", line 298, in _get_opti

Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, initializing it to select=1:ncpus=4:mem=7630MB
Resource specification for PBS not set, ini

In [None]:
# Try loading the result files one at a time to locate the ones that fail.
index = 0
for i in range(index, index+10):
    try:
        df_single_codon_diff, df_single_codon_diff_error = get_df_for_run_configuration_debug(configuration, "zscore_single_aa_ratio", False, i)
    except Exception as load_error:
        # Catch Exception (not a bare except) so the kernel can still be
        # interrupted, and report which file index failed and why.
        print(f"error ^ (file index {i}): {load_error!r}")

In [None]:
# Histogram grid (2 rows x 3 columns) of the average-distance score per variant,
# with the median marked on each panel.
# NOTE(review): this cell indexes `result` like a DataFrame, but the cell that
# assigns it calls get_scores_df_by_configuration, which returns two lists —
# confirm which loader is intended.
# NOTE(review): the "zscore_single" columns are requested here, while the
# zscore_single_aa_* loads are commented out in both loader functions.
histogram = result[["average_distance_score_zscore_bulk_diff", "average_distance_score_zscore_single_diff", "average_distance_score_single_diff", "average_distance_score_zscore_bulk_ratio", "average_distance_score_zscore_single_ratio", "average_distance_score_single_ratio"]].hist(sharex=True, sharey=True,figsize=(15, 10), layout=(2,3))

# Y coordinate at which each median label is drawn.
median_height = 120
# Top row: "diff" variants.
add_median_to_histogram(result["average_distance_score_zscore_bulk_diff"], histogram, 0, 0, median_height)
add_median_to_histogram(result["average_distance_score_zscore_single_diff"], histogram, 0, 1, median_height)
add_median_to_histogram(result["average_distance_score_single_diff"], histogram, 0, 2, median_height)

# Bottom row: "ratio" variants.
add_median_to_histogram(result["average_distance_score_zscore_bulk_ratio"], histogram, 1, 0, median_height)
add_median_to_histogram(result["average_distance_score_zscore_single_ratio"], histogram, 1, 1, median_height)
add_median_to_histogram(result["average_distance_score_single_ratio"], histogram, 1,2, median_height)

In [None]:
# Count, per variant, the rows whose average-distance score is negative and
# print the count together with its share of all rows (in percent).
# NOTE(review): assumes `result` is a non-empty DataFrame; len(result) == 0
# would raise ZeroDivisionError in the prints below.
# NOTE(review): the "zscore_single" columns are used here, while the
# zscore_single_aa_* loads are commented out in both loader functions.
single_diff_negative = result[result["average_distance_score_single_diff"] < 0] 
single_ratio_negative = result[result["average_distance_score_single_ratio"] < 0] 

zscore_bulk_diff_negative = result[result["average_distance_score_zscore_bulk_diff"] < 0] 
zscore_bulk_ratio_negative = result[result["average_distance_score_zscore_bulk_ratio"] < 0]

zscore_single_diff_negative = result[result["average_distance_score_zscore_single_diff"] < 0] 
zscore_single_ratio_negative = result[result["average_distance_score_zscore_single_ratio"] < 0] 

# Output order: single diff, single ratio, bulk diff, bulk ratio, zscore-single diff, zscore-single ratio.
print(f"{len(single_diff_negative)}, percentage:{len(single_diff_negative)/len(result) * 100}")
print(f"{len(single_ratio_negative)}, percentage:{len(single_ratio_negative)/len(result) * 100}")
print(f"{len(zscore_bulk_diff_negative)}, percentage:{len(zscore_bulk_diff_negative)/len(result) * 100}")
print(f"{len(zscore_bulk_ratio_negative)}, percentage:{len(zscore_bulk_ratio_negative)/len(result) * 100}")
print(f"{len(zscore_single_diff_negative)}, percentage:{len(zscore_single_diff_negative)/len(result) * 100}")
print(f"{len(zscore_single_ratio_negative)}, percentage:{len(zscore_single_ratio_negative)/len(result) * 100}")

In [None]:
plot_comparative_graphs(result)

#### Escherichia coli as the unwanted host

In [None]:
configuration = get_configuration(is_ecoli_optimized=False)
result, errors = get_scores_df_by_configuration(configuration)

In [None]:
# Histogram grid (2 rows x 3 columns) of the average-distance score per variant
# for the configuration loaded in the previous cell, with the median marked on
# each panel.
# NOTE(review): this cell indexes `result` like a DataFrame, but the preceding
# cell assigns it from get_scores_df_by_configuration, which returns two lists —
# confirm which loader is intended.
# NOTE(review): the "zscore_single" columns are requested here, while the
# zscore_single_aa_* loads are commented out in both loader functions.
histogram = result[["average_distance_score_zscore_bulk_diff", "average_distance_score_zscore_single_diff", "average_distance_score_single_diff", "average_distance_score_zscore_bulk_ratio", "average_distance_score_zscore_single_ratio", "average_distance_score_single_ratio"]].hist(sharex=True, sharey=True,figsize=(15, 10), layout=(2,3))

# Y coordinate at which each median label is drawn.
median_height = 120
# Top row: "diff" variants.
add_median_to_histogram(result["average_distance_score_zscore_bulk_diff"], histogram, 0, 0, median_height)
add_median_to_histogram(result["average_distance_score_zscore_single_diff"], histogram, 0, 1, median_height)
add_median_to_histogram(result["average_distance_score_single_diff"], histogram, 0, 2, median_height)

# Bottom row: "ratio" variants.
add_median_to_histogram(result["average_distance_score_zscore_bulk_ratio"], histogram, 1, 0, median_height)
add_median_to_histogram(result["average_distance_score_zscore_single_ratio"], histogram, 1, 1, median_height)
add_median_to_histogram(result["average_distance_score_single_ratio"], histogram, 1,2, median_height)

In [None]:
# Count, per variant, the rows whose average-distance score is negative and
# print the count together with its share of all rows (in percent), for the
# configuration loaded earlier in this section.
# NOTE(review): assumes `result` is a non-empty DataFrame; len(result) == 0
# would raise ZeroDivisionError in the prints below.
# NOTE(review): the "zscore_single" columns are used here, while the
# zscore_single_aa_* loads are commented out in both loader functions.
single_diff_negative = result[result["average_distance_score_single_diff"] < 0] 
single_ratio_negative = result[result["average_distance_score_single_ratio"] < 0] 

zscore_bulk_diff_negative = result[result["average_distance_score_zscore_bulk_diff"] < 0] 
zscore_bulk_ratio_negative = result[result["average_distance_score_zscore_bulk_ratio"] < 0]

zscore_single_diff_negative = result[result["average_distance_score_zscore_single_diff"] < 0] 
zscore_single_ratio_negative = result[result["average_distance_score_zscore_single_ratio"] < 0] 

# Output order: single diff, single ratio, bulk diff, bulk ratio, zscore-single diff, zscore-single ratio.
print(f"{len(single_diff_negative)}, percentage:{len(single_diff_negative)/len(result) * 100}")
print(f"{len(single_ratio_negative)}, percentage:{len(single_ratio_negative)/len(result) * 100}")
print(f"{len(zscore_bulk_diff_negative)}, percentage:{len(zscore_bulk_diff_negative)/len(result) * 100}")
print(f"{len(zscore_bulk_ratio_negative)}, percentage:{len(zscore_bulk_ratio_negative)/len(result) * 100}")
print(f"{len(zscore_single_diff_negative)}, percentage:{len(zscore_single_diff_negative)/len(result) * 100}")
print(f"{len(zscore_single_ratio_negative)}, percentage:{len(zscore_single_ratio_negative)/len(result) * 100}")

In [None]:
plot_comparative_graphs(result)