From 16669e322d101f2575262023df27127b736a9f72 Mon Sep 17 00:00:00 2001 From: jiachen Date: Wed, 26 Apr 2023 18:24:59 +0000 Subject: [PATCH] train larry mono and neu --- pyrovelocity/config.py | 42 ++++ pyrovelocity/data.py | 5 +- reproducibility/figures/config.yaml | 92 +++++++ reproducibility/figures/dvc.lock | 234 ++++++++++++++++++ reproducibility/figures/dvc.yaml | 96 +++++++ .../models/larry_mono_model2/.gitignore | 3 + .../models/larry_mono_model2/metrics.json | 7 + .../models/larry_mono_model2/run_info.json | 12 + .../models/larry_neu_model2/.gitignore | 3 + .../models/larry_neu_model2/metrics.json | 7 + .../models/larry_neu_model2/run_info.json | 12 + 11 files changed, 512 insertions(+), 1 deletion(-) create mode 100644 reproducibility/figures/models/larry_mono_model2/.gitignore create mode 100644 reproducibility/figures/models/larry_mono_model2/metrics.json create mode 100644 reproducibility/figures/models/larry_mono_model2/run_info.json create mode 100644 reproducibility/figures/models/larry_neu_model2/.gitignore create mode 100644 reproducibility/figures/models/larry_neu_model2/metrics.json create mode 100644 reproducibility/figures/models/larry_neu_model2/run_info.json diff --git a/pyrovelocity/config.py b/pyrovelocity/config.py index fbe8c261e..b193d540b 100644 --- a/pyrovelocity/config.py +++ b/pyrovelocity/config.py @@ -180,6 +180,24 @@ def create_reports_config(model_name: str, model_number: int): process_method="load_data", process_args=dict(), ), + larry_mono=create_dataset_config( + "larry_mono", + dl_root="${data_external.root_path}", + data_file="larry_mono.h5ad", + rel_path="${data_external.root_path}/larry_mono.h5ad", + url="${data_external.pyrovelocity.sources.figshare_root_url}/37028572", + process_method="load_data", + process_args=dict(), + ), + larry_neu=create_dataset_config( + "larry_neu", + dl_root="${data_external.root_path}", + data_file="larry_neu.h5ad", + rel_path="${data_external.root_path}/larry_neu.h5ad", + 
url="${data_external.pyrovelocity.sources.figshare_root_url}/37028575", + process_method="load_data", + process_args=dict(), + ), ), ), model_training=dict( @@ -193,6 +211,8 @@ def create_reports_config(model_name: str, model_number: int): "pons_model1", "pons_model2", "larry_model2", + "larry_mono_model2", + "larry_neu_model2", ], simulate_model1=create_model_config( "simulate", @@ -275,6 +295,28 @@ def create_reports_config(model_name: str, model_number: int): offset=True, max_epochs=1000, ), + larry_mono_model2=create_model_config( + "pyrovelocity", + "larry_mono", + 2, + "emb", + svi_train=True, + batch_size=4000, + cell_state="state_info", + offset=True, + max_epochs=1000, + ), + larry_neu_model2=create_model_config( + "pyrovelocity", + "larry_neu", + 2, + "emb", + svi_train=True, + batch_size=4000, + cell_state="state_info", + offset=True, + max_epochs=1000, + ), ), reports=dict( model_summary=dict( diff --git a/pyrovelocity/data.py b/pyrovelocity/data.py index 809c8efea..c84de5190 100644 --- a/pyrovelocity/data.py +++ b/pyrovelocity/data.py @@ -90,6 +90,9 @@ def load_data( adata = scv.datasets.dentategyrus() elif data == "larry": adata = load_larry() + elif data in ['larry_mono', 'larry_neu']: + adata = load_unipotent_larry(data.split('_')[1]) + adata = adata[adata.obs.state_info != "Centroid", :] else: adata = sc.read(data) @@ -118,7 +121,7 @@ def load_data( scv.tl.velocity(adata, mode="dynamical", use_raw=False) scv.tl.velocity_graph(adata, n_jobs=-1) - if data == "larry": + if "larry" in data: scv.tl.velocity_embedding(adata, basis="emb") else: scv.tl.velocity_embedding(adata) diff --git a/reproducibility/figures/config.yaml b/reproducibility/figures/config.yaml index 5b9039362..380491a2b 100644 --- a/reproducibility/figures/config.yaml +++ b/reproducibility/figures/config.yaml @@ -93,6 +93,26 @@ data_external: process_method: load_data process_args: {} rel_path: data/processed/larry_processed.h5ad + larry_mono: + data_file: larry_mono.h5ad + dl_root: 
data/external + dl_path: data/external/larry_mono.h5ad + rel_path: data/external/larry_mono.h5ad + url: https://ndownloader.figshare.com/files/37028572 + derived: + process_method: load_data + process_args: {} + rel_path: data/processed/larry_mono_processed.h5ad + larry_neu: + data_file: larry_neu.h5ad + dl_root: data/external + dl_path: data/external/larry_neu.h5ad + rel_path: data/external/larry_neu.h5ad + url: https://ndownloader.figshare.com/files/37028575 + derived: + process_method: load_data + process_args: {} + rel_path: data/processed/larry_neu_processed.h5ad model_training: train: - simulate_model1 @@ -104,6 +124,8 @@ model_training: - pons_model1 - pons_model2 - larry_model2 + - larry_mono_model2 + - larry_neu_model2 simulate_model1: path: models/medium_model1 model_path: models/medium_model1/model @@ -419,6 +441,76 @@ model_training: cell_specific_kinetics: null kinetics_num: 2 loss_plot_path: models/larry_model2/loss_plot.png + larry_mono_model2: + path: models/larry_mono_model2 + model_path: models/larry_mono_model2/model + input_data_path: data/processed/larry_mono_processed.h5ad + trained_data_path: models/larry_mono_model2/trained.h5ad + pyrovelocity_data_path: models/larry_mono_model2/pyrovelocity.pkl + metrics_path: models/larry_mono_model2/metrics.json + run_info_path: models/larry_mono_model2/run_info.json + vector_field_parameters: + basis: emb + training_parameters: + _target_: pyrovelocity.api.train_model + _partial_: true + guide_type: auto + model_type: auto + svi_train: true + batch_size: 4000 + train_size: 1.0 + use_gpu: 0 + likelihood: Poisson + num_samples: 30 + log_every: 100 + cell_state: state_info + patient_improve: 0.0001 + patient_init: 45 + seed: 99 + lr: 0.01 + max_epochs: 1000 + include_prior: true + library_size: true + offset: true + input_type: raw + cell_specific_kinetics: null + kinetics_num: 2 + loss_plot_path: models/larry_mono_model2/loss_plot.png + larry_neu_model2: + path: models/larry_neu_model2 + model_path: 
models/larry_neu_model2/model + input_data_path: data/processed/larry_neu_processed.h5ad + trained_data_path: models/larry_neu_model2/trained.h5ad + pyrovelocity_data_path: models/larry_neu_model2/pyrovelocity.pkl + metrics_path: models/larry_neu_model2/metrics.json + run_info_path: models/larry_neu_model2/run_info.json + vector_field_parameters: + basis: emb + training_parameters: + _target_: pyrovelocity.api.train_model + _partial_: true + guide_type: auto + model_type: auto + svi_train: true + batch_size: 4000 + train_size: 1.0 + use_gpu: 0 + likelihood: Poisson + num_samples: 30 + log_every: 100 + cell_state: state_info + patient_improve: 0.0001 + patient_init: 45 + seed: 99 + lr: 0.01 + max_epochs: 1000 + include_prior: true + library_size: true + offset: true + input_type: raw + cell_specific_kinetics: null + kinetics_num: 2 + loss_plot_path: models/larry_neu_model2/loss_plot.png reports: model_summary: summarize: diff --git a/reproducibility/figures/dvc.lock b/reproducibility/figures/dvc.lock index 8848c4c1e..788dfbbb3 100644 --- a/reproducibility/figures/dvc.lock +++ b/reproducibility/figures/dvc.lock @@ -1624,3 +1624,237 @@ stages: - path: models/larry_model2/trained.h5ad md5: 3b213324776dd9b266b21b92c0fd98ca size: 6296829206 + data_download_larry_neu: + cmd: python data_download.py data_external.sources=[pyrovelocity] data_external.pyrovelocity.download=[larry_neu] + deps: + - path: data_download.py + md5: 30f38cc794cbf4caad2675ffd88c2467 + size: 3305 + params: + config.yaml: + base: + log_level: INFO + data_external.pyrovelocity.larry_neu: + data_file: larry_neu.h5ad + dl_root: data/external + dl_path: data/external/larry_neu.h5ad + rel_path: data/external/larry_neu.h5ad + url: https://ndownloader.figshare.com/files/37028575 + derived: + process_method: load_data + process_args: {} + rel_path: data/processed/larry_neu_processed.h5ad + outs: + - path: data/external/larry_neu.h5ad + md5: 3192e2fe89d64f5d0d158c0e7d26c79d + size: 60008807 + 
preprocess_larry_neu: + cmd: python preprocess.py data_external.sources=[pyrovelocity] data_external.pyrovelocity.process=[larry_neu] + deps: + - path: data/external/larry_neu.h5ad + md5: 3192e2fe89d64f5d0d158c0e7d26c79d + size: 60008807 + - path: preprocess.py + md5: bf09c86fc25b1d1a98b9aa1c5fa04361 + size: 2757 + params: + config.yaml: + base: + log_level: INFO + data_external.pyrovelocity.larry_neu: + data_file: larry_neu.h5ad + dl_root: data/external + dl_path: data/external/larry_neu.h5ad + rel_path: data/external/larry_neu.h5ad + url: https://ndownloader.figshare.com/files/37028575 + derived: + process_method: load_data + process_args: {} + rel_path: data/processed/larry_neu_processed.h5ad + outs: + - path: data/processed/larry_neu_processed.h5ad + md5: d4fc7310876d4607b2f2b9e87b515980 + size: 119684115 + data_download_larry_mono: + cmd: python data_download.py data_external.sources=[pyrovelocity] data_external.pyrovelocity.download=[larry_mono] + deps: + - path: data_download.py + md5: 30f38cc794cbf4caad2675ffd88c2467 + size: 3305 + params: + config.yaml: + base: + log_level: INFO + data_external.pyrovelocity.larry_mono: + data_file: larry_mono.h5ad + dl_root: data/external + dl_path: data/external/larry_mono.h5ad + rel_path: data/external/larry_mono.h5ad + url: https://ndownloader.figshare.com/files/37028572 + derived: + process_method: load_data + process_args: {} + rel_path: data/processed/larry_mono_processed.h5ad + outs: + - path: data/external/larry_mono.h5ad + md5: 01f4e084c37482e26800ba4dfa0202bd + size: 66173538 + preprocess_larry_mono: + cmd: python preprocess.py data_external.sources=[pyrovelocity] data_external.pyrovelocity.process=[larry_mono] + deps: + - path: data/external/larry_mono.h5ad + md5: 01f4e084c37482e26800ba4dfa0202bd + size: 66173538 + - path: preprocess.py + md5: bf09c86fc25b1d1a98b9aa1c5fa04361 + size: 2757 + params: + config.yaml: + base: + log_level: INFO + data_external.pyrovelocity.larry_mono: + data_file: larry_mono.h5ad + 
dl_root: data/external + dl_path: data/external/larry_mono.h5ad + rel_path: data/external/larry_mono.h5ad + url: https://ndownloader.figshare.com/files/37028572 + derived: + process_method: load_data + process_args: {} + rel_path: data/processed/larry_mono_processed.h5ad + outs: + - path: data/processed/larry_mono_processed.h5ad + md5: ae00da2f12ab951745640e24a7a33045 + size: 139326026 + train_larry_mono_model2: + cmd: /usr/bin/time -v python train.py model_training.train=[larry_mono_model2] + deps: + - path: data/processed/larry_mono_processed.h5ad + md5: ae00da2f12ab951745640e24a7a33045 + size: 139326026 + - path: train.py + md5: 0e7c46a112eab9290b48ad4f3deecaaa + size: 8684 + params: + config.yaml: + model_training.larry_mono_model2: + path: models/larry_mono_model2 + model_path: models/larry_mono_model2/model + input_data_path: data/processed/larry_mono_processed.h5ad + trained_data_path: models/larry_mono_model2/trained.h5ad + pyrovelocity_data_path: models/larry_mono_model2/pyrovelocity.pkl + metrics_path: models/larry_mono_model2/metrics.json + run_info_path: models/larry_mono_model2/run_info.json + vector_field_parameters: + basis: emb + training_parameters: + _target_: pyrovelocity.api.train_model + _partial_: true + guide_type: auto + model_type: auto + svi_train: true + batch_size: 4000 + train_size: 1.0 + use_gpu: 0 + likelihood: Poisson + num_samples: 30 + log_every: 100 + cell_state: state_info + patient_improve: 0.0001 + patient_init: 45 + seed: 99 + lr: 0.01 + max_epochs: 1000 + include_prior: true + library_size: true + offset: true + input_type: raw + cell_specific_kinetics: + kinetics_num: 2 + loss_plot_path: models/larry_mono_model2/loss_plot.png + outs: + - path: models/larry_mono_model2/loss_plot.png + md5: 906f8f1c5ea6df8d0c58bd7d7a78455c + size: 12683 + - path: models/larry_mono_model2/metrics.json + md5: eac6e405df47e2a0132613b55d53541c + size: 160 + - path: models/larry_mono_model2/model + md5: b83900a0168c6e075ca66b13c15ddce7.dir + size: 
610922 + nfiles: 1 + - path: models/larry_mono_model2/pyrovelocity.pkl + md5: fc35521f8582e1f6496c19062af2f379 + size: 21028511 + - path: models/larry_mono_model2/run_info.json + md5: 65ccbbfbcc7a977381fee6101f3b08fd + size: 457 + - path: models/larry_mono_model2/trained.h5ad + md5: 964026025d35aaeabea8932c39c9a2b6 + size: 158935226 + train_larry_neu_model2: + cmd: /usr/bin/time -v python train.py model_training.train=[larry_neu_model2] + deps: + - path: data/processed/larry_neu_processed.h5ad + md5: d4fc7310876d4607b2f2b9e87b515980 + size: 119684115 + - path: train.py + md5: 0e7c46a112eab9290b48ad4f3deecaaa + size: 8684 + params: + config.yaml: + model_training.larry_neu_model2: + path: models/larry_neu_model2 + model_path: models/larry_neu_model2/model + input_data_path: data/processed/larry_neu_processed.h5ad + trained_data_path: models/larry_neu_model2/trained.h5ad + pyrovelocity_data_path: models/larry_neu_model2/pyrovelocity.pkl + metrics_path: models/larry_neu_model2/metrics.json + run_info_path: models/larry_neu_model2/run_info.json + vector_field_parameters: + basis: emb + training_parameters: + _target_: pyrovelocity.api.train_model + _partial_: true + guide_type: auto + model_type: auto + svi_train: true + batch_size: 4000 + train_size: 1.0 + use_gpu: 0 + likelihood: Poisson + num_samples: 30 + log_every: 100 + cell_state: state_info + patient_improve: 0.0001 + patient_init: 45 + seed: 99 + lr: 0.01 + max_epochs: 1000 + include_prior: true + library_size: true + offset: true + input_type: raw + cell_specific_kinetics: + kinetics_num: 2 + loss_plot_path: models/larry_neu_model2/loss_plot.png + outs: + - path: models/larry_neu_model2/loss_plot.png + md5: c7faa07326dd0881f86ce883ca176908 + size: 12516 + - path: models/larry_neu_model2/metrics.json + md5: d538c4231efb5a2a8fceb2933c93d1b5 + size: 161 + - path: models/larry_neu_model2/model + md5: 6a70127d67f8a2b7042cdcaa86f21507.dir + size: 564522 + nfiles: 1 + - path: models/larry_neu_model2/pyrovelocity.pkl 
+ md5: 1eb39b2ef89cbec3321db8a251cd0960 + size: 18428100 + - path: models/larry_neu_model2/run_info.json + md5: fd0c23e88da6b69cac3d72b3c29fdffd + size: 456 + - path: models/larry_neu_model2/trained.h5ad + md5: 200056b02081a4560ceaceec98b138d7 + size: 136435223 diff --git a/reproducibility/figures/dvc.yaml b/reproducibility/figures/dvc.yaml index 67a71a713..e8b4f0569 100644 --- a/reproducibility/figures/dvc.yaml +++ b/reproducibility/figures/dvc.yaml @@ -90,6 +90,30 @@ stages: - ${data_external.pyrovelocity.larry.rel_path}: persist: true + data_download_larry_mono: + cmd: python data_download.py data_external.sources=[pyrovelocity] data_external.pyrovelocity.download=[larry_mono] + deps: + - data_download.py + params: + - config.yaml: + - base + - data_external.pyrovelocity.larry_mono + outs: + - ${data_external.pyrovelocity.larry_mono.rel_path}: + persist: true + + data_download_larry_neu: + cmd: python data_download.py data_external.sources=[pyrovelocity] data_external.pyrovelocity.download=[larry_neu] + deps: + - data_download.py + params: + - config.yaml: + - base + - data_external.pyrovelocity.larry_neu + outs: + - ${data_external.pyrovelocity.larry_neu.rel_path}: + persist: true + data_download_pons: cmd: python data_download.py data_external.sources=[velocyto] data_external.velocyto.download=[pons] deps: @@ -182,6 +206,32 @@ stages: - ${data_external.pyrovelocity.larry.derived.rel_path} # persist: true + preprocess_larry_mono: + cmd: python preprocess.py data_external.sources=[pyrovelocity] data_external.pyrovelocity.process=[larry_mono] + deps: + - preprocess.py + - ${data_external.pyrovelocity.larry_mono.rel_path} + params: + - config.yaml: + - base + - data_external.pyrovelocity.larry_mono + outs: + - ${data_external.pyrovelocity.larry_mono.derived.rel_path} + # persist: true + + preprocess_larry_neu: + cmd: python preprocess.py data_external.sources=[pyrovelocity] data_external.pyrovelocity.process=[larry_neu] + deps: + - preprocess.py + - 
${data_external.pyrovelocity.larry_neu.rel_path} + params: + - config.yaml: + - base + - data_external.pyrovelocity.larry_neu + outs: + - ${data_external.pyrovelocity.larry_neu.derived.rel_path} + # persist: true + train_simulate_model1: cmd: /usr/bin/time -v python train.py model_training.train=[simulate_model1] deps: @@ -409,6 +459,52 @@ stages: - ${model_training.larry_model2.pyrovelocity_data_path} # persist: true + train_larry_mono_model2: + cmd: /usr/bin/time -v python train.py model_training.train=[larry_mono_model2] + deps: + - train.py + - ${model_training.larry_mono_model2.input_data_path} + params: + - config.yaml: + - model_training.larry_mono_model2 + metrics: + - ${model_training.larry_mono_model2.metrics_path}: + cache: false + outs: + - ${model_training.larry_mono_model2.run_info_path}: + cache: false + - ${model_training.larry_mono_model2.training_parameters.loss_plot_path} + # persist: true + - ${model_training.larry_mono_model2.trained_data_path} + # persist: true + - ${model_training.larry_mono_model2.model_path} + # persist: true + - ${model_training.larry_mono_model2.pyrovelocity_data_path} + # persist: true + + train_larry_neu_model2: + cmd: /usr/bin/time -v python train.py model_training.train=[larry_neu_model2] + deps: + - train.py + - ${model_training.larry_neu_model2.input_data_path} + params: + - config.yaml: + - model_training.larry_neu_model2 + metrics: + - ${model_training.larry_neu_model2.metrics_path}: + cache: false + outs: + - ${model_training.larry_neu_model2.run_info_path}: + cache: false + - ${model_training.larry_neu_model2.training_parameters.loss_plot_path} + # persist: true + - ${model_training.larry_neu_model2.trained_data_path} + # persist: true + - ${model_training.larry_neu_model2.model_path} + # persist: true + - ${model_training.larry_neu_model2.pyrovelocity_data_path} + # persist: true + figure2: cmd: python fig2/figure.py deps: diff --git a/reproducibility/figures/models/larry_mono_model2/.gitignore 
b/reproducibility/figures/models/larry_mono_model2/.gitignore new file mode 100644 index 000000000..d82a076b3 --- /dev/null +++ b/reproducibility/figures/models/larry_mono_model2/.gitignore @@ -0,0 +1,3 @@ +/loss_plot.png +/model +/pyrovelocity.pkl diff --git a/reproducibility/figures/models/larry_mono_model2/metrics.json b/reproducibility/figures/models/larry_mono_model2/metrics.json new file mode 100644 index 000000000..a6c691e2e --- /dev/null +++ b/reproducibility/figures/models/larry_mono_model2/metrics.json @@ -0,0 +1,7 @@ +{ + "-ELBO": -0.9092934969035932, + "MAE": 0.29600671132732154, + "FDR_HMP": 7.114115202502107e-10, + "FDR_sig_frac": 0.831, + "real_epochs": 1000.0 +} \ No newline at end of file diff --git a/reproducibility/figures/models/larry_mono_model2/run_info.json b/reproducibility/figures/models/larry_mono_model2/run_info.json new file mode 100644 index 000000000..54636a071 --- /dev/null +++ b/reproducibility/figures/models/larry_mono_model2/run_info.json @@ -0,0 +1,12 @@ +{ + "artifact_uri": "file:///home/jupyter/pyrovelocity/reproducibility/figures/mlruns/0/62d2603eb6594a8894c11292ac544ba2/artifacts", + "end_time": 1682515549322, + "experiment_id": "0", + "lifecycle_stage": "active", + "run_id": "62d2603eb6594a8894c11292ac544ba2", + "run_name": "larry_mono_model2-62d2603", + "run_uuid": "62d2603eb6594a8894c11292ac544ba2", + "start_time": 1682515031960, + "status": "FINISHED", + "user_id": "jupyter" +} \ No newline at end of file diff --git a/reproducibility/figures/models/larry_neu_model2/.gitignore b/reproducibility/figures/models/larry_neu_model2/.gitignore new file mode 100644 index 000000000..d82a076b3 --- /dev/null +++ b/reproducibility/figures/models/larry_neu_model2/.gitignore @@ -0,0 +1,3 @@ +/loss_plot.png +/model +/pyrovelocity.pkl diff --git a/reproducibility/figures/models/larry_neu_model2/metrics.json b/reproducibility/figures/models/larry_neu_model2/metrics.json new file mode 100644 index 000000000..4502815e5 --- /dev/null +++ 
b/reproducibility/figures/models/larry_neu_model2/metrics.json @@ -0,0 +1,7 @@ +{ + "-ELBO": -0.9373059432127164, + "MAE": 0.31443805465303426, + "FDR_HMP": 3.2016832157452906e-09, + "FDR_sig_frac": 0.682, + "real_epochs": 1000.0 +} \ No newline at end of file diff --git a/reproducibility/figures/models/larry_neu_model2/run_info.json b/reproducibility/figures/models/larry_neu_model2/run_info.json new file mode 100644 index 000000000..af9557b99 --- /dev/null +++ b/reproducibility/figures/models/larry_neu_model2/run_info.json @@ -0,0 +1,12 @@ +{ + "artifact_uri": "file:///home/jupyter/pyrovelocity/reproducibility/figures/mlruns/0/be498221df1945b49407f45bbef748a8/artifacts", + "end_time": 1682523702581, + "experiment_id": "0", + "lifecycle_stage": "active", + "run_id": "be498221df1945b49407f45bbef748a8", + "run_name": "larry_neu_model2-be49822", + "run_uuid": "be498221df1945b49407f45bbef748a8", + "start_time": 1682523236737, + "status": "FINISHED", + "user_id": "jupyter" +} \ No newline at end of file