Merge pull request #11 from maxibor/dev

Version 0.40
maxibor · Sep 22, 2020 · 204d4fc · 204d4fc
2 parents 7b21923 + be55fe4
commit 204d4fc
Show file tree

Hide file tree

Showing 14 changed files with 68,378 additions and 138 deletions.
diff --git a/.github/workflows/pydamage_ci.yml b/.github/workflows/pydamage_ci.yml
@@ -10,12 +10,14 @@ jobs:
     if: "!contains(github.event.head_commit.message, '[skip_ci]')"
     steps:
       - uses: actions/checkout@v2
-      - uses: goanpeca/setup-miniconda@v1
+      - uses: conda-incubator/setup-miniconda@v1
         with:
-          activate-environment: pydamage
-          environment-file: environment.yml
           python-version: 3.7
-          auto-activate-base: false
+          mamba-version: "*"
+          channels: conda-forge,bioconda,defaults
+          channel-priority: true
+          environment-file: environment.yml
+          activate-environment: pydamage
       - name: Lint with flake8
         shell: bash -l {0}
         run: |
@@ -34,4 +36,4 @@ jobs:
       - name: Check pydamage on test data
         shell: bash -l {0}
         run: |
-          pydamage --verbose tests/data/aligned.bam
+          pydamage --verbose tests/data/aligned.bam tests/data/sequence.fa
diff --git a/README.md b/README.md
@@ -25,23 +25,25 @@ pip install pydamage
 Using pip
 
 ```bash
-pip install git+ssh://git@github.com/maxibor/pydamage.git
+pip install git+ssh://git@github.com/maxibor/pydamage.git@dev
 ```
 
 By cloning in a dedicated conda environment
 
 ```bash
 git clone git@github.com:maxibor/pydamage.git
 cd pydamage
+git checkout dev
 conda env create -f environment.yml
 conda activate pydamage
+pip install -e .
 ```
 
 
 ## Quick start
 
 ```bash
-pydamage aligned.bam
+pydamage aligned.bam reference.fa
 ```
 
 ## CLI help

diff --git a/docs/assets/pydamage_results.csv b/docs/assets/pydamage_results.csv
diff --git a/docs/source/output.md b/docs/source/output.md
@@ -2,24 +2,26 @@
 
 Pydamage generates both a tabular and a visual output.
 
-The tabular output is a comma-separated file (`.csv`) with the following information for all analysed references:
+The tabular output is a comma-separated file (`.csv`) with the following columns, for each analysed reference:
 
   * `reference`: name of the reference genome/contig
   * `null_model_p0`: parameter `p0` of the null model
   * `null_model_p0_stdev`: standard error of the null model paramater `p0`
   * `damage_model_p`: parameter `p` of the damage model
   * `damage_model_p_stdev`: standard error of the parameter `p` of the damage model
-  * `damage_model_pmin`: paramater `p_min` of the damage model
+  * `damage_model_pmin`: paramater `p_min` of the damage model. *This is the modelled damage baseline*
   * `damage_model_pmin_stdev`: standard error of the paramater `p_min` of the damage model
-  * `damage_model_pmax`: paramater `p_max` of the damage model
+  * `damage_model_pmax`: paramater `p_max` of the damage model. *This is the modelled amount of damage on the 5' end.*
   * `damage_model_pmax_stdev`: standard error of the paramater `p_max` of the damage model
   * `pvalue`: p-value calculated from the likelihood-ratio test-statistic using a chi-squared distribution
   * `qvalue`: p-value corrected for multiple testing using Benjamini-Hochberg procedure
   * `RMSE`: residual mean standard error of the model fit of the damage model
   * `nb_reads_aligned`: number of aligned reads
   * `coverage`: average coverage along the reference genome
+  * `CtoT-N`: Proportion of CtoT substitutions observed at position `N` from 5' end
+  * `GtoA-N`: Proportion of GtoA substitutions observed at position `N` from 5'
+  * `pred_accuracy`: Predicted accuracy of Pydamage prediction, from the GLM modelling
 
-Finally, the remaining columns indicate the frequency of C to T and G to A transitions at every read position observed in the sequencing data.
 
 The visual output are PNG files, one per reference contig. They show the frequency of observed C to T, and G to A transitions at the 5' end of the sequencing data and overlay it with the fitted models for both the null and the damage model, including 95% confidence intervals. Furthermore, it provides a "residuals versus fitted" plot to help evaluate the fit of the pydamage damage model. Finally, the plot contains informtion on the average coverage along the reference and the p-value calculated from the likelihood-ratio test-statistic using a chi-squared distribution.
 

diff --git a/pydamage/__init__.py b/pydamage/__init__.py
@@ -1,2 +1,2 @@
-__version__ = '0.30'
+__version__ = "0.40"
 from pydamage.main import analyze
diff --git a/pydamage/accuracy_model.py b/pydamage/accuracy_model.py
@@ -0,0 +1,93 @@
+import pkg_resources
+import pandas as pd
+import numpy as np
+import pickle
+
+
+def load_model():
+    """Returns the pmml model"""
+    # This is a stream,like object. If you want the actual info, call
+    # stream.read()
+    stream = pkg_resources.resource_stream(__name__, "models/glm_accuracy_model.pickle")
+    return pickle.load(stream)
+
+
+def prepare_data(pd_df):
+    """Prepare pydamage result data for accuracy modelling
+
+    Args:
+        pd_df (pandas DataFrame):pydamage df result
+    """
+    coverage_bins = pd.IntervalIndex.from_tuples(
+        [
+            (0, 2),
+            (2, 3),
+            (3, 5),
+            (5, 10),
+            (10, 20),
+            (20, 50),
+            (50, 100),
+            (100, 200),
+            (200, np.inf),
+        ]
+    )
+    coverage_bins_labels = [
+        "1-2",
+        "2-3",
+        "3-5",
+        "5-10",
+        "10-20",
+        "20-50",
+        "50-100",
+        "100-200",
+        "200-500",
+    ]
+
+    reflen_bins = pd.IntervalIndex.from_tuples(
+        [
+            (0, 1000),
+            (1000, 2000),
+            (2000, 5000),
+            (5000, 10000),
+            (10000, 20000),
+            (20000, 50000),
+            (50000, 100000),
+            (100000, 200000),
+            (200000, np.inf),
+        ]
+    )
+
+    reflen_bins_labels = [
+        "500-1000",
+        "1000-2000",
+        "2000-5000",
+        "5000-10000",
+        "10000-20000",
+        "20000-50000",
+        "50000-100000",
+        "100000-200000",
+        "200000-500000",
+    ]
+    simu_cov = pd.cut(pd_df["coverage"], coverage_bins)
+    simu_cov.cat.rename_categories(coverage_bins_labels, inplace=True)
+    pd_df["simuCov"] = simu_cov
+
+    simu_contig_length = pd.cut(pd_df["reflen"], reflen_bins)
+    simu_contig_length.cat.rename_categories(reflen_bins_labels, inplace=True)
+
+    pd_df["simuContigLength"] = simu_contig_length
+    pd_df = pd_df[
+        ["simuCov", "simuContigLength", "damage_model_pmax", "gc_content"]
+    ].rename(columns={"damage_model_pmax": "damage", "gc_content": "GCcontent"})
+
+    return pd_df
+
+
+def fit_model(df, model):
+    """Fit GLM model to data
+
+    Args:
+        df (pandas DataFrame): prepared pydamage results
+        model (pypmml model): GLM accuracy model
+    """
+    return model.predict(df).to_frame(name="pred_accuracy")
diff --git a/pydamage/cli.py b/pydamage/cli.py
@@ -7,49 +7,52 @@
 
 @click.command()
 @click.version_option(__version__)
-@click.argument('bam', type=click.Path(exists=True))
-@click.option('-w',
-              '--wlen',
-              default=35,
-              type=int,
-              show_default=True,
-              help='Window length for damage modeling')
-@click.option('-p',
-              '--process',
-              default=2,
-              type=int,
-              show_default=True,
-              help='Number of processes')
-@click.option('-m',
-              '--mini',
-              default=1000,
-              type=int,
-              show_default=True,
-              help='Minimum reads aligned to consider reference')
-@click.option('-c',
-              '--cov',
-              default=8,
-              type=float,
-              show_default=True,
-              help='Minimum coverage to consider reference')
-@click.option('-s',
-              '--show_al',
-              is_flag=True,
-              help='Show alignments representations')
-@click.option('-pl',
-              '--plot',
-              is_flag=True,
-              help='Make the damage plots')
-@click.option('--verbose', is_flag=True, help='Verbose mode')
-@click.option('-o',
-              '--outdir',
-              type=click.Path(writable=True, dir_okay=True),
-              default="pydamage_results",
-              show_default=True,
-              help="Output directory")
-@click.option('--force',
-              is_flag=True,
-              help='Force overwriting of results directory')
+@click.argument("bam", type=click.Path(exists=True))
+@click.argument("fasta", type=click.Path(exists=True))
+@click.option(
+    "-w",
+    "--wlen",
+    default=35,
+    type=int,
+    show_default=True,
+    help="Window length for damage modeling",
+)
+@click.option(
+    "-p",
+    "--process",
+    default=2,
+    type=int,
+    show_default=True,
+    help="Number of processes",
+)
+@click.option(
+    "-m",
+    "--mini",
+    default=1000,
+    type=int,
+    show_default=True,
+    help="Minimum reads aligned to consider reference",
+)
+@click.option(
+    "-c",
+    "--cov",
+    default=8,
+    type=float,
+    show_default=True,
+    help="Minimum coverage to consider reference",
+)
+@click.option("-s", "--show_al", is_flag=True, help="Show alignments representations")
+@click.option("-pl", "--plot", is_flag=True, help="Make the damage plots")
+@click.option("--verbose", is_flag=True, help="Verbose mode")
+@click.option(
+    "-o",
+    "--outdir",
+    type=click.Path(writable=True, dir_okay=True),
+    default="pydamage_results",
+    show_default=True,
+    help="Output directory",
+)
+@click.option("--force", is_flag=True, help="Force overwriting of results directory")
 def cli(no_args_is_help=True, **kwargs):
     """\b
     PyDamage: Damage parameter estimation for ancient DNA
@@ -58,6 +61,7 @@ def cli(no_args_is_help=True, **kwargs):
     Homepage & Documentation: github.com/maxibor/pydamage
 
     BAM: path to BAM/SAM/CRAM alignment file
+    FASTA: path to reference FASTA file
     """
     analyze(**kwargs)