Skip to content

Commit

Permalink
adding kneedle and pydamage filter command
Browse files Browse the repository at this point in the history
  • Loading branch information
maxibor committed Mar 24, 2021
1 parent b8f7898 commit 0b957e6
Show file tree
Hide file tree
Showing 7 changed files with 145 additions and 34 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ pip install -e .
## Quick start

```bash
pydamage aligned.bam
pydamage analyze aligned.bam
```

## CLI help
Expand Down
4 changes: 3 additions & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,16 @@ requirements:
- python>=3.6
- click>=7.0
- numpy>=1.17.5
- pandas>=1.0.0
- pandas>=1.1.0
- patsy>=0.5.1
- pip>=20.0.2
- bioconda::pysam>=0.15.2
- scipy>=1.4.1
- statsmodels>=0.11.0
- matplotlib>=3.1.1
- tqdm>=4.45.0
- biopython>=1.78
- kneed>=0.7.0

test:
commands:
Expand Down
11 changes: 9 additions & 2 deletions docs/source/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

Pydamage generates both a tabular and a visual output.

The tabular output is a comma-separated file (`.csv`) with the following columns, for each analysed reference:
The tabular outputs are comma-separated file (`.csv`) with the following columns, for each analysed reference:

### `pydamage_results.csv`

* `reference`: name of the reference genome/contig
* `pred_accuracy`: Predicted accuracy of Pydamage prediction, from the GLM modelling
Expand All @@ -22,7 +24,12 @@ The tabular output is a comma-separated file (`.csv`) with the following columns
* `CtoT-N`: Proportion of CtoT substitutions observed at position `N` from 5' end
* `GtoA-N`: Proportion of GtoA substitutions observed at position `N` from 5'

> To select contigs/references with damage, you will most likely want to look at two columns: `pred_accuracy > 0.9` and `qvalue <= 0.05`

### `pydamage_filtered_results.csv`

Same file as above, but with contigs filtered with `qvalue <= 0.05` and `pred_accuracy >= threshold` with the filtering threshold determined with the [kneedle](https://ieeexplore.ieee.org/document/5961514) method.

### Plots

The visual output are PNG files, one per reference contig. They show the frequency of observed C to T, and G to A transition at the 5' end of the sequencing data and overlay it with the fitted models for both the null and the damage model, including 95% confidence intervals. Furthermore, it provides a "residuals versus fitted" plot to help evaluate the fit of the pydamage damage model. Finally, the plot contains informtion on the average coverage along the reference and the p-value calculated from the likelihood-ratio test-statistic using a chi-squared distribution.

Expand Down
4 changes: 2 additions & 2 deletions pydamage/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__version__ = "0.50beta"
from pydamage.main import analyze
__version__ = "0.60"
from pydamage.main import pydamage_analyze
85 changes: 58 additions & 27 deletions pydamage/cli.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,39 @@
#!/usr/bin/env python3

import click
from pydamage.main import analyze, analyze_group
from pydamage.main import pydamage_analyze
from pydamage.kneedle import apply_filter
from pydamage import __version__


@click.command()
@click.group()
@click.version_option(__version__)
@click.pass_context
@click.option(
"-o",
"--outdir",
type=click.Path(writable=True, dir_okay=True),
default="pydamage_results",
show_default=True,
help="Output directory",
)
def cli(ctx, outdir):
"""\b
PyDamage: Damage parameter estimation for ancient DNA
Author: Maxime Borry
Contact: <borry[at]shh.mpg.de>
Homepage & Documentation: github.com/maxibor/pydamage
"""

ctx.ensure_object(dict)

ctx.obj["outdir"] = outdir
pass


@cli.command()
@click.pass_context
@click.argument("bam", type=click.Path(exists=True))
@click.option(
"-w",
Expand All @@ -25,37 +52,41 @@
help="Number of processes for parallel computing",
)
@click.option(
"-o",
"--outdir",
type=click.Path(writable=True, dir_okay=True),
default="pydamage_results",
show_default=True,
help="Output directory",
"-s", "--show_al", is_flag=True, help="Display alignments representations"
)
@click.option("-s", "--show_al",
is_flag=True,
help="Display alignments representations")
@click.option("-pl", "--plot",
is_flag=True,
help="Write damage fitting plots to disk")
@click.option("-pl", "--plot", is_flag=True, help="Write damage fitting plots to disk")
@click.option("-vv", "--verbose", is_flag=True, help="Verbose mode")
@click.option("-f", "--force",
is_flag=True,
help="Force overwriting of results directory")
@click.option("-g", "--group",
is_flag=True,
help="Use entire BAM file as single reference for analyis "
"(ignore reference headers)")
def cli(no_args_is_help=True, **kwargs):
@click.option(
"-f", "--force", is_flag=True, help="Force overwriting of results directory"
)
@click.option(
"-g",
"--group",
is_flag=True,
help="Use entire BAM file as single reference for analyis "
"(ignore reference headers)",
)
def analyze(ctx, no_args_is_help=True, **kwargs):
"""\b
PyDamage: Damage parameter estimation for ancient DNA
Author: Maxime Borry
Contact: <borry[at]shh.mpg.de>
Homepage & Documentation: github.com/maxibor/pydamage
Run the PyDamage analysis
BAM: path to BAM/SAM/CRAM alignment file. MD tags need to be set.
"""
analyze(**kwargs)
pydamage_analyze(**kwargs, **ctx.obj)


@cli.command()
@click.pass_context
@click.argument("csv", type=click.Path(exists=True))
def filter(ctx, no_args_is_help=True, **kwargs):
"""\b
Filter PyDamage results with optimal pred_accuracy threshold selection
CSV: path to PyDamage result file
"""
print(kwargs, ctx.obj)

apply_filter(**kwargs, **ctx.obj)


if __name__ == "__main__":
Expand Down
70 changes: 70 additions & 0 deletions pydamage/kneedle.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from kneed import KneeLocator
from numpy import arange
from pydamage.utils import df_to_csv
from pandas import read_csv


def find_knee(pydam_df, min_knee=0.5, alpha=0.05):
"""Find kneedle point in PyDamage results
Finding the kneedle point to get the optimal
tradeoff between FP and FN, for the predicted
accurary threshold
Args:
pydam_df (pandas df): pydamage results
min_knee (float, optional): Min pred_accuracy threshold. Defaults to 0.5
alpha(float, optional): Alpha q-value threshold
"""
thresholds = [i.round(2) for i in arange(min_knee, 1, 0.01)]
nb_contigs = list()
nb_contigs = []
for i in thresholds:
nb_contigs.append(
pydam_df.query(f"pred_accuracy >= {i} & qvalue <= {alpha}").shape[0]
)
kneedle = KneeLocator(
thresholds,
nb_contigs,
S=1.0,
curve="convex",
direction="decreasing",
online=True,
)
print(thresholds)
print(nb_contigs)
return kneedle.knee


def filter_pydamage_results(pydam_df, acc_thresh, alpha=0.05):
"""Filter pydamage results on pred_accuracy and qvalue
Args:
pydam_df (pandas df): pydamage results
acc_thresh (float): predictiona accuracy threshold
alpha (float, optional): Alpha q-value threshold. Defaults to 0.05.
"""

return pydam_df.query(f"pred_accuracy >= {acc_thresh} & qvalue <= {alpha}")


def apply_filter(csv, outdir, alpha=0.05):
"""Apply pydamage filtering
Args:
csv (str): path to pydamage result file
outdir (str): Path to output directory
alpha (float, optional): Alpha q-value threshold. Defaults to 0.05.
"""

df = read_csv(csv)
outfile = "pydamage_filtered_results.csv"
knee = find_knee(df)
print(f"Optimal prediction accuracy threshold found to be: {knee}")
filt_df = filter_pydamage_results(df, acc_thresh=knee)
print(
f"Filtering PyDamage results with qvalue <={alpha} and pred_accuracy >= {knee}"
)
df_to_csv(filt_df, outdir, outfile)
print(f"Filtered PyDamage results written to {outdir}/{outfile}")
return filt_df
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ def get_version(rel_path):
"statsmodels",
"matplotlib",
"tqdm",
"biopython"
"biopython",
"kneed"
],
packages=find_packages(include=["pydamage"]),
entry_points={"console_scripts": ["pydamage = pydamage.cli:cli"]},
Expand Down

0 comments on commit 0b957e6

Please sign in to comment.