Skip to content

Commit

Permalink
make flake8 happy
Browse files Browse the repository at this point in the history
  • Loading branch information
maxibor committed Nov 10, 2020
1 parent 4d4756c commit f2ff0be
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 35 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pydamage_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
if: "!contains(github.event.head_commit.message, '[skip_ci]')"
steps:
- uses: actions/checkout@v2
- uses: conda-incubator/setup-miniconda@v1
- uses: conda-incubator/setup-miniconda@v2
with:
python-version: 3.7
mamba-version: "*"
Expand Down
43 changes: 21 additions & 22 deletions docs/source/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,34 +4,33 @@ Pydamage generates both a tabular and a visual output.

The tabular output is a comma-separated file (`.csv`) with the following columns, for each analysed reference:

* `reference`: name of the reference genome/contig
* `pred_accuracy`: Predicted accuracy of Pydamage prediction, from the GLM modelling
* `null_model_p0`: parameter `p0` of the null model
* `null_model_p0_stdev`: standard error of the null model paramater `p0`
* `damage_model_p`: parameter `p` of the damage model
* `damage_model_p_stdev`: standard error of the parameter `p` of the damage model
* `damage_model_pmin`: paramater `p_min` of the damage model. *This is the modelled damage baseline*
* `damage_model_pmin_stdev`: standard error of the paramater `p_min` of the damage model
* `damage_model_pmax`: paramater `p_max` of the damage model. *This is the modelled amount of damage on the 5' end.*
* `damage_model_pmax_stdev`: standard error of the paramater `p_max` of the damage model
* `pvalue`: p-value calculated from the likelihood-ratio test-statistic using a chi-squared distribution
* `qvalue`: p-value corrected for multiple testing using Benjamini-Hochberg procedure. *Only computed when multiple references are used*
* `RMSE`: residual mean standard error of the model fit of the damage model
* `nb_reads_aligned`: number of aligned reads
* `coverage`: average coverage along the reference genome
* `CtoT-N`: Proportion of CtoT substitutions observed at position `N` from 5' end
* `GtoA-N`: Proportion of GtoA substitutions observed at position `N` from 5'
* `reference`: name of the reference genome/contig
* `pred_accuracy`: Predicted accuracy of Pydamage prediction, from the GLM modelling
* `null_model_p0`: parameter `p0` of the null model
* `null_model_p0_stdev`: standard error of the null model paramater `p0`
* `damage_model_p`: parameter `p` of the damage model
* `damage_model_p_stdev`: standard error of the parameter `p` of the damage model
* `damage_model_pmin`: paramater `p_min` of the damage model. *This is the modelled damage baseline*
* `damage_model_pmin_stdev`: standard error of the paramater `p_min` of the damage model
* `damage_model_pmax`: paramater `p_max` of the damage model. *This is the modelled amount of damage on the 5' end.*
* `damage_model_pmax_stdev`: standard error of the paramater `p_max` of the damage model
* `pvalue`: p-value calculated from the likelihood-ratio test-statistic using a chi-squared distribution
* `qvalue`: p-value corrected for multiple testing using Benjamini-Hochberg procedure. *Only computed when multiple references are used*
* `RMSE`: residual mean standard error of the model fit of the damage model
* `nb_reads_aligned`: number of aligned reads
* `coverage`: average coverage along the reference genome
* `CtoT-N`: Proportion of CtoT substitutions observed at position `N` from 5' end
* `GtoA-N`: Proportion of GtoA substitutions observed at position `N` from 5'

> To select contigs/references with damage, you will most likely want to look at two columns: `pred_accuracy > 0.9` and `qvalue <= 0.05`
The visual output are PNG files, one per reference contig. They show the frequency of observed C to T, and G to A transition at the 5' end of the sequencing data and overlay it with the fitted models for both the null and the damage model, including 95% confidence intervals. Furthermore, it provides a "residuals versus fitted" plot to help evaluate the fit of the pydamage damage model. Finally, the plot contains informtion on the average coverage along the reference and the p-value calculated from the likelihood-ratio test-statistic using a chi-squared distribution.

> The visual output is only produced when using the `--plot` flag
> The visual output is only produced when using the `--plot` flag
## Example

- [**Tabular ouput**](https://raw.githubusercontent.com/maxibor/pydamage/master/docs/assets/pydamage_results.csv)
- **Visual output**
* [**Tabular ouput**](https://raw.githubusercontent.com/maxibor/pydamage/master/docs/assets/pydamage_results.csv)
* **Visual output**

![](../img/NZ_JHCB02000011.1.png)
![pydamage_plot](../img/NZ_JHCB02000011.1.png)
28 changes: 20 additions & 8 deletions pydamage/damage.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def get_damage(self, wlen, show_al):
all_g = []
all_bases = []
for al in self.alignments:
if al.is_reverse == False and al.is_unmapped == False:
if al.is_reverse is False and al.is_unmapped is False:
cigar = al.cigartuples
ref = al.get_reference_sequence()
quer = al.query_sequence
Expand Down Expand Up @@ -103,7 +103,8 @@ def check_model_fit(model_dict, wlen, verbose):
if np.any(np.array(model_dict["base_cov"][:wlen]) == 0):
if verbose:
print(
f"Could not reliably fit a model to {model_dict['reference']} because of too few reads aligned"
f"Could not reliably fit a model to {model_dict['reference']}"
"because of too few reads aligned"
)
return False
return model_dict
Expand Down Expand Up @@ -159,7 +160,8 @@ def test_damage(ref, bam, mode, wlen, show_al, process, verbose):
print(f"Model fitting for {ref} failed")
print(f"Model fitting error: {e}")
print(
f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov} - reflen: {reflen}\n"
f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov}"
" - reflen: {reflen}\n"
)
return False

Expand Down Expand Up @@ -203,13 +205,21 @@ def get_damage_group(ref, bam, mode, wlen, show_al, process):


def test_damage_group(
ct_data, ga_data, cc_data, all_bases, nb_reads_aligned, cov, reflen, wlen, verbose
ct_data,
ga_data,
cc_data,
all_bases,
nb_reads_aligned,
cov,
reflen,
wlen,
verbose
):
"""Performs damage test
Args:
ct_data (list of int): List of positions where CtoT transitions were observed
ga_data (list of int): List of positions where GtoA transitions were observed
ct_data (list of int): List of positions with CtoT transitions
ga_data (list of int): List of positions with GtoA transitions
cc_data (list of int): List of positions where C in ref and query
all_bases (list of int): List of positions where a base is aligned
nb_reads_aligned(int): number of reads aligned
Expand All @@ -220,6 +230,7 @@ def test_damage_group(
Returns:
dict: Dictionary containing LR test results
"""
ref = "reference"
try:
if ct_data:
model_A = models.damage_model()
Expand All @@ -235,7 +246,7 @@ def test_damage_group(
wlen=wlen,
verbose=verbose,
)
test_res["reference"] = "reference"
test_res["reference"] = ref
test_res["nb_reads_aligned"] = nb_reads_aligned
test_res["coverage"] = cov
test_res["reflen"] = reflen
Expand All @@ -247,6 +258,7 @@ def test_damage_group(
print(f"Model fitting for {ref} failed")
print(f"Model fitting error: {e}")
print(
f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov} - reflen: {reflen}\n"
f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov} "
"- reflen: {reflen}\n"
)
return False
22 changes: 18 additions & 4 deletions pydamage/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from pydamage import damage
from pydamage.plot import damageplot
from pydamage.accuracy_model import prepare_data, load_model, fit_model
import pandas as pd
import sys
from tqdm import tqdm
import warnings
Expand All @@ -26,10 +25,24 @@ def analyze(
):

if group:
analyze_group(bam, wlen, show_al, process, outdir, plot, verbose, force)
analyze_group(bam,
wlen,
show_al,
process,
outdir,
plot,
verbose,
force)

else:
analyze_multi(bam, wlen, show_al, process, outdir, plot, verbose, force)
analyze_multi(bam,
wlen,
show_al,
process,
outdir,
plot,
verbose,
force)


def analyze_multi(
Expand Down Expand Up @@ -190,7 +203,8 @@ def analyze_group(
)
print("Estimating and testing Damage")
with multiprocessing.Pool(proc) as p:
res = list(tqdm(p.imap(get_damage_group_partial, refs), total=len(refs)))
res = list(
tqdm(p.imap(get_damage_group_partial, refs), total=len(refs)))
ct_data = []
ga_data = []
cc_data = []
Expand Down

0 comments on commit f2ff0be

Please sign in to comment.