better handling of too few reads to fit models

maxibor · May 7, 2020 · 509f3f6 · 509f3f6
1 parent f129313
commit 509f3f6
Show file tree

Hide file tree

Showing 4 changed files with 22 additions and 7 deletions.
diff --git a/pydamage/damage.py b/pydamage/damage.py
@@ -62,11 +62,22 @@ def avg_coverage(pysam_cov):
     cov = np.mean(cov_all_bases)
     return(cov)
 
+def check_model_fit(model_dict, wlen):
+    # Check that no fitted parameters or stdev are infinite
+    if np.inf in model_dict.values() or -np.inf in model_dict.values():
+        return(False)
+
+    # Check that references have at least reads aligned on the first n wlen bases
+    if not all(i in model_dict.keys() for i in list(range(wlen))):
+        return(False)
+    return(model_dict)
+
 def test_damage(ref, bam, mode, wlen, show_al, min_al, min_cov, process, verbose):
     al_handle = pysam.AlignmentFile(bam, mode=mode, threads=process)
     try:
         cov = avg_coverage(al_handle.count_coverage(contig=ref))
         nb_reads_aligned = al_handle.count(contig=ref)
+        reflen = al_handle.get_reference_length(ref)
 
         if nb_reads_aligned >= min_al or cov >= min_cov:
             al = al_to_damage(reference=ref, al_handle=al_handle)
@@ -84,9 +95,11 @@ def test_damage(ref, bam, mode, wlen, show_al, min_al, min_cov, process, verbose
                 test_res['reference'] = ref
                 test_res['nb_reads_aligned'] = nb_reads_aligned
                 test_res['coverage'] = cov
-                return(test_res)
+                return(check_model_fit(test_res, wlen))
         else:
             pass
-    except ValueError:
-        print(f"Could not fit a model for {ref} because of too few reads aligned ({nb_reads_aligned})")
-        pass
+    except ValueError as e:
+        print(f"Could not fit a model for {ref} because of too few reads aligned")
+        print(f"Model fitting error: {e}")
+        print(f"nb_reads_aligned: {nb_reads_aligned} - coverage: {cov} - reflen: {reflen}\n")
+        return(False)
diff --git a/pydamage/main.py b/pydamage/main.py
@@ -64,7 +64,7 @@ def analyze(bam, wlen=30, show_al=False, mini=2000, cov=0.5, process=1, outdir="
     with multiprocessing.Pool(proc) as p:
         res = p.map(test_damage_partial, refs)
         filt_res = [i for i in res if i]
-    if plot:
+    if plot and len(filt_res) > 0:
         print("\nGenerating pydamage plots")
         for ref in tqdm(filt_res):
             dam_plot = damageplot(damage_dict=ref, wlen=wlen, qlen = ref['qlen'], outdir=outdir)

diff --git a/pydamage/models.py b/pydamage/models.py
@@ -6,7 +6,7 @@
 class geom_mod():
     def __init__(self):
         self.kwds = ['geom_p', 'geom_pmin', 'geom_pmax']
-        self.bounds = ((0.01, 0., 0.01), (0.99, 0.2, 0.99))
+        self.bounds = ((0.001, 0.001, 0.001), (0.99, 0.2, 0.99))
 
     def __repr__(self):
         return(
@@ -57,7 +57,7 @@ def log_pmf(self, x, geom_p, geom_pmin, geom_pmax):
 class unif_mod():
     def __init__(self):
         self.kwds = ('unif_pmin',)
-        self.bounds = ((0.,), (0.2,))
+        self.bounds = ((0.001,), (0.2,))
 
     def __repr__(self):
         return(

diff --git a/pydamage/utils.py b/pydamage/utils.py
@@ -34,6 +34,8 @@ def makedir(dirpath, confirm=True, force=False):
 
 def pandas_processing(res_dict, outdir):
     df = pd.DataFrame(res_dict)
+    if len(res_dict) == 0:
+        return(df)
     df['qvalue'] = multipletests(df['pvalue'], method='fdr_bh')[1]
     df = df[['unif_pmin', 'unif_pmin_stdev', 
              'geom_p', 'geom_p_stdev',