add reverse GtoA damage

maxibor · May 16, 2021 · 8612707 · 8612707
1 parent ac7463a
commit 8612707
Show file tree

Hide file tree

Showing 6 changed files with 271 additions and 178 deletions.
diff --git a/pydamage/cli.py b/pydamage/cli.py
@@ -7,11 +7,10 @@
 from pydamage import __version__
 from collections import OrderedDict
 
-class NaturalOrderGroup(click.Group):
 
+class NaturalOrderGroup(click.Group):
     def __init__(self, name=None, commands=None, **attrs):
-        super(NaturalOrderGroup, self).__init__(
-            name=name, commands=None, **attrs)
+        super(NaturalOrderGroup, self).__init__(name=name, commands=None, **attrs)
         if commands is None:
             commands = OrderedDict()
         elif not isinstance(commands, OrderedDict):
@@ -21,6 +20,7 @@ def __init__(self, name=None, commands=None, **attrs):
     def list_commands(self, ctx):
         return self.commands.keys()
 
+
 @click.group(cls=NaturalOrderGroup)
 @click.version_option(__version__)
 @click.pass_context
@@ -74,6 +74,12 @@ def cli(ctx, outdir):
 @click.option(
     "-f", "--force", is_flag=True, help="Force overwriting of results directory"
 )
+@click.option(
+    "-r",
+    "--count_reverse",
+    is_flag=True,
+    help="Also count GtoA mutations in reverse reads",
+)
 @click.option(
     "-g",
     "--group",
@@ -103,11 +109,12 @@ def filter(ctx, no_args_is_help=True, **kwargs):
 
     apply_filter(**kwargs, **ctx.obj)
 
+
 @cli.command()
 def cite():
-    """Get pydamage citation in bibtex format
-    """
+    """Get pydamage citation in bibtex format"""
     get_citation()
 
+
 if __name__ == "__main__":
     cli()
diff --git a/pydamage/damage.py b/pydamage/damage.py
@@ -8,61 +8,142 @@
 
 
 class al_to_damage:
-    def __init__(self, reference, al_handle):
+    def __init__(self, reference, al_handle, wlen):
         """Constructor of the class
 
         Args:
             reference (string): a reference from the indexed bam file
             al_handle(pysam.AlignmentFile)
+            wlen (int): window length
+
         """
         self.alignments = al_handle.fetch(reference)
         self.reference = reference
+        self.wlen = wlen
         # self.alignments = al_file
 
     # def __repr__(self):
     #     return(f"Reference {reference}")
 
-    def get_damage(self, wlen, show_al):
+    def get_damage(self, count_reverse, show_al):
         """Compute CtoT substitutions
 
         Args:
-            wlen (int): window length
+            count_reverse(bool): count reverse alignments as well
             show_al(bool): print alignments representations
         Returns:
-            list: all_ct - positions of CtoT transitions in reads
-            list: all_ga - positions of GtoA transitions in reads
-            list: all_cc - positions of C in ref and reads, in reads
-            list: all_c - positions of C in reads
-            list: all_g - positions of G in reads
-            list: all_bases - position of all bases
+            list: C - positions from 5' of C in reads
+            list: CT- positions from 5' of CtoT transitions in reads
+            list: G - positions from 3' of G in reverse reads
+            list: GA- positions from 3' of GtoA transitions in reverse reads
+            list: damage_bases - positions of C2T and G2A bases (if reversed)
+            list: C_G_bases - positions of C2C and G2G bases (if reversed)
         """
-
-        all_ct = []
-        all_ga = []
-        all_cc = []
-        all_c = []
-        all_g = []
-        all_bases = []
+        self.C = []
+        self.CT = []
+        self.G = []
+        self.GA = []
+        self.damage_bases = []
+        self.C_G_bases = []
+        self.no_mut = []
         for al in self.alignments:
-            if al.is_reverse is False and al.is_unmapped is False:
-
+            if al.is_unmapped is False:
                 all_damage = damage_al(
                     reference=al.get_reference_sequence(),
                     read_name=al.query_name,
+                    is_reverse=al.is_reverse,
+                    count_reverse=count_reverse,
                     ref_name=self.reference,
                     query=al.query_sequence,
                     cigartuple=al.cigartuples,
-                    wlen=wlen,
+                    wlen=self.wlen,
                     show_al=show_al,
                 )
-                all_ct += all_damage["CT"]
-                all_ga += all_damage["GA"]
-                all_cc += all_damage["CC"]
-                all_c += all_damage["C"]
-                all_g += all_damage["G"]
-                all_bases += all_damage["all"]
+                self.C += all_damage["C"]
+                self.CT += all_damage["CT"]
+                self.G += all_damage["G"]
+                self.GA += all_damage["GA"]
+                self.damage_bases += all_damage["CT"] + all_damage["GA"]
+                self.C_G_bases += all_damage["C"] + all_damage["G"]
+                self.no_mut += all_damage["no_mut"]
+
+    def compute_damage(self):
+        """Computes the amount of damage for statistical modelling"""
+
+        # All C in reference
+        C_pos, C_counts = np.unique(
+            np.sort(
+                self.C,
+            ),
+            return_counts=True,
+        )
+        C_dict = dict(zip(C_pos, C_counts))
 
-        return (all_ct, all_ga, all_cc, all_c, all_g, all_bases)
+        # All G in reference
+        G_pos, G_counts = np.unique(np.sort(self.G), return_counts=True)
+        G_dict = dict(zip(G_pos, G_counts))
+
+        # CtoT transitions
+        CT_pos, CT_counts = np.unique(np.sort(self.CT), return_counts=True)
+        CT_dict = dict(zip(CT_pos, CT_counts))
+
+        # G to A transitions
+        GA_pos, GA_counts = np.unique(np.sort(self.GA), return_counts=True)
+        GA_dict = dict(zip(GA_pos, GA_counts))
+
+        # All transitions
+        damage_bases_pos, damage_bases_counts = np.unique(
+            np.sort(self.damage_bases), return_counts=True
+        )
+        damage_bases_dict = dict(zip(damage_bases_pos, damage_bases_counts))
+
+        # All C and G in reference
+        C_G_bases_pos, C_G_bases_counts = np.unique(
+            np.sort(self.C_G_bases), return_counts=True
+        )
+        C_G_bases_dict = dict(zip(C_G_bases_pos, C_G_bases_counts))
+
+        # All conserved C and G
+        no_mut_pos, no_mut_counts = np.unique(np.sort(self.no_mut), return_counts=True)
+        no_mut_dict = dict(zip(no_mut_pos, no_mut_counts))
+
+        CT_damage_amount = []
+        GA_damage_amount = []
+        damage_amount = []
+
+        for i in range(self.wlen):
+            if i not in CT_dict:
+                CT_dict[i] = 0
+            if i not in GA_dict:
+                GA_dict[i] = 0
+            if i not in damage_bases_dict:
+                damage_bases_dict[i] = 0
+            if i not in no_mut_dict:
+                no_mut_dict[i] = 0
+            if i not in C_dict:
+                CT_damage_amount.append(0)
+            else:
+                CT_damage_amount.append(CT_dict[i] / C_dict[i])
+            if i not in G_dict:
+                GA_damage_amount.append(0)
+            else:
+                GA_damage_amount.append(GA_dict[i] / G_dict[i])
+            if i not in C_G_bases_dict:
+                damage_amount.append(0)
+            else:
+                damage_amount.append(damage_bases_dict[i] / C_G_bases_dict[i])
+
+        return (
+            np.array(
+                list(damage_bases_dict.values())
+            ),  # Number of CtoT and GtoA per position
+            np.array(
+                list(no_mut_dict.values())
+            ),  # Number of conserved C and G per position
+            np.array(CT_damage_amount),
+            np.array(GA_damage_amount),
+            np.array(damage_amount),
+        )
 
 
 def avg_coverage(pysam_cov):
@@ -111,7 +192,7 @@ def check_model_fit(model_dict, wlen, verbose):
     return model_dict
 
 
-def test_damage(ref, bam, mode, wlen, show_al, process, verbose):
+def test_damage(ref, bam, mode, wlen, show_al, count_reverse, process, verbose):
     """Prepare data and run LRtest to test for damage
 
     Args:
@@ -120,6 +201,7 @@ def test_damage(ref, bam, mode, wlen, show_al, process, verbose):
         mode (str): opening mode of alignment file
         wlen (int): window length
         show_al (bool): Show alignment representations
+        count_reverse(bool): count reverse alignments as well
         process (int): Number of process for parallelization
         verbose (bool): Run in verbose mode
     Returns:
@@ -131,29 +213,49 @@ def test_damage(ref, bam, mode, wlen, show_al, process, verbose):
         nb_reads_aligned = al_handle.count(contig=ref)
         reflen = al_handle.get_reference_length(ref)
 
-        al = al_to_damage(reference=ref, al_handle=al_handle)
-        ct_data, ga_data, cc_data, c_data, g_data, all_bases = al.get_damage(
-            wlen=wlen, show_al=show_al
-        )
-        if ct_data:
+        al = al_to_damage(reference=ref, al_handle=al_handle, wlen=wlen)
+        al.get_damage(show_al=show_al, count_reverse=count_reverse)
+        (
+            mut_count,
+            conserved_count,
+            CT_damage,
+            GA_damage,
+            all_damage,
+        ) = al.compute_damage()
+        if all_damage:
             model_A = models.damage_model()
             model_B = models.null_model()
             test_res = fit_models(
                 ref=ref,
                 model_A=model_A,
                 model_B=model_B,
-                ct_data=ct_data,
-                cc_data=cc_data,
-                ga_data=ga_data,
-                all_bases=all_bases,
-                wlen=wlen,
+                damage=all_damage,
+                mut_count=mut_count,
+                conserved_count=conserved_count,
                 verbose=verbose,
             )
             test_res["reference"] = ref
             test_res["nb_reads_aligned"] = nb_reads_aligned
             test_res["coverage"] = cov
             test_res["reflen"] = reflen
 
+            CT_log = {}
+            GA_log = {}
+
+            for i in range(wlen):
+                CT_log[f"CtoT-{i}"] = CT_damage[i]
+                GA_log[f"GtoA-{i}"] = GA_damage[i]
+            test_res.update(CT_log)
+            test_res.update(GA_log)
+
+            # for i in range(qlen):
+            #     if i not in ydata_counts:
+            #         ydata_counts[i] = np.nan
+            #     if f"CtoT-{i}" not in ctot_out:
+            #         ctot_out[f"CtoT-{i}"] = np.nan
+            #     if f"GtoA-{i}" not in gtoa_out:
+            #         gtoa_out[f"GtoA-{i}"] = np.nan
+
             return check_model_fit(test_res, wlen, verbose)
 
     except (ValueError, RuntimeError) as e:
@@ -188,33 +290,46 @@ def get_damage_group(ref, bam, mode, wlen, show_al, process):
     reflen = al_handle.get_reference_length(ref)
 
     al = al_to_damage(reference=ref, al_handle=al_handle)
-    ct_data, ga_data, cc_data, c_data, g_data, all_bases = al.get_damage(
-        wlen=wlen, show_al=show_al
-    )
+    (
+        CT,
+        GA,
+        damage_bases,
+        forward,
+        reverse,
+    ) = al.get_damage(wlen=wlen, show_al=show_al)
 
     return [
-        ct_data,
-        ga_data,
-        cc_data,
-        c_data,
-        g_data,
-        all_bases,
+        CT,
+        GA,
+        damage_bases,
+        forward,
+        reverse,
         cov,
         nb_reads_aligned,
         reflen,
     ]
 
 
 def test_damage_group(
-    ct_data, ga_data, cc_data, all_bases, nb_reads_aligned, cov, reflen, wlen, verbose
+    ct_data,
+    ga_data,
+    damage_bases,
+    forward,
+    reverse,
+    nb_reads_aligned,
+    cov,
+    reflen,
+    wlen,
+    verbose,
 ):
     """Performs damage test
 
     Args:
-        ct_data (list of int): List of positions with CtoT transitions
-        ga_data (list of int): List of positions with GtoA transitions
-        cc_data (list of int): List of positions where C in ref and query
-        all_bases (list of int): List of positions where a base is aligned
+        ct_data (list of int): List of positions from 5' with CtoT transitions
+        ga_data (list of int): List of positions from 3' with GtoA transitions
+        damage_bases (list of int): List of positions with damage
+        forward (list of int): List of positions from 5' where a base is aligned
+        reverse(list of int): List of positions from 3' where a base is aligned
         nb_reads_aligned(int): number of reads aligned
         cov(float): average coverage across all references
         reflen(int): length of all references