update plots for pacbio simulated data

mehrdadbakhtiari · Jul 6, 2018 · 66a952c · 66a952c
1 parent dcad3bf
commit 66a952c
Showing 1 changed file with 75 additions and 44 deletions.
diff --git a/advntr/plot.py b/advntr/plot.py
@@ -671,6 +671,7 @@ def get_correct_estimates_for_ru(files, ru_length=None, adVNTR=False):
 def plot_pacbio_ru_length_result(results_dir='../pacbio_ru_data_for_all_vntrs/'):
     from matplotlib import rc, rcParams
     import matplotlib.pyplot as plt
+    import numpy as np
     fig = plt.figure()
     plt.style.use('ggplot')
     plt.rcParams['axes.facecolor'] = '#FFFFFF'
@@ -685,57 +686,87 @@ def plot_pacbio_ru_length_result(results_dir='../pacbio_ru_data_for_all_vntrs/')
     ax.text(-0.1, 1.1, r'\textbf{B}', transform=ax.transAxes,
           fontsize=16, fontweight='bold', va='top', ha='right')
     import glob
+    import os
     ru_dirs = glob.glob(results_dir + '*')
     points = []
     naive_points = []
+
+    def get_lengths_and_discrepancies(file_name):
+        import ast
+        with open(file_name) as infile:
+            lines = infile.readlines()
+            if len(lines) < 2:
+                return [], []
+            length = ast.literal_eval(lines[0])
+            disc = ast.literal_eval(lines[-1])
+        return length, disc
+
+    lengths = []
+    naive_lengths = []
+    discrepancies = []
+    naive_discrepancies = []
     for ru_dir in ru_dirs:
-        ru_length = int(ru_dir.split('/')[-1])
-        files = glob.glob(ru_dir + '/*.fasta.out')
-        naivefiles = glob.glob(ru_dir + '/*.fasta.out.naive')
-        corrects, error_bar = get_correct_estimates_for_ru(files, ru_length, True)
-        naive_corrects, naive_error_bar = get_correct_estimates_for_ru(naivefiles, ru_length)
-        points.append((ru_length, corrects, error_bar))
-        naive_points.append((ru_length, naive_corrects, naive_error_bar))
-    points = sorted(points)
-    ru_lengths = [x for x, y, z in points]
-    naive_points = sorted(naive_points)
-
-    ru_lengths = [int(x/10)*10 for x, y, z in points]
-    corrects = []
-    naive_corrects = []
-    for x in ru_lengths:
+        vntr_id = os.path.basename(ru_dir)
+        if not os.path.exists(ru_dir + '/advntr_result.txt') or not os.path.exists(ru_dir + '/naive_result.txt'):
+            continue
+        length, disc = get_lengths_and_discrepancies(ru_dir + '/advntr_result.txt')
+        naive_length, naive_disc = get_lengths_and_discrepancies(ru_dir + '/naive_result.txt')
+        lengths += length
+        naive_lengths += naive_length
+        discrepancies += disc
+        naive_discrepancies += naive_disc
+
+    print(len(discrepancies))
+
+    def plot_data(matplot_ax, discrepancies_list, lengths_list, naive=0):
+        data = {}
+        width = 200
+        offset = float(width) / 2
+        if naive:
+            offset *= -1
+
         total = 0
-        num = 0
-        for xx, yy, zz in points:
-            if (xx/10)*10 == x:
-                total += yy
-                num += 1
-        corrects.append(float(total)/num)
-    naive_error_bars = []
-    from scipy import stats
-    for x in ru_lengths:
-        total = 0.0
-        num = 0.0
-        observed_values = []
-        for xx, yy, zz in naive_points:
-            if (xx/10)*10 == x:
-                observed_values.append(yy)
-                total += yy
-                num += 1
-        naive_corrects.append(float(total)/num)
-        naive_error_bars.append(stats.sem(observed_values))
-
-    print(naive_error_bars)
-    error_bars = [0 for x, y, z in points]
-    # naive_error_bars = [0 for x, y, z in naive_points]
-    # corrects = [y for x, y, z in points]
-    # naive_corrects = [y for x, y, z in naive_points]
-    plt.errorbar(ru_lengths, corrects, yerr=error_bars, label='adVNTR')
-    plt.errorbar(ru_lengths, naive_corrects, yerr=naive_error_bars, ls='--', label='Naive Method')
+        total_wrongs = 0
+        for i in range(len(discrepancies_list)):
+            key = 500 * (lengths_list[i] / 500)
+            if key > 3000:
+                continue
+            if key not in data.keys():
+                data[key] = []
+            data[key].append(discrepancies_list[i])
+        for key in data.keys():
+            wrongs = sum([1 for e in data[key] if e > 0])
+            total_wrongs = wrongs
+            total += len(data[key])
+            data[key] = 100 - 100 * wrongs / float(len(data[key]))
+        print total_wrongs, total
+        print(float(total_wrongs) / total * 100)
+        label = 'Naive Method' if naive else 'adVNTR'
+        matplot_ax.bar(np.array(data.keys()) + offset, data.values(), width=width, label=label)
+
+    plot_data(ax, discrepancies, lengths, 0)
+    plot_data(ax, naive_discrepancies, naive_lengths, 1)
+    ax.set_ylim((0, 100))
+    ax.set_xlabel(r'\emph{VNTR Length}', fontsize=13)
+    ax.set_ylabel(r'\emph{Correct Estimate Percentage}', fontsize=13)
+
+    # naive_error_bars = []
+    # from scipy import stats
+    # for x in ru_lengths:
+    #     total = 0.0
+    #     num = 0.0
+    #     observed_values = []
+    #     for xx, yy, zz in naive_points:
+    #         if (xx/10)*10 == x:
+    #             observed_values.append(yy)
+    #             total += yy
+    #             num += 1
+    #     naive_corrects.append(float(total)/num)
+    #     naive_error_bars.append(stats.sem(observed_values))
 
     plt.tight_layout(pad=2, w_pad=0.5, h_pad=1)
 
-    plt.legend(loc=4, fontsize=16)
+    plt.legend(loc=3, fontsize=16)
     plt.savefig('pacbio_ru_length_results.pdf')
 
 
@@ -1249,7 +1280,7 @@ def plot_vntr_length_distribution(max_len=1000):
 # plot_read_recruitment_results()
 # plot_inconsistency_difference()
 
-# plot_pacbio_ru_length_result()
+plot_pacbio_ru_length_result('../pacbio_simulations/')
 # plot_pedigree_tree()
 # plot_lr_pcr()