Merge pull request #60 from manodeep/develop

Develop
manodeep · Jun 8, 2016 · 418ec80 · 418ec80
2 parents 6c3e8e6 + 23461cb
commit 418ec80
Show file tree

Hide file tree

Showing 47 changed files with 2,484 additions and 1,711 deletions.
diff --git a/.gitignore b/.gitignore
@@ -28,7 +28,7 @@ test_*period*
 *.tgz
 cov-int
 *.gcno
-
+*.ipynb
 *.log
 *.out*
 *.d

diff --git a/common.mk b/common.mk
@@ -14,8 +14,6 @@ MINOR:=0
 PATCHLEVEL:=0
 VERSION:=$(MAJOR).$(MINOR).$(PATCHLEVEL)
 
-
-
 DO_CHECKS := 1
 ifeq (clean,$(findstring clean,$(MAKECMDGOALS)))
   DO_CHECKS := 0
@@ -25,6 +23,10 @@ ifeq (distclean,$(findstring distclean,$(MAKECMDGOALS)))
   DO_CHECKS := 0
 endif
 
+ifeq (realclean,$(findstring realclean,$(MAKECMDGOALS)))
+  DO_CHECKS := 0
+endif
+
 ## Only set everything if the command is not "make clean"
 ifeq ($(DO_CHECKS), 1)
   ## First check make version. Versions of make older than 3.80 will crash

diff --git a/io/io.c b/io/io.c
@@ -21,8 +21,8 @@
 int64_t read_positions(const char *filename, const char *format, const size_t size, const int num_fields, ...)
 {
     int64_t np;
-    assert(num_fields >= 1 && "You have to request at least one field to read-in");
-    assert((size == 4 || size == 8) && "Size of each position element can be either 4 (float) or 8 (double)");
+    XASSERT(num_fields >= 1, "Number of fields to read-in = %d must be at least 1\n", num_fields);
+    XASSERT((size == 4 || size == 8), "Size of fields = %zu must be either 4 or 8\n", size);
 
     void *data[num_fields];
     {
@@ -83,21 +83,21 @@ int64_t read_positions(const char *filename, const char *format, const size_t si
         //so rewind by 4 bytes  prepare for calls to ftread
         my_fseek(fp, -sizeof(dummy), SEEK_CUR);
         dummy /= np;
-        assert((dummy == 4 || dummy == 8) && "File must contain either 4 byte (float) or 8 byte(double) precision");
+        XASSERT((dummy == 4 || dummy == 8), "Data-type in file = %u must be either 4 byte (float) or 8 byte(double) precision", dummy);
 
         if(dummy == size) {
             for(int i=0;i<num_fields;i++) {
                 my_ftread(data[i],size, np, fp);
             }
         } else {
 #ifndef SILENT
-            fprintf(stderr,"WARNING: File was written in a different precision than requested (file precision = %u requested precision = %zu)\n",dummy,size);
+            fprintf(stderr,ANSI_COLOR_MAGENTA"WARNING: File was written in a different precision than requested (file precision = %u requested precision = %zu)"ANSI_COLOR_RESET"\n",dummy,size);
 #endif
             //Okay so the file was written in a different precision.
             //First, print a warning message and then read-in correctly with the
             //requested precision
             if(dummy == 4) {
-                assert(size == 8 && "Expected to be storing to doubles");
+                XASSERT(size == 8, "size = %zu should have been 8 (doubles were expected)\n", size);
                 float *tmp = my_malloc(dummy,np);
                 //read-in the fields
                 for(int i=0;i<num_fields;i++) {
@@ -109,7 +109,7 @@ int64_t read_positions(const char *filename, const char *format, const size_t si
                 //free memory
                 free(tmp);
             } else {
-                assert(size == 4 && "Expected to be storing to doubles");
+                XASSERT(size == 4, "size = %zu should have been 4 (floats were expected)\n", size);
                 double *tmp = my_malloc(dummy,np);
 
                 //read-in the fields
@@ -196,7 +196,10 @@ int64_t read_positions(const char *filename, const char *format, const size_t si
     va_list ap;
     va_start(ap,num_fields);
 
-    assert(sizeof(void *) == sizeof(float *) && sizeof(void *) == sizeof(double *) && "Size of void pointer must be the same as size of float/double pointers");
+    XASSERT((sizeof(void *) == sizeof(float *) && sizeof(void *) == sizeof(double *)),
+            "Size of void pointer = %zu must be the same as size of float pointer = %zu and sizeof double pointers = %zu\n",
+            sizeof(void *), sizeof(float *), sizeof(double *));
+
     for(int i=0;i<num_fields;i++) {
         void **source = va_arg(ap, void **);
         *source =  data[i];

diff --git a/paper/get_speedups.py b/paper/get_speedups.py
@@ -0,0 +1,204 @@
+from __future__ import print_function, division
+import numpy as np
+import matplotlib
+import matplotlib.pyplot as plt
+import matplotlib.colors as mcolors
+import matplotlib.cm as cm
+try:
+    import pandas as pd
+except ImportError:
+    pd = None
+
+
+def read_file(filename):
+    """Read a whitespace-delimited timings file.
+
+    Every row is parsed as four columns, in this order: ``same_cell``,
+    ``N1``, ``N2`` and ``time``.
+
+    Parameters
+    ----------
+    filename : str
+        Path of the timings file to read.
+
+    Returns
+    -------
+    timings : pandas.DataFrame or numpy.ndarray
+        A ``DataFrame`` when pandas could be imported, otherwise a NumPy
+        structured array. Either one can be indexed by the column names.
+    """
+    # ``np.int`` / ``np.float`` were plain aliases of the builtins and are
+    # gone from recent NumPy releases; the builtins yield the same dtypes.
+    columns = [('same_cell', np.int32),
+               ('N1', int),
+               ('N2', int),
+               ('time', float)]
+    if pd is not None:
+        # ``sep=r'\s+'`` is the non-deprecated spelling of
+        # ``delim_whitespace=True`` and is still handled by the C parser.
+        timings = pd.read_csv(filename, header=None,
+                              engine="c",
+                              dtype=dict(columns),
+                              index_col=None,
+                              names=[name for name, _ in columns],
+                              sep=r'\s+')
+    else:
+        timings = np.loadtxt(filename, dtype=np.dtype(columns))
+    return timings
+
+
+class nf(float):
+    """Float whose ``repr`` shows one decimal, or none if it would be 0."""
+
+    def __repr__(self):
+        value = self.__float__()
+        one_decimal = '%.1f' % value
+        # A trailing zero means the single decimal carries no information,
+        # so fall back to the integer-looking form.
+        if one_decimal.endswith('0'):
+            return '%.0f' % value
+        return one_decimal
+
+def main():
+    """Plot the per-cell speedup of each vectorized kernel over the naive one.
+
+    Reads ``timings_naive``, ``timings_sse`` and ``timings_avx`` from
+    ``../xi_theory/wp/`` and uses the first file as the baseline. For each
+    of the other files it prints the min/max speedup and the total time
+    spent in cells that got slower and in cells that got faster, then
+    saves a figure as ``wp_Speedup_<legend>.png`` and ``.pdf`` in the
+    current directory.
+    """
+    base_dir = '../xi_theory/wp/'
+    base_string = 'wp'
+    files = ['timings_naive', 'timings_sse', 'timings_avx']
+    files = [base_dir + f for f in files]
+    legend = ['Naive', 'SSE4.2', 'AVX']
+    all_timings = [read_file(filename) for filename in files]
+
+    # Work on plain ndarrays so that integer indexing below is positional
+    # whether read_file returned a DataFrame or a structured array.
+    baseline = all_timings[0]
+    base_timing = np.asarray(baseline['time'])
+    N1_parts = np.asarray(baseline['N1'])
+    N2_parts = np.asarray(baseline['N2'])
+    gridsize = 40
+    cb_range = [0.0, 5.0]
+    contour_nlevels = 4
+    xlimits = [0, 1000]
+    ylimits = xlimits
+    xlabel = 'Number of points in a cell'
+    ylabel = xlabel
+
+    # Split the 256 colormap entries at a speedup of 1.0: the part of the
+    # colorbar range below 1.0 uses OrRd, the part above uses viridis.
+    cb_diff = (cb_range[1] - cb_range[0])
+    positive_Ncolors = int((cb_range[1] - 1.0) / cb_diff * 256)
+    negative_Ncolors = 256 - positive_Ncolors
+    colors1 = cm.OrRd(np.linspace(0.0, 1.0, negative_Ncolors))
+    colors2 = cm.viridis(np.linspace(0.0, 1.0, positive_Ncolors))
+    # combine them and build a new colormap
+    colors = np.vstack((colors1, colors2))
+    mycmap = mcolors.LinearSegmentedColormap.from_list('my_colormap', colors)
+    matplotlib.style.use('default')
+    # Label levels with specially formatted floats
+    if plt.rcParams["text.usetex"]:
+        cntr_fmt = r'%r\%%'
+    else:
+        cntr_fmt = '%r%%'
+
+    # Index 0 is the baseline itself, so start from the second file.
+    # (``range`` instead of ``xrange`` keeps this runnable on Python 3.)
+    for i in range(1, len(files)):
+        this_timing = np.asarray((all_timings[i])['time'])
+        # Only cells with a positive time in both runs give a valid ratio.
+        ind = (np.where((this_timing > 0.0) & (base_timing > 0.0)))[0]
+        speedup = base_timing[ind] / this_timing[ind]
+        print("Min speedup = {0}. Max = {1}".format(
+            min(speedup), max(speedup)))
+        bad = (np.where(speedup <= 1.0))[0]
+        bad_timings_base = np.sum(base_timing[ind[bad]])
+        bad_timings = np.sum(this_timing[ind[bad]])
+        print("Cells with slowdown  {3}({4:4.3f}%): Base takes - {0:8.3f} sec "
+              "while {1} takes {2:8.3f} seconds".format(
+                  bad_timings_base,
+                  legend[i],
+                  bad_timings,
+                  len(bad),
+                  100.0 * len(bad) / len(ind)))
+
+        good = (np.where(speedup > 1.0))[0]
+        good_timings_base = np.sum(base_timing[ind[good]])
+        good_timings = np.sum(this_timing[ind[good]])
+        print("Cells with speedup {3}({4:4.3f}%): Base takes - {0:8.3f} sec "
+              "while {1} takes {2:8.3f} seconds".format(
+                  good_timings_base,
+                  legend[i],
+                  good_timings,
+                  len(good),
+                  100.0 * len(good) / len(ind)))
+
+        fig = plt.figure(1, figsize=(8, 8))
+        figsize = 0.6
+        left = 0.1
+        bottom = 0.1
+        top_aspect = 0.15
+
+        # Marginal histogram above the main panel.
+        hist_area = [left, bottom + figsize, figsize, figsize * top_aspect]
+        axhist = plt.axes(hist_area)
+        axhist.autoscale(enable=True, axis="y")
+        axhist.set_xlim(xlimits)
+        plt.setp(axhist.get_xticklabels(), visible=False)
+        axhist.axis('off')
+        axhist.hist(N1_parts[ind], gridsize, range=xlimits,
+                    color='0.5')
+
+        # Time-weighted marginal histogram to the right of the main panel.
+        # NOTE(review): this bins N1 although it sits on the y (N2) side of
+        # the image -- confirm whether N2_parts was intended.
+        hist_time_area = [left + figsize, bottom, figsize*top_aspect, figsize]
+        ax_time = plt.axes(hist_time_area)
+        ax_time.autoscale(enable=True, axis="x")
+        ax_time.set_ylim(ylimits)
+        plt.setp(ax_time.get_yticklabels(), visible=False)
+        plt.setp(ax_time.get_xticklabels(), visible=False)
+        ax_time.axis('off')
+        ax_time.hist(N1_parts[ind], gridsize, weights=this_timing[ind],
+                     range=xlimits, orientation="horizontal",
+                     color='0.5')
+
+        im_area = [left, bottom, figsize, figsize]
+        ax = plt.axes(im_area)
+        ax.set_autoscale_on(False)
+        ax.set_xlim(xlimits)
+        ax.set_ylim(ylimits)
+        ax.set_xlabel(xlabel)
+        ax.set_ylabel(ylabel)
+        xedges = np.linspace(xlimits[0], xlimits[1], gridsize)
+        yedges = np.linspace(ylimits[0], ylimits[1], gridsize)
+        # The removed ``normed`` keyword is not passed; unnormalized
+        # output is the default.
+        cell_time, xedges, yedges = np.histogram2d(
+            N1_parts, N2_parts, (xedges, yedges),
+            weights=base_timing)
+
+        # Express the baseline time per (N1, N2) bin as a percentage, then
+        # give each bin the cumulative percentage of all bins that are no
+        # more expensive than it.
+        cell_time /= np.sum(cell_time)
+        cell_time *= 100.0
+        cell_time_1d = cell_time.flatten()
+        sorted_ind = np.argsort(cell_time_1d)
+        cum_sorted_time = np.cumsum(cell_time_1d[sorted_ind])
+        correct_order_cum_time = np.empty_like(cum_sorted_time)
+        for kk, ct in zip(sorted_ind, cum_sorted_time):
+            correct_order_cum_time[kk] = ct
+
+        correct_order_cum_time = correct_order_cum_time.reshape(
+            cell_time.shape)
+        # NOTE(review): histogram2d puts x on the first axis while contour
+        # reads Z as [row=y, col=x]; confirm whether a transpose is needed.
+        extent = [yedges[0], yedges[-1], xedges[0], xedges[-1]]
+        xarr, yarr = np.meshgrid(xedges[0:-1], yedges[0:-1])
+        contours = ax.contour(xarr, yarr,
+                              correct_order_cum_time, contour_nlevels,
+                              linewidths=3.0,
+                              extent=extent,
+                              cmap=cm.Greys)
+
+        # Recast levels to new class
+        # Reverse the levels to show that the contours represent
+        # enclosed fraction of time spent
+        contours.levels = [nf(val) for val in contours.levels[::-1]]
+        ax.clabel(contours, contours.levels, fmt=cntr_fmt,
+                  inline=True, fontsize=10)
+
+        # Now plot the image for the speedup.
+        # ``speedup`` is already restricted to ``ind``; indexing it with
+        # ``ind`` a second time would pick the wrong elements.
+        im = ax.hexbin(N1_parts[ind], N2_parts[ind], C=speedup,
+                       vmin=cb_range[0], vmax=cb_range[1],
+                       cmap=mycmap, gridsize=gridsize)
+        plt.figtext(left + figsize - 0.03, bottom + figsize - 0.05,
+                    '{0}'.format(legend[i]), fontsize=16, ha='right')
+        cbar_offset = 0.08
+        cbar_width = 0.03
+        cbar_ax = fig.add_axes([left + figsize + figsize*top_aspect +
+                                cbar_offset, bottom,
+                                cbar_width, figsize])
+        # np.linspace needs an integer sample count.
+        cb = fig.colorbar(im, extend='both', format="%.1f",
+                          ticks=np.linspace(cb_range[0], cb_range[1],
+                                            int(cb_diff) + 1),
+                          cax=cbar_ax)
+        cb.set_label('Speedup rel. to non-vectorized code')
+        plt.savefig('{1}_Speedup_{0}.png'.format(legend[i], base_string),
+                    dpi=400)
+        plt.savefig('{1}_Speedup_{0}.pdf'.format(legend[i], base_string),
+                    dpi=400)
+        # Figure number 1 is reused on every iteration, so wipe it fully.
+        fig.clear()
+        ax.clear()
+        axhist.clear()
+        ax_time.clear()
+        plt.close(fig)
+
+# Generate the speedup figures only when run as a script, not on import.
+if __name__ == '__main__':
+    main()
diff --git a/paper/wp_Speedup_AVX.pdf b/paper/wp_Speedup_AVX.pdf
diff --git a/paper/wp_Speedup_AVX.png b/paper/wp_Speedup_AVX.png
diff --git a/paper/wp_Speedup_SSE4.2.pdf b/paper/wp_Speedup_SSE4.2.pdf
diff --git a/paper/wp_Speedup_SSE4.2.png b/paper/wp_Speedup_SSE4.2.png
diff --git a/setup.py b/setup.py
@@ -11,6 +11,7 @@
 from sys import version_info
 import re
 
+# partial import
 import Corrfunc
 from Corrfunc import rd
 
@@ -26,6 +27,7 @@
 minor = re.search(r'MINOR\s*:*=\s*(\d)', common).group(1)
 patch = re.search(r'PATCHLEVEL\s*:*=\s*(\d)', common).group(1)
 version = "{0}.{1}.{2}".format(major, minor, patch)
+
 # Check that version matches
 if Corrfunc.__version__ != version:
     msg = "ERROR: Version mis-match. Python version found = {0} \

diff --git a/utils/avx_calls.h b/utils/avx_calls.h
@@ -153,7 +153,7 @@ extern "C" {
     {
         union cos{
             AVX_FLOATS m;
-            DOUBLE x[NVEC];
+            DOUBLE x[AVX_NVEC];
         };
         union cos union_costheta;
         union cos union_returnvalue;
@@ -162,7 +162,7 @@ extern "C" {
         const DOUBLE one = (DOUBLE) 1.0;
         const DOUBLE zero = (DOUBLE) 0.0;
 
-        for(int ii=0;ii<NVEC;ii++) {
+        for(int ii=0;ii<AVX_NVEC;ii++) {
             const DOUBLE costheta = union_costheta.x[ii];
             if(costheta < minus_one) {
                 union_returnvalue.x[ii] = M_PI;