Skip to content

Commit

Permalink
Merge pull request #60 from manodeep/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
manodeep committed Jun 8, 2016
2 parents 6c3e8e6 + 23461cb commit 418ec80
Show file tree
Hide file tree
Showing 47 changed files with 2,484 additions and 1,711 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ test_*period*
*.tgz
cov-int
*.gcno

*.ipynb
*.log
*.out*
*.d
Expand Down
6 changes: 4 additions & 2 deletions common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ MINOR:=0
PATCHLEVEL:=0
VERSION:=$(MAJOR).$(MINOR).$(PATCHLEVEL)



DO_CHECKS := 1
ifeq (clean,$(findstring clean,$(MAKECMDGOALS)))
DO_CHECKS := 0
Expand All @@ -25,6 +23,10 @@ ifeq (distclean,$(findstring distclean,$(MAKECMDGOALS)))
DO_CHECKS := 0
endif

ifeq (realclean,$(findstring realclean,$(MAKECMDGOALS)))
DO_CHECKS := 0
endif

## Only set everything if the command is not "make clean"
ifeq ($(DO_CHECKS), 1)
## First check make version. Versions of make older than 3.80 will crash
Expand Down
17 changes: 10 additions & 7 deletions io/io.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
int64_t read_positions(const char *filename, const char *format, const size_t size, const int num_fields, ...)
{
int64_t np;
assert(num_fields >= 1 && "You have to request at least one field to read-in");
assert((size == 4 || size == 8) && "Size of each position element can be either 4 (float) or 8 (double)");
XASSERT(num_fields >= 1, "Number of fields to read-in = %d must be at least 1\n", num_fields);
XASSERT((size == 4 || size == 8), "Size of fields = %zu must be either 4 or 8\n", size);

void *data[num_fields];
{
Expand Down Expand Up @@ -83,21 +83,21 @@ int64_t read_positions(const char *filename, const char *format, const size_t si
//so rewind by 4 bytes prepare for calls to ftread
my_fseek(fp, -sizeof(dummy), SEEK_CUR);
dummy /= np;
assert((dummy == 4 || dummy == 8) && "File must contain either 4 byte (float) or 8 byte(double) precision");
XASSERT((dummy == 4 || dummy == 8), "Data-type in file = %u must be either 4 byte (float) or 8 byte(double) precision", dummy);

if(dummy == size) {
for(int i=0;i<num_fields;i++) {
my_ftread(data[i],size, np, fp);
}
} else {
#ifndef SILENT
fprintf(stderr,"WARNING: File was written in a different precision than requested (file precision = %u requested precision = %zu)\n",dummy,size);
fprintf(stderr,ANSI_COLOR_MAGENTA"WARNING: File was written in a different precision than requested (file precision = %u requested precision = %zu)"ANSI_COLOR_RESET"\n",dummy,size);
#endif
//Okay so the file was written in a different precision.
//First, print a warning message and then read-in correctly with the
//requested precision
if(dummy == 4) {
assert(size == 8 && "Expected to be storing to doubles");
XASSERT(size == 8, "size = %zu should have been 8 (doubles were expected)\n", size);
float *tmp = my_malloc(dummy,np);
//read-in the fields
for(int i=0;i<num_fields;i++) {
Expand All @@ -109,7 +109,7 @@ int64_t read_positions(const char *filename, const char *format, const size_t si
//free memory
free(tmp);
} else {
assert(size == 4 && "Expected to be storing to doubles");
XASSERT(size == 4, "size = %zu should have been 4 (floats were expected)\n", size);
double *tmp = my_malloc(dummy,np);

//read-in the fields
Expand Down Expand Up @@ -196,7 +196,10 @@ int64_t read_positions(const char *filename, const char *format, const size_t si
va_list ap;
va_start(ap,num_fields);

assert(sizeof(void *) == sizeof(float *) && sizeof(void *) == sizeof(double *) && "Size of void pointer must be the same as size of float/double pointers");
XASSERT((sizeof(void *) == sizeof(float *) && sizeof(void *) == sizeof(double *)),
"Size of void pointer = %zu must be the same as size of float pointer = %zu and sizeof double pointers = %zu\n",
sizeof(void *), sizeof(float *), sizeof(double *));

for(int i=0;i<num_fields;i++) {
void **source = va_arg(ap, void **);
*source = data[i];
Expand Down
204 changes: 204 additions & 0 deletions paper/get_speedups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
from __future__ import print_function, division
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.cm as cm
try:
import pandas as pd
except ImportError:
pd = None


def read_file(filename):
    """Read a whitespace-delimited timings file with four columns.

    The columns are read, in order, as ``same_cell`` (32-bit int),
    ``N1`` (int), ``N2`` (int) and ``time`` (float). The file is expected
    to have no header line.

    Parameters
    ----------
    filename : str
        Path of the timings file to read.

    Returns
    -------
    timings : pandas.DataFrame or numpy.ndarray
        A DataFrame when pandas could be imported, otherwise a NumPy
        structured array. Both can be indexed by the column names above.
    """
    # ``np.int`` and ``np.float`` were plain aliases of the builtins and were
    # removed in NumPy 1.24; using the builtins keeps the same dtypes.
    if pd is not None:
        # ``sep=r'\s+'`` is the documented equivalent of the deprecated
        # ``delim_whitespace=True`` and is still handled by the C engine.
        timings = pd.read_csv(filename, header=None,
                              engine="c",
                              dtype={'same_cell': np.int32,
                                     'N1': int,
                                     'N2': int,
                                     'time': float},
                              index_col=None,
                              names=['same_cell', 'N1', 'N2', 'time'],
                              sep=r'\s+')
    else:
        dtype = np.dtype([('same_cell', np.int32),
                          ('N1', int),
                          ('N2', int),
                          ('time', float)
                          ])
        timings = np.loadtxt(filename, dtype=dtype)
    return timings


class nf(float):
    """Float whose ``repr`` drops a trailing ``.0``.

    The value is formatted with one decimal place; if that decimal digit
    is ``0`` the value is shown with no decimals instead (``nf(1.0)`` ->
    ``1``, ``nf(2.5)`` -> ``2.5``). Used with ``%r`` format strings to
    label contour levels.
    """

    def __repr__(self):
        value = float(self)
        one_decimal = '%.1f' % (value,)
        if one_decimal.endswith('0'):
            return '%.0f' % value
        return one_decimal


def main():
    """Plot the speedup of each vectorized kernel relative to the naive one.

    Reads ``timings_naive``, ``timings_sse`` and ``timings_avx`` from
    ``../xi_theory/wp/`` with ``read_file``. The first file is the baseline.
    For every other file this prints the min/max speedup and the total time
    spent in rows that slowed down / sped up, then saves a figure as
    ``wp_Speedup_<legend>.png`` and ``wp_Speedup_<legend>.pdf`` in the
    current directory.

    Each figure has a hexbin map of speedup versus (N1, N2), contours of the
    cumulative share of baseline run time, two marginal histograms and a
    colorbar.
    """
    base_dir = '../xi_theory/wp/'
    base_string = 'wp'
    files = ['timings_naive', 'timings_sse', 'timings_avx']
    files = [base_dir + f for f in files]
    legend = ['Naive', 'SSE4.2', 'AVX']
    numfiles = len(files)
    all_timings = [read_file(filename) for filename in files]

    # Convert to plain ndarrays so that every index operation below is
    # positional, whether read_file returned a DataFrame or a structured
    # array.
    base_timing = np.asarray((all_timings[0])['time'])
    N1_parts = np.asarray((all_timings[0])['N1'])
    N2_parts = np.asarray((all_timings[0])['N2'])
    gridsize = 40
    cb_range = [0.0, 5.0]
    contour_nlevels = 4
    xlimits = [0, 1000]
    ylimits = xlimits
    xlabel = 'Number of points in a cell'
    ylabel = xlabel

    # Build a 256-entry colormap whose colour family switches at a speedup
    # of 1.0: OrRd below (slowdown), viridis above (speedup).
    cb_diff = (cb_range[1] - cb_range[0])
    positive_Ncolors = int((cb_range[1] - 1.0) / cb_diff * 256)
    negative_Ncolors = 256 - positive_Ncolors
    colors1 = cm.OrRd(np.linspace(0.0, 1.0, negative_Ncolors))
    colors2 = cm.viridis(np.linspace(0.0, 1.0, positive_Ncolors))
    # combine them and build a new colormap
    colors = np.vstack((colors1, colors2))
    mycmap = mcolors.LinearSegmentedColormap.from_list('my_colormap', colors)
    matplotlib.style.use('default')
    # Label levels with specially formatted floats
    if plt.rcParams["text.usetex"]:
        cntr_fmt = r'%r\%%'
    else:
        cntr_fmt = '%r%%'

    # Index 0 is the baseline itself, so start from the second file.
    # (``range`` rather than ``xrange`` so this also runs on Python 3.)
    for i in range(1, numfiles):
        this_timing = np.asarray((all_timings[i])['time'])
        # Keep only rows where both timings are positive so the ratio is
        # well defined. ``speedup`` is aligned with ``ind``, not with the
        # full arrays.
        ind = (np.where((this_timing > 0.0) & (base_timing > 0.0)))[0]
        speedup = base_timing[ind] / this_timing[ind]
        print("Min speedup = {0}. Max = {1}".format(
            min(speedup), max(speedup)))
        bad = (np.where(speedup <= 1.0))[0]
        bad_timings_base = np.sum(base_timing[ind[bad]])
        bad_timings = np.sum(this_timing[ind[bad]])
        print("Cells with slowdown {3}({4:4.3f}%): Base takes - {0:8.3f} sec "
              "while {1} takes {2:8.3f} seconds".format(
                  bad_timings_base,
                  legend[i],
                  bad_timings,
                  len(bad),
                  100.0 * len(bad) / len(ind)))

        good = (np.where(speedup > 1.0))[0]
        good_timings_base = np.sum(base_timing[ind[good]])
        good_timings = np.sum(this_timing[ind[good]])
        print("Cells with speedup {3}({4:4.3f}%): Base takes - {0:8.3f} sec "
              "while {1} takes {2:8.3f} seconds".format(
                  good_timings_base,
                  legend[i],
                  good_timings,
                  len(good),
                  100.0 * len(good) / len(ind)))

        fig = plt.figure(1, figsize=(8, 8))
        # Layout in figure-fraction units: a square main panel with a thin
        # histogram strip above it and another to its right.
        figsize = 0.6
        left = 0.1
        bottom = 0.1
        top_aspect = 0.15

        # Top strip: histogram of N1 for the rows kept in ``ind``.
        hist_area = [left, bottom + figsize, figsize, figsize * top_aspect]
        axhist = plt.axes(hist_area)
        axhist.autoscale(enable=True, axis="y")
        axhist.set_xlim(xlimits)
        plt.setp(axhist.get_xticklabels(), visible=False)
        axhist.axis('off')
        axhist.hist(N1_parts[ind], gridsize, range=xlimits,
                    color='0.5')

        # Right strip: histogram of N1 weighted by this kernel's run time.
        # NOTE(review): this strip sits along the y (N2) axis of the main
        # panel but bins N1_parts -- confirm that is intended.
        hist_time_area = [left + figsize, bottom, figsize*top_aspect, figsize]
        ax_time = plt.axes(hist_time_area)
        ax_time.autoscale(enable=True, axis="x")
        ax_time.set_ylim(ylimits)
        plt.setp(ax_time.get_yticklabels(), visible=False)
        plt.setp(ax_time.get_xticklabels(), visible=False)
        ax_time.axis('off')
        ax_time.hist(N1_parts[ind], gridsize, weights=this_timing[ind],
                     range=xlimits, orientation="horizontal",
                     color='0.5')

        # Main panel.
        im_area = [left, bottom, figsize, figsize]
        ax = plt.axes(im_area)
        ax.set_autoscale_on(False)
        ax.set_xlim(xlimits)
        ax.set_ylim(ylimits)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        xedges = np.linspace(xlimits[0], xlimits[1], gridsize)
        yedges = np.linspace(ylimits[0], ylimits[1], gridsize)
        # The removed ``normed`` keyword is not passed; un-normalized
        # weighted sums are the default.
        cell_time, xedges, yedges = np.histogram2d(
            N1_parts, N2_parts, (xedges, yedges),
            weights=base_timing)

        # Convert to percent of total baseline time, then give every bin
        # the cumulative percentage of all bins that are at most as
        # expensive as it is.
        cell_time /= np.sum(cell_time)
        cell_time *= 100.0
        cell_time_1d = cell_time.flatten()
        sorted_ind = np.argsort(cell_time_1d)
        cum_sorted_time = np.cumsum(cell_time_1d[sorted_ind])
        correct_order_cum_time = np.empty_like(cum_sorted_time)
        # ``sorted_ind`` is a permutation, so this scatters each cumulative
        # value back to the bin it belongs to.
        correct_order_cum_time[sorted_ind] = cum_sorted_time

        correct_order_cum_time = correct_order_cum_time.reshape(
            cell_time.shape)
        extent = [yedges[0], yedges[-1], xedges[0], xedges[-1]]
        xarr, yarr = np.meshgrid(xedges[0:-1], yedges[0:-1])
        # NOTE(review): histogram2d returns an array indexed [x, y] while
        # contour expects Z indexed [y, x]; confirm whether a transpose is
        # needed here.
        contours = ax.contour(xarr, yarr,
                              correct_order_cum_time, contour_nlevels,
                              linewidths=3.0,
                              extent=extent,
                              cmap=cm.Greys)

        # Recast levels to new class
        # Reverse the levels to show that the contours represent
        # enclosed fraction of time spent
        contours.levels = [nf(val) for val in contours.levels[::-1]]
        ax.clabel(contours, contours.levels, fmt=cntr_fmt,
                  inline=True, fontsize=10)

        # Now plot the image for the speedup. ``speedup`` already has one
        # entry per element of ``ind``, so it must not be indexed by ``ind``
        # a second time.
        im = ax.hexbin(N1_parts[ind], N2_parts[ind], C=speedup,
                       vmin=cb_range[0], vmax=cb_range[1],
                       cmap=mycmap, gridsize=gridsize)
        plt.figtext(left + figsize - 0.03, bottom + figsize - 0.05,
                    '{0}'.format(legend[i]), fontsize=16, ha='right')
        cbar_offset = 0.08
        cbar_width = 0.03
        cbar_ax = fig.add_axes([left + figsize + figsize*top_aspect +
                                cbar_offset, bottom,
                                cbar_width, figsize])
        # np.linspace needs an integer sample count: one tick per unit of
        # speedup across the colorbar range.
        cb = fig.colorbar(im, extend='both', format="%.1f",
                          ticks=np.linspace(cb_range[0], cb_range[1],
                                            int(cb_diff) + 1),
                          cax=cbar_ax)
        cb.set_label('Speedup rel. to non-vectorized code')
        plt.savefig('{1}_Speedup_{0}.png'.format(legend[i], base_string),
                    dpi=400)
        plt.savefig('{1}_Speedup_{0}.pdf'.format(legend[i], base_string),
                    dpi=400)
        # Figure number 1 is reused on every iteration, so wipe and close
        # it before the next kernel is drawn.
        fig.clear()
        ax.clear()
        axhist.clear()
        ax_time.clear()
        plt.close(fig)

# Run the plotting pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Binary file added paper/wp_Speedup_AVX.pdf
Binary file not shown.
Binary file added paper/wp_Speedup_AVX.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added paper/wp_Speedup_SSE4.2.pdf
Binary file not shown.
Binary file added paper/wp_Speedup_SSE4.2.png
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from sys import version_info
import re

# partial import
import Corrfunc
from Corrfunc import rd

Expand All @@ -26,6 +27,7 @@
minor = re.search(r'MINOR\s*:*=\s*(\d)', common).group(1)
patch = re.search(r'PATCHLEVEL\s*:*=\s*(\d)', common).group(1)
version = "{0}.{1}.{2}".format(major, minor, patch)

# Check that version matches
if Corrfunc.__version__ != version:
msg = "ERROR: Version mis-match. Python version found = {0} \
Expand Down
4 changes: 2 additions & 2 deletions utils/avx_calls.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ extern "C" {
{
union cos{
AVX_FLOATS m;
DOUBLE x[NVEC];
DOUBLE x[AVX_NVEC];
};
union cos union_costheta;
union cos union_returnvalue;
Expand All @@ -162,7 +162,7 @@ extern "C" {
const DOUBLE one = (DOUBLE) 1.0;
const DOUBLE zero = (DOUBLE) 0.0;

for(int ii=0;ii<NVEC;ii++) {
for(int ii=0;ii<AVX_NVEC;ii++) {
const DOUBLE costheta = union_costheta.x[ii];
if(costheta < minus_one) {
union_returnvalue.x[ii] = M_PI;
Expand Down

0 comments on commit 418ec80

Please sign in to comment.