
Speed up tight loops with chunks_exact()

The chunks_exact()/chunks_exact_mut() versions end up generating much
better code because they need far fewer bounds checks inside tight loops.
Since we don't want to process half lines or partial pixels anyway, this
is a very nice win on rust 1.31+ (see the sketch below).
pedrocr committed Jan 16, 2019
1 parent 883a4c5 commit da5ed8cf5b09ccaeeb8b63e0abb1d3c9289a6521
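
As a rough illustration of the bounds-check point in the commit message (a minimal sketch, not code from this commit; the names are made up): chunks_exact_mut(n) guarantees every chunk it yields has length exactly n and silently skips any shorter remainder, so constant indexing inside the loop cannot go out of bounds and the compiler can drop the per-iteration checks that chunks_mut(n) forces it to keep for a possibly short final chunk.

fn scale_pixels(out: &mut [u16]) {
    // With chunks_mut(3) the last chunk could be shorter than 3, so each
    // pix[i] below would need a runtime bounds check (and could panic).
    // With chunks_exact_mut(3) every chunk is exactly 3 long, the checks
    // can be elided, and the trailing partial pixel is simply skipped.
    for pix in out.chunks_exact_mut(3) {
        pix[0] = pix[0].wrapping_mul(2);
        pix[1] = pix[1].wrapping_mul(2);
        pix[2] = pix[2].wrapping_mul(2);
    }
}

fn main() {
    let mut buf = vec![1u16; 10]; // three full "pixels" plus one leftover value
    scale_pixels(&mut buf);
    assert_eq!(&buf[..9], &[2u16; 9][..]);
    assert_eq!(buf[9], 1); // the leftover element is left untouched
}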
@@ -26,6 +26,7 @@ serde_derive = "1"
[build-dependencies]
glob = "0.2.11"
toml = "0.4.5"
rustc_version = "0.2.3"

[profile.release]
panic = "unwind"
@@ -7,6 +7,8 @@ extern crate glob;
use self::glob::glob;
extern crate toml;
use toml::Value;
extern crate rustc_version;
use rustc_version::{version, Version};

fn main() {
let out_dir = env::var("OUT_DIR").unwrap();
@@ -30,4 +32,9 @@ fn main() {
out.write_all(&toml.into_bytes()).unwrap();
out.write_all(b"\n").unwrap();
}

// Check for a minimum version
if version().unwrap() < Version::parse("1.31.0").unwrap() {
println!("cargo:rustc-cfg=needs_chunks_exact");
}
}
@@ -177,7 +177,7 @@ impl<'a> ArwDecoder<'a> {
let mut pump = BitPumpLSB::new(&buf[(row*width)..]);

let mut random = pump.peek_bits(16);
for out in out.chunks_mut(32) {
for out in out.chunks_exact_mut(32) {
// Process 32 pixels at a time in interleaved fashion
for j in 0..2 {
let max = pump.get_bits(11);
@@ -135,3 +135,30 @@ impl LookupTable {
pixel as u16
}
}

// For rust older than 1.31 we just alias chunks_exact() and chunks_exact_mut() to the non-exact
// versions so we can use exact everywhere without spreading special cases across the code
#[cfg(needs_chunks_exact)]
mod chunks_exact {
use std::slice;

// Add a chunks_exact for &[u8] and Vec<u16>
pub trait ChunksExact<T> {
fn chunks_exact(&self, n: usize) -> slice::Chunks<T>;
}
impl<'a, T> ChunksExact<T> for &'a [T] {
fn chunks_exact(&self, n: usize) -> slice::Chunks<T> { self.chunks(n) }
}
impl<T> ChunksExact<T> for Vec<T> {
fn chunks_exact(&self, n: usize) -> slice::Chunks<T> { self.chunks(n) }
}

// Add a chunks_exact_mut for &mut[u16] mostly
pub trait ChunksExactMut<'a, T> {
fn chunks_exact_mut(self, n: usize) -> slice::ChunksMut<'a, T>;
}
impl<'a, T> ChunksExactMut<'a, T> for &'a mut [T] {
fn chunks_exact_mut(self, n: usize) -> slice::ChunksMut<'a, T> { self.chunks_mut(n) }
}
}
#[cfg(needs_chunks_exact)] pub use self::chunks_exact::*;
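
A minimal usage sketch of the shim above (the caller and the import path are hypothetical, not part of this commit): on compilers where needs_chunks_exact is set, the glob re-export puts the ChunksExact/ChunksExactMut traits in scope, so the same call site compiles on both old and new toolchains.

// Hypothetical call site; in the real crate the traits arrive via a glob
// import of the module that defines the shim, along the lines of:
// use decoders::basics::*;

fn halve_rows(out: &mut [u16], width: usize) {
    // Resolves to the std method on rust 1.31+, or to the shim's trait
    // method (which just forwards to chunks_mut) on older compilers.
    for line in out.chunks_exact_mut(width) {
        for px in line.iter_mut() {
            *px >>= 1;
        }
    }
}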
@@ -59,7 +59,7 @@ impl<'a> Decoder for Cr2Decoder<'a> {
};

let mut random = ljpegout[0] as u32;
for o in ljpegout.chunks_mut(1) {
for o in ljpegout.chunks_exact_mut(1) {
o[0] = table.dither(o[0], &mut random);
}
}
@@ -180,7 +180,7 @@ impl<'a> Cr2Decoder<'a> {

let yoffset = if cam.find_hint("40d_yuv") { 512 } else { 0 };

for pix in image.chunks_mut(3) {
for pix in image.chunks_exact_mut(3) {
let y = pix[0] as i32 - yoffset;
let cb = pix[1] as i32 - 16383;
let cr = pix[2] as i32 - 16383;
@@ -204,7 +204,7 @@ impl<'a> CrwDecoder<'a> {
let mut carry: i32 = 0;
let mut base = [0 as i32;2];
let mut pnum = 0;
for pixout in out.chunks_mut(64) {
for pixout in out.chunks_exact_mut(64) {
// Decode a block of 64 differences
let mut diffbuf = [0 as i32; 64];
let mut i: usize = 0;
@@ -243,7 +243,7 @@ impl<'a> CrwDecoder<'a> {

if lowbits {
// Add the uncompressed 2 low bits to the decoded 8 high bits
for (i,o) in out.chunks_mut(4).enumerate() {
for (i,o) in out.chunks_exact_mut(4).enumerate() {
let c = self.buffer[26+i] as u16;
o[0] = o[0] << 2 | (c ) & 0x03;
o[1] = o[1] << 2 | (c >> 2) & 0x03;
@@ -74,7 +74,7 @@ impl<'a> IiqDecoder<'a> {
let mut pump = BitPumpMSB32::new(&self.buffer[offset..]);
let mut pred = [0 as u32; 2];
let mut len = [0 as u32; 2];
for (col, pixout) in out.chunks_mut(1).enumerate() {
for (col, pixout) in out.chunks_exact_mut(1).enumerate() {
if col >= (width & 0xfffffff8) {
len[0] = 14;
len[1] = 14;
@@ -270,10 +270,10 @@ pub fn decode_hasselblad(ljpeg: &LjpegDecompressor, out: &mut [u16], width: usiz
let mut pump = BitPumpMSB32::new(ljpeg.buffer);
let ref htable = ljpeg.dhts[ljpeg.sof.components[0].dc_tbl_num];

for line in out.chunks_mut(width) {
for line in out.chunks_exact_mut(width) {
let mut p1: i32 = 0x8000;
let mut p2: i32 = 0x8000;
for o in line.chunks_mut(2) {
for o in line.chunks_exact_mut(2) {
let len1 = try!(htable.huff_len(&mut pump));
let len2 = try!(htable.huff_len(&mut pump));
p1 += htable.huff_diff(&mut pump, len1);
@@ -90,7 +90,7 @@ impl<'a> MosDecoder<'a> {
let ljpegout = try!(decompressor.decode_leaf(width, height));
if cam.find_hint("interlaced") {
let mut out = vec![0 as u16; width*height];
for (row,line) in ljpegout.chunks(width).enumerate() {
for (row,line) in ljpegout.chunks_exact(width).enumerate() {
let orow = if row & 1 == 1 {height-1-row/2} else {row/2};
out[orow*width .. (orow+1)*width].copy_from_slice(line);
}
@@ -322,7 +322,7 @@ impl<'a> NefDecoder<'a> {
Ok(decode_threaded(width*3, height, &(|out: &mut [u16], row| {
let inb = &src[row*width*3..];
let mut random = BEu32(inb, 0);
for (o, i) in out.chunks_mut(6).zip(inb.chunks(6)) {
for (o, i) in out.chunks_exact_mut(6).zip(inb.chunks_exact(6)) {
let g1: u16 = i[0] as u16;
let g2: u16 = i[1] as u16;
let g3: u16 = i[2] as u16;
@@ -5,7 +5,7 @@ pub fn decode_8bit_wtable(buf: &[u8], tbl: &LookupTable, width: usize, height: u
let inb = &buf[(row*width)..];
let mut random = LEu32(inb, 0);

for (o, i) in out.chunks_mut(1).zip(inb.chunks(1)) {
for (o, i) in out.chunks_exact_mut(1).zip(inb.chunks_exact(1)) {
o[0] = tbl.dither(i[0] as u16, &mut random);
}
}))
@@ -15,7 +15,7 @@ pub fn decode_10le_lsb16(buf: &[u8], width: usize, height: usize) -> Vec<u16> {
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*10/8)..];

for (o, i) in out.chunks_mut(8).zip(inb.chunks(10)) {
for (o, i) in out.chunks_exact_mut(8).zip(inb.chunks_exact(10)) {
let g1: u16 = i[0] as u16;
let g2: u16 = i[1] as u16;
let g3: u16 = i[2] as u16;
@@ -43,7 +43,7 @@ pub fn decode_10le(buf: &[u8], width: usize, height: usize) -> Vec<u16> {
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*10/8)..];

for (o, i) in out.chunks_mut(4).zip(inb.chunks(5)) {
for (o, i) in out.chunks_exact_mut(4).zip(inb.chunks_exact(5)) {
let g1: u16 = i[0] as u16;
let g2: u16 = i[1] as u16;
let g3: u16 = i[2] as u16;
@@ -62,7 +62,7 @@ pub fn decode_12be(buf: &[u8], width: usize, height: usize) -> Vec<u16> {
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*12/8)..];

for (o, i) in out.chunks_mut(2).zip(inb.chunks(3)) {
for (o, i) in out.chunks_exact_mut(2).zip(inb.chunks_exact(3)) {
let g1: u16 = i[0] as u16;
let g2: u16 = i[1] as u16;
let g3: u16 = i[2] as u16;
@@ -76,7 +76,7 @@ pub fn decode_12be(buf: &[u8], width: usize, height: usize) -> Vec<u16> {
pub fn decode_12be_msb16(buf: &[u8], width: usize, height: usize) -> Vec<u16> {
let mut out: Vec<u16> = vec![0; width*height];

for (o, i) in out.chunks_mut(4).zip(buf.chunks(6)) {
for (o, i) in out.chunks_exact_mut(4).zip(buf.chunks_exact(6)) {
let g1: u16 = i[ 0] as u16;
let g2: u16 = i[ 1] as u16;
let g3: u16 = i[ 2] as u16;
@@ -97,7 +97,7 @@ pub fn decode_12le_16bitaligned(buf: &[u8], width: usize, height: usize) -> Vec<
let stride = ((width*12/8+1) >> 1) << 1;
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[row*stride..];
for (o, i) in out.chunks_mut(2).zip(inb.chunks(3)) {
for (o, i) in out.chunks_exact_mut(2).zip(inb.chunks_exact(3)) {
let g1: u16 = i[ 0] as u16;
let g2: u16 = i[ 1] as u16;
let g3: u16 = i[ 2] as u16;
@@ -111,7 +111,7 @@ pub fn decode_12le_16bitaligned(buf: &[u8], width: usize, height: usize) -> Vec<
pub fn decode_12be_msb32(buf: &[u8], width: usize, height: usize) -> Vec<u16> {
let mut out: Vec<u16> = vec![0; width*height];

for (o, i) in out.chunks_mut(8).zip(buf.chunks(12)) {
for (o, i) in out.chunks_exact_mut(8).zip(buf.chunks_exact(12)) {
let g1: u16 = i[ 0] as u16;
let g2: u16 = i[ 1] as u16;
let g3: u16 = i[ 2] as u16;
@@ -145,8 +145,8 @@ pub fn decode_12le_wcontrol(buf: &[u8], width: usize, height: usize) -> Vec<u16>
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*perline)..];

for (oc, ic) in out.chunks_mut(10).zip(inb.chunks(16)) {
for (o, i) in oc.chunks_mut(2).zip(ic.chunks(3)) {
for (oc, ic) in out.chunks_exact_mut(10).zip(inb.chunks_exact(16)) {
for (o, i) in oc.chunks_exact_mut(2).zip(ic.chunks_exact(3)) {
let g1: u16 = i[0] as u16;
let g2: u16 = i[1] as u16;
let g3: u16 = i[2] as u16;
@@ -165,8 +165,8 @@ pub fn decode_12be_wcontrol(buf: &[u8], width: usize, height: usize) -> Vec<u16>
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*perline)..];

for (oc, ic) in out.chunks_mut(10).zip(inb.chunks(16)) {
for (o, i) in oc.chunks_mut(2).zip(ic.chunks(3)) {
for (oc, ic) in out.chunks_exact_mut(10).zip(inb.chunks_exact(16)) {
for (o, i) in oc.chunks_exact_mut(2).zip(ic.chunks_exact(3)) {
let g1: u16 = i[0] as u16;
let g2: u16 = i[1] as u16;
let g3: u16 = i[2] as u16;
@@ -189,7 +189,7 @@ pub fn decode_12be_interlaced(buf: &[u8], width: usize, height: usize) -> Vec<u1
let off = row/2*width*12/8;
let inb = if (row % 2) == 0 { &buf[off..] } else { &second_field[off..] };

for (o, i) in out.chunks_mut(2).zip(inb.chunks(3)) {
for (o, i) in out.chunks_exact_mut(2).zip(inb.chunks_exact(3)) {
let g1: u16 = i[0] as u16;
let g2: u16 = i[1] as u16;
let g3: u16 = i[2] as u16;
@@ -208,7 +208,7 @@ pub fn decode_12be_interlaced_unaligned(buf: &[u8], width: usize, height: usize)
let off = row/2*width*12/8;
let inb = if (row % 2) == 0 { &buf[off..] } else { &second_field[off..] };

for (o, i) in out.chunks_mut(2).zip(inb.chunks(3)) {
for (o, i) in out.chunks_exact_mut(2).zip(inb.chunks_exact(3)) {
let g1: u16 = i[0] as u16;
let g2: u16 = i[1] as u16;
let g3: u16 = i[2] as u16;
@@ -223,7 +223,7 @@ pub fn decode_12le(buf: &[u8], width: usize, height: usize) -> Vec<u16> {
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*12/8)..];

for (o, i) in out.chunks_mut(2).zip(inb.chunks(3)) {
for (o, i) in out.chunks_exact_mut(2).zip(inb.chunks_exact(3)) {
let g1: u16 = i[0] as u16;
let g2: u16 = i[1] as u16;
let g3: u16 = i[2] as u16;
@@ -238,7 +238,7 @@ pub fn decode_12le_unpacked(buf: &[u8], width: usize, height: usize) -> Vec<u16>
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*2)..];

for (i, bytes) in (0..width).zip(inb.chunks(2)) {
for (i, bytes) in (0..width).zip(inb.chunks_exact(2)) {
out[i] = LEu16(bytes, 0) & 0x0fff;
}
}))
@@ -248,7 +248,7 @@ pub fn decode_12be_unpacked(buf: &[u8], width: usize, height: usize) -> Vec<u16>
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*2)..];

for (i, bytes) in (0..width).zip(inb.chunks(2)) {
for (i, bytes) in (0..width).zip(inb.chunks_exact(2)) {
out[i] = BEu16(bytes, 0) & 0x0fff;
}
}))
@@ -258,7 +258,7 @@ pub fn decode_12be_unpacked_left_aligned(buf: &[u8], width: usize, height: usize
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*2)..];

for (i, bytes) in (0..width).zip(inb.chunks(2)) {
for (i, bytes) in (0..width).zip(inb.chunks_exact(2)) {
out[i] = BEu16(bytes, 0) >> 4;
}
}))
@@ -268,7 +268,7 @@ pub fn decode_12le_unpacked_left_aligned(buf: &[u8], width: usize, height: usize
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*2)..];

for (i, bytes) in (0..width).zip(inb.chunks(2)) {
for (i, bytes) in (0..width).zip(inb.chunks_exact(2)) {
out[i] = LEu16(bytes, 0) >> 4;
}
}))
@@ -278,7 +278,7 @@ pub fn decode_14le_unpacked(buf: &[u8], width: usize, height: usize) -> Vec<u16>
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*2)..];

for (i, bytes) in (0..width).zip(inb.chunks(2)) {
for (i, bytes) in (0..width).zip(inb.chunks_exact(2)) {
out[i] = LEu16(bytes, 0) & 0x3fff;
}
}))
@@ -288,7 +288,7 @@ pub fn decode_14be_unpacked(buf: &[u8], width: usize, height: usize) -> Vec<u16>
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*2)..];

for (i, bytes) in (0..width).zip(inb.chunks(2)) {
for (i, bytes) in (0..width).zip(inb.chunks_exact(2)) {
out[i] = BEu16(bytes, 0) & 0x3fff;
}
}))
@@ -298,7 +298,7 @@ pub fn decode_16le(buf: &[u8], width: usize, height: usize) -> Vec<u16> {
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*2)..];

for (i, bytes) in (0..width).zip(inb.chunks(2)) {
for (i, bytes) in (0..width).zip(inb.chunks_exact(2)) {
out[i] = LEu16(bytes, 0);
}
}))
@@ -308,7 +308,7 @@ pub fn decode_16le_skiplines(buf: &[u8], width: usize, height: usize) -> Vec<u16
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*4)..];

for (i, bytes) in (0..width).zip(inb.chunks(2)) {
for (i, bytes) in (0..width).zip(inb.chunks_exact(2)) {
out[i] = LEu16(bytes, 0);
}
}))
@@ -318,7 +318,7 @@ pub fn decode_16be(buf: &[u8], width: usize, height: usize) -> Vec<u16> {
decode_threaded(width, height, &(|out: &mut [u16], row| {
let inb = &buf[(row*width*2)..];

for (i, bytes) in (0..width).zip(inb.chunks(2)) {
for (i, bytes) in (0..width).zip(inb.chunks_exact(2)) {
out[i] = BEu16(bytes, 0);
}
}))
@@ -97,7 +97,7 @@ impl<'a> Rw2Decoder<'a> {
}

let mut sh: i32 = 0;
for out in out.chunks_mut(14) {
for out in out.chunks_exact_mut(14) {
let mut pred: [i32;2] = [0,0];
let mut nonz: [i32;2] = [0,0];
