Permalink
Browse files

fixed: gcc44 aliasing violations (gh-1)

  • Loading branch information...
lvv committed Apr 21, 2009
1 parent 4ddd97b commit 1ff951993e553607f61aaa564b405c89eaceda31
Showing with 66 additions and 18 deletions.
  1. +26 −10 array.h
  2. +24 −2 sse.h
  3. +9 −2 u-array-small.cc
  4. +7 −4 u-array.cc
View
36 array.h
@@ -17,6 +17,8 @@
#include <numeric>
using std::accumulate;
#include <algorithm>
//using std::max;
#include <iterator>
#include <algorithm>
@@ -154,7 +156,7 @@ struct array {
template <typename TT, int NN, int BB> friend istream& operator>> (istream& is, array<TT,NN,BB>& a);
//// ================================================================================================================ SUM
//// ================================================================================================================ SUM
template<typename method_type> T sum() const { return sum_impl(method_type(), T()); } // explicit
T sum() const { return sum_impl(typename select_method<T,N>::type(), T()); } // auto-selection
@@ -207,19 +209,24 @@ float max_impl (sse, float) const { // DBG cerr <<" max<sse,float> " << N <<
assert(N>=sse_size*unroll && "sse can be used for N>=8");
const unsigned prefetch = 512;
const unsigned cycle_step = unroll * sse_size;
const size_t sse_cycle = N - N % cycle_step;
const size_t sse_cycles = N - N % cycle_step;
__m128 m1 = mk_m128(elems[0]);
__m128 m2 = mk_m128(elems[sse_size]);
for (size_t i= cycle_step; i < sse_cycle; i+=cycle_step) { // SSE
for (size_t i= cycle_step; i < sse_cycles; i+=cycle_step) { // SSE
m1 = _mm_max_ps(m1, mk_m128(elems[i]) );
m2 = _mm_max_ps(m2, mk_m128(elems[i+sse_size]) );
__builtin_prefetch((void*)&elems[i+prefetch],0,0);
}
m1 = _mm_max_ps(m1, m2);
T m = mk_array<float,4,0>(m1).max<plain>(); for (size_t i=sse_cycle; i<N; i++) m = m < elems[i] ? elems[i] : m; return m;
float reg_save[4] __attribute__((aligned(16)));
_mm_store_ps (reg_save, m1);
T reg_max = reinterpret_cast<const array<float,4>* > (reg_save) -> max<plain>(); // vector register max
T tail_max = reinterpret_cast<const array<float,N-sse_cycles>* > (elems +sse_cycles) -> max<plain>(); // tail max
return std::max(reg_max, tail_max);
}
#endif
@@ -234,29 +241,38 @@ int16_t max_impl (sse2, int16_t) const { // DBG cerr << " max<sse2,int16> " <<
assert(N>=sse_size*unroll);
const unsigned prefetch = 1024;
const unsigned cycle_step = unroll * sse_size;
const size_t sse_cycle = N - N % cycle_step;
const size_t sse_cycles = N - N % cycle_step;
__m128i m1 = mk_m128i(elems[0]);
__m128i m2 = mk_m128i(elems[sse_size]);
for (size_t i= cycle_step; i < sse_cycle; i+=cycle_step) {
for (size_t i= cycle_step; i < sse_cycles; i+=cycle_step) {
m1 = _mm_max_epi16(m1, mk_m128i(elems[i]) );
m2 = _mm_max_epi16(m2, mk_m128i(elems[i+sse_size]) );
__builtin_prefetch((void*)(elems+prefetch),0,0);
}
m1 = _mm_max_epi16(m1, m2);
/*
int16_t tmp8[8] __attribute__((aligned(16)));
_mm_store_si128 ((__m128i *)tmp8, m1);
int16_t max = tmp8[0];
for (size_t i=1; i<8; i++) max = max < tmp8[i] ? tmp8[i] : max;
for (size_t i=sse_cycle; i<N; i++) max = max < elems[i] ? elems[i] : max;
for (size_t i=sse_cycles; i<N; i++) max = max < elems[i] ? elems[i] : max;
// 123
//cout << " : " << *(array<int16_t,8>*) &m1;
*/
int16_t max = mk_array<int16_t,sse_size,0>(m1).max<plain>(); for (size_t i=sse_cycle; i<N; i++) max = max < elems[i] ? elems[i] : max;
//int16_t max = reinterpret_cast < array<int16_t, sse_size, 0> > (m1) .max<plain>();
/*
for ( size_t i=sse_cycles; i<N; i++)
max = (max < elems[i])
? elems[i]
: max;
*/
return max;
}
View
26 sse.h
@@ -37,9 +37,31 @@
// Reinterpret the object x as an SSE register value (__m128 / __m128i) in place.
// The whole expansion is parenthesized so that a postfix operator or member
// access applied to the macro result binds to the dereferenced value and not to
// the inner `&(x)` expression.
// NOTE(review): x must be an lvalue; presumably it also has to be 16-byte
// aligned and at least 16 bytes of storage must follow it -- confirm at call sites.
#define mk_m128(x)  (*(__m128*)&(x))
#define mk_m128i(x) (*(__m128i*)&(x))
template<typename T, int N, int B> class array;
template<typename T, int N, int B> array<T,N,B>& mk_array(const __m128& V) { return *(array<T,N,B>*)&(V); } // is there an overhead?
//template<typename T, int N, int B> array<T,N,B>& mk_array(const __m128& V) { return *(array<T,N,B>*)&(V); } // is there an overhead?
//template<typename T, int N, int B> const array<T,N,B>& mk_array(const __m128& V) { return *(array<T,N,B>*)&(V); } // is there an overhead?
template<typename T, int N, int B> const array<T,N,B>& mk_array(const __m128i& V) { return *(array<T,N,B>*)&(V); }
// Overlay of a 128-bit integer SSE register and an array<T,N,B>: both members
// share the same storage, so a value written through one member can be read
// back through the other.
template<class T, int N, int B>
union sse_vector {
    __m128i      m128i;   // view of the storage as an SSE integer register
    array<T,N,B> arr;     // view of the storage as an array of elements
};
//template<typename T, int N, int B> const array<T,N,B>& mk_array(const __m128i& V) {
// template<typename T, int N, int B>
//const array<T,N,B>& mk_array(const __m128i& V) {
// sse_vector<T,N,B> SV;
// SV.m128i = V;
// return SV.arr;
//}
// #endif // __SSE__
// Converts the SSE integer register value V into an array<T,N,B> holding the
// same bits.
//
// The bits are transferred through the sse_vector union instead of through a
// pointer cast, which is what this commit uses to address the gcc 4.4 aliasing
// violations.
//
// The result is returned BY VALUE. The union is a local object, so returning a
// reference to its `arr` member (as before) handed the caller a reference to
// storage that no longer exists once this function returns. Existing call
// sites of the form `mk_array<...>(v).max<plain>()` or
// `const array<...>& a = mk_array<...>(v);` keep compiling unchanged.
//
// NOTE(review): assumes array<T,N,B> occupies no more than the 16 bytes of
// __m128i -- confirm for the instantiations in use.
template<typename T, int N, int B>
array<T,N,B> mk_array(const __m128i& V) {
    sse_vector<T,N,B> SV;
    SV.m128i = V;       // store through the register member ...
    return SV.arr;      // ... and copy out through the array member
}
#endif // __SSE__
View
@@ -18,9 +18,16 @@
#include <lvv/check.h>
using namespace std;
using namespace lvv;
// Small driver: checks the SSE2 int16 max on a 1000-element array, then prints
// the first element, the whole array and the last element of a 10-element one.
int main() {
    // Only the first ten elements get explicit initializers; the check below
    // expects the maximum to be 10.
    array<int16_t,1000> h1000 = {{1,2,3,4,5,6,7,8,9,10}};
    CHECKeq((h1000.max_impl(sse2(),int16_t())),10);

    array<int16_t,10> h10 = {{1,2,3,4,5,6,7,8,9,10}};
    //CHECKeq((h10.max_impl(sse2(),int16_t())),10);
    cout << h10[0] << endl
         << h10    << endl
         << h10[9] << endl;
    //cout << *(h10.sub_array<int16_t,4,1>(3)) << endl;

    CHECK_EXIT;
}
View
@@ -2,7 +2,6 @@
#define CANUSE_SSE
#define CANUSE_SSE2
#define CANUSE_SSE3
#define GCC_BUG
//#include <iostream>
@@ -166,21 +165,26 @@ main() {
CHECK(typeid(select_method<int8_t , 200>::type) == typeid(plain) ) ;
CHECK(typeid(select_method<int8_t ,200>::type ) == typeid(plain));
cout << " ***** SSE SPECIALISATION ******************************************\n"; ///////////////////////////////////////////////
cout << "\n ##### SSE SPECIALISATION ##########################################\n"; ///////////////////////////////////////////////
{ cout << " ***** FLOAT-32 *******\n";
{ array<float,6> f6 = {{1,2,3,4,5,6}}; CHECK( f6.max() == 6 ); }
CHECK(typeid(select_method<float , 2>::type ) == typeid(plain) ) ;
CHECK(typeid(select_method<float , 200>::type) == typeid(sse) ) ;
array<float,3> f3 = {{1,2,3}};
array<float,4> f4 = {{1,2,3,4}};
array<float,8,1> f81 = {{1,2,3,4,5,6,7,8}};
array<float,10> f10 = {{1,2,3,4,5,6,7,8,9,10}};
array<float,20> f20 = {{1,2,3,4,5,6,7,8,9,10}};
#ifdef CANUSE_SSE
// CHECKeq((f3.max<sse>()),3); should trigger static assert
CHECKeq((f10.max<sse>()),10);
#endif
CHECKeq(f20.max(),10);
CHECKeq(f4.max(),4);
CHECKeq(f81.max(),8);
}
@@ -196,12 +200,11 @@ main() {
//CHECKeq((h10.max<sse>()),10); // should trigger static assert (should be: sse2)
//CHECKeq((h10.max<sse2>()),10); // assert will fail (no true: 10 > 8*2)
#ifdef GCC_BUG
CHECKeq((h1000.max<plain>()),10);
CHECKeq((h1000.max()),10);
CHECKeq((h1000.max_impl(sse2(),int16_t())),10);
CHECKeq((h1000.max<sse2>()),10);
#endif
}
CHECK_EXIT;

0 comments on commit 1ff9519

Please sign in to comment.