diff --git a/INSTALL b/INSTALL index 988761c8db..cac32a1b62 100644 --- a/INSTALL +++ b/INSTALL @@ -1,77 +1,97 @@ - -You can use ./build.sh to configure and build with default options. - -It is advised to run ./autogen.sh before ./configure (autoconf and automake -need to be installed on your system for autogen.sh to work) - -./configure has an option named --with-cuda that allows you to specify -where your CUDA 6.5 toolkit is installed (usually /usr/local/cuda, -but some distros may have a different default location) - - -** How to compile on Fedora 25 ** - -Note: You may find an alternative method via rpms : -see https://negativo17.org/nvidia-driver/ and https://negativo17.org/repos/multimedia/ - - -# Step 1: gcc and dependencies -dnf install gcc gcc-c++ autoconf automake -dnf install jansson-devel openssl-devel libcurl-devel zlib-devel - -# Step 2: nvidia drivers (Download common linux drivers from nvidia site) -dnf install kernel-devel -dnf install https://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm https://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm -dnf check-update -dnf install xorg-x11-drv-nvidia-cuda kmod-nvidia -ln -s libnvidia-ml.so.1 /usr/lib64/libnvidia-ml.so - -# Step 3: CUDA SDK (Download from nvidia the generic ".run" archive) -# --override is required to ignore "too recent" gcc 6.3 -# --silent is required to install only the toolkit (no kmod) -./cuda_8.0.61_375.26_linux.run --toolkit --silent --override -nvcc --version - -# add the nvcc binary path to the system -ln -s /usr/local/cuda-8.0 /usr/local/cuda # (if not already made) -echo 'export PATH=$PATH:/usr/local/cuda/bin' > /etc/profile.d/cuda.sh - -# add the cudart library path to the system -echo /usr/local/cuda/lib64 > /etc/ld.so.conf.d/cuda.conf -ldconfig - -# Step 4: Fix the toolkit incompatibility with gcc 6 - -# You need to build yourself an older GCC/G++ version, i recommend the 5.4 -# see 
https://gcc.gnu.org/mirrors.html -# Note: this manual method will override the default gcc, it could be better to use a custom toolchain prefix - -wget ftp://ftp.lip6.fr/pub/gcc/releases/gcc-5.4.0/gcc-5.4.0.tar.bz2 -dnf install libmpc-devel mpfr-devel gmp-devel -./configure --prefix=/usr/local --enable-languages=c,c++,lto --disable-multilib -make -j 8 && make install -(while this step, you have the time to cook something :p) - -# or, for previous fedora versions, edit the file /usr/local/cuda/include/host_config.h -# and comment/delete the line 121 : #error -- unsupported GNU version! gcc versions later than 5 are not supported! - -./build.sh - -./ccminer -n - - -** How to compile on macOS ** - -# Step 1: download and install CUDA Toolkit 8 or more recent -# https://developer.nvidia.com/cuda-toolkit-archive - -# Step 2: install Homebrew -ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" - -# Step 3: dependencies -brew install pkg-config autoconf automake curl openssl llvm - -./build.sh - -./ccminer -n - + +You can use ./build.sh to configure and build with default options. + +It is advised to run ./autogen.sh before ./configure (autoconf and automake +need to be installed on your system for autogen.sh to work) + +./configure has an option named --with-cuda that allows you to specify +where your CUDA 6.5 toolkit is installed (usually /usr/local/cuda, +but some distros may have a different default location) + + +** How to compile on Ubuntu (16.04 LTS) + +First, install Cuda toolkit and nVidia Driver, and type `nvidia-smi` to check if your card is detected. 
+ +Install dependencies +```sudo apt-get install libcurl4-openssl-dev libssl-dev libjansson-dev automake autotools-dev build-essential``` + +Ubuntu is now shipped with gcc 6 or 7 so please install gcc/g++ 5 and make it the default (required by the cuda toolkit) +``` +sudo apt-get install gcc-5 g++-5 +sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1 +``` + +Then use the helper ./build.sh in ccminer source folder, edit configure.sh and the Makefile.am if required. +``` +./build.sh +./ccminer --version +``` + + +** How to compile on Fedora 25 ** + +Note: You may find an alternative method via rpms : +see https://negativo17.org/nvidia-driver/ and https://negativo17.org/repos/multimedia/ + + +# Step 1: gcc and dependencies +dnf install gcc gcc-c++ autoconf automake +dnf install jansson-devel openssl-devel libcurl-devel zlib-devel + +# Step 2: nvidia drivers (Download common linux drivers from nvidia site) +dnf install kernel-devel +dnf install https://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm https://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm +dnf check-update +dnf install xorg-x11-drv-nvidia-cuda kmod-nvidia +ln -s libnvidia-ml.so.1 /usr/lib64/libnvidia-ml.so + +# Step 3: CUDA SDK (Download from nvidia the generic ".run" archive) +# --override is required to ignore "too recent" gcc 6.3 +# --silent is required to install only the toolkit (no kmod) +./cuda_8.0.61_375.26_linux.run --toolkit --silent --override +nvcc --version + +# add the nvcc binary path to the system +ln -s /usr/local/cuda-8.0 /usr/local/cuda # (if not already made) +echo 'export PATH=$PATH:/usr/local/cuda/bin' > /etc/profile.d/cuda.sh + +# add the cudart library path to the system +echo /usr/local/cuda/lib64 > /etc/ld.so.conf.d/cuda.conf +ldconfig + +# Step 4: Fix the toolkit incompatibility with gcc 6 + +# You need to build yourself an older GCC/G++ version, i recommend the 5.4 +# see 
https://gcc.gnu.org/mirrors.html +# Note: this manual method will override the default gcc, it could be better to use a custom toolchain prefix + +wget ftp://ftp.lip6.fr/pub/gcc/releases/gcc-5.4.0/gcc-5.4.0.tar.bz2 +dnf install libmpc-devel mpfr-devel gmp-devel +./configure --prefix=/usr/local --enable-languages=c,c++,lto --disable-multilib +make -j 8 && make install +(while this step, you have the time to cook something :p) + +# or, for previous fedora versions, edit the file /usr/local/cuda/include/host_config.h +# and comment/delete the line 121 : #error -- unsupported GNU version! gcc versions later than 5 are not supported! + +./build.sh + +./ccminer -n + + +** How to compile on macOS ** + +# Step 1: download and install CUDA Toolkit 8 or more recent +# https://developer.nvidia.com/cuda-toolkit-archive + +# Step 2: install Homebrew +ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + +# Step 3: dependencies +brew install pkg-config autoconf automake curl openssl llvm + +./build.sh + +./ccminer -n + diff --git a/JHA/jha.cu b/JHA/jha.cu index ec7895c10d..94172ff585 100644 --- a/JHA/jha.cu +++ b/JHA/jha.cu @@ -147,6 +147,9 @@ extern "C" int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, u CUDA_LOG_ERROR(); } cuda_get_arch(thr_id); + if (cuda_arch[dev_id] >= 500) { + applog(LOG_WARNING, "You are not using the optimal algo, please try -a jackpot"); + } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); diff --git a/Makefile.am b/Makefile.am index cc7a1698d9..abf3115a1f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -66,14 +66,16 @@ ccminer_SOURCES = elist.h miner.h compat.h \ sph/ripemd.c sph/sph_sha2.c \ lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \ qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \ - tribus.cu \ + 
tribus/tribus.cu tribus/cuda_echo512_final.cu \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ + x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \ x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \ x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ + x11/phi.cu x11/cuda_streebog_maxwell.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu # scrypt diff --git a/README.md b/README.md index 001c4ecab5..5bcb50572c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ -ccminer -======= +# ccminer Based on Christian Buchner's & Christian H.'s CUDA project, no more active on github since 2014. @@ -10,7 +9,7 @@ BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot) A part of the recent algos were originally written by [djm34](https://github.com/djm34) and [alexis78](https://github.com/alexis78) This variant was tested and built on Linux (ubuntu server 14.04, 16.04, Fedora 22 to 25) -It is also built for Windows 7 to 10 with VStudio 2013, to stay compatible with Windows Vista. +It is also built for Windows 7 to 10 with VStudio 2013, to stay compatible with Windows 7 and Vista. Note that the x86 releases are generally faster than x64 ones on Windows, but that tend to change with the recent drivers. @@ -22,9 +21,7 @@ About source code dependencies This project requires some libraries to be built : - OpenSSL (prebuilt for win) - - Curl (prebuilt for win) - - pthreads (prebuilt for win) The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64 platforms (windows). 
@@ -32,5 +29,8 @@ The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64 To rebuild them, you need to clone this repository and its submodules : git clone https://github.com/peters/curl-for-windows.git compat/curl-for-windows -On Linux, you can use the helper ./build.sh (edit configure.sh and the Makefile.am if required) +Compile on Linux +---------------- + +Please see [INSTALL](https://github.com/tpruvot/ccminer/blob/linux/INSTALL) file or [project Wiki](https://github.com/tpruvot/ccminer/wiki/Compatibility) diff --git a/README.txt b/README.txt index 0fa05f9b22..290ba605c1 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.2 (August 2017) "Equihash, tribus and optimized skunk" +ccminer 2.2.2 (Oct. 2017) "phi and hsr algos" --------------------------------------------------------------- *************************************************************** @@ -90,7 +90,8 @@ its command line interface and options. fugue256 use to mine Fuguecoin groestl use to mine Groestlcoin heavy use to mine Heavycoin - jha use to mine JackpotCoin + hsr use to mine Hshare + jackpot use to mine Sweepcoin keccak use to mine Maxcoin lbry use to mine LBRY Credits luffa use to mine Joincoin @@ -102,6 +103,7 @@ its command line interface and options. neoscrypt use to mine FeatherCoin nist5 use to mine TalkCoin penta use to mine Joincoin / Pentablake + phi use to mine LUXCoin quark use to mine Quarkcoin qubit use to mine Qubit scrypt use to mine Scrypt coins @@ -155,6 +157,7 @@ its command line interface and options. 
-T, --timeout=N network timeout, in seconds (default: 300) -s, --scantime=N upper bound on time spent scanning current work when long polling is unavailable, in seconds (default: 5) + --submit-stale ignore stale job checks, may create more rejected shares -n, --ndevs list cuda devices -N, --statsavg number of samples used to display hashrate (default: 30) --no-gbt disable getblocktemplate support (height check in solo) @@ -277,6 +280,16 @@ so we can more efficiently implement new algorithms using the latest hardware features. >>> RELEASE HISTORY <<< + Oct. 09th 2017 v2.2.2 + Import and clean the hsr algo (x13 + custom hash) + Import and optimise phi algo from LuxCoin repository + Improve sib algo too for maxwell and pascal cards + Small fix to handle more than 9 cards on linux (-d 10+) + Attempt to free equihash memory "properly" + --submit-stale parameter for supernova pool (which change diff too fast) + + Sep. 01st 2017 v2.2.1 + Improve tribus algo on recent cards (up to +10%) Aug. 13th 2017 v2.2 New skunk algo, using the heavy streebog algorithm diff --git a/algos.h b/algos.h index b7dd0f21fa..3c1528b326 100644 --- a/algos.h +++ b/algos.h @@ -22,6 +22,7 @@ enum sha_algos { ALGO_GROESTL, ALGO_HEAVY, /* Heavycoin hash */ ALGO_HMQ1725, + ALGO_HSR, ALGO_KECCAK, ALGO_JACKPOT, ALGO_JHA, @@ -35,6 +36,7 @@ enum sha_algos { ALGO_NEOSCRYPT, ALGO_NIST5, ALGO_PENTABLAKE, + ALGO_PHI, ALGO_QUARK, ALGO_QUBIT, ALGO_SCRYPT, @@ -87,6 +89,7 @@ static const char *algo_names[] = { "groestl", "heavy", "hmq1725", + "hsr", "keccak", "jackpot", "jha", @@ -100,6 +103,7 @@ static const char *algo_names[] = { "neoscrypt", "nist5", "penta", + "phi", "quark", "qubit", "scrypt", @@ -161,12 +165,16 @@ static inline int algo_to_int(char* arg) i = ALGO_LUFFA; else if (!strcasecmp("hmq17", arg)) i = ALGO_HMQ1725; + else if (!strcasecmp("hshare", arg)) + i = ALGO_HSR; //else if (!strcasecmp("jackpot", arg)) // i = ALGO_JHA; else if (!strcasecmp("lyra2re", arg)) i = ALGO_LYRA2; else if 
(!strcasecmp("lyra2rev2", arg)) i = ALGO_LYRA2v2; + else if (!strcasecmp("phi1612", arg)) + i = ALGO_PHI; else if (!strcasecmp("bitcoin", arg)) i = ALGO_SHA256D; else if (!strcasecmp("sha256", arg)) diff --git a/bench.cpp b/bench.cpp index 8f32c40fd0..b1bb5bc87d 100644 --- a/bench.cpp +++ b/bench.cpp @@ -59,12 +59,14 @@ void algo_free_all(int thr_id) free_cryptonight(thr_id); free_decred(thr_id); free_deep(thr_id); + free_equihash(thr_id); free_keccak256(thr_id); free_fresh(thr_id); free_fugue256(thr_id); free_groestlcoin(thr_id); free_heavy(thr_id); free_hmq17(thr_id); + free_hsr(thr_id); free_jackpot(thr_id); free_jha(thr_id); free_lbry(thr_id); @@ -76,6 +78,7 @@ void algo_free_all(int thr_id) free_neoscrypt(thr_id); free_nist5(thr_id); free_pentablake(thr_id); + free_phi(thr_id); free_quark(thr_id); free_qubit(thr_id); free_skeincoin(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index dbab501ac1..a39e78cd0f 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -98,7 +98,7 @@ bool allow_gbt = true; bool allow_mininginfo = true; bool check_dups = true; //false; bool check_stratum_jobs = false; - +bool opt_submit_stale = false; bool submit_old = false; bool use_syslog = false; bool use_colors = true; @@ -263,6 +263,7 @@ Options:\n\ groestl Groestlcoin\n\ heavy Heavycoin\n\ hmq1725 Doubloons / Espers\n\ + jackpot JHA v8\n\ jha JHA v8 (JackpotCoin)\n\ keccak Keccak-256 (Maxcoin)\n\ lbry LBRY Credits (Sha/Ripemd)\n\ @@ -275,6 +276,7 @@ Options:\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ + phi BHCoin\n\ quark Quark\n\ qubit Qubit\n\ sha256d SHA256d (bitcoin)\n\ @@ -288,7 +290,7 @@ Options:\n\ skunk Skein Cube Fugue Streebog\n\ s3 S3 (1Coin)\n\ timetravel Machinecoin permuted x8\n\ - tribus Denerius\n\ + tribus Denarius\n\ vanilla Blake256-8 (VNL)\n\ veltor Thorsriddle streebog\n\ whirlcoin Old Whirlcoin (Whirlpool algo)\n\ @@ -315,7 +317,7 @@ Options:\n\ --cuda-schedule Set device threads scheduling mode 
(default: auto)\n\ -f, --diff-factor Divide difficulty by this factor (default 1.0) \n\ -m, --diff-multiplier Multiply difficulty by this value (default 1.0) \n\ - --vote=VOTE vote (for HeavyCoin)\n\ + --vote=VOTE vote (for decred and HeavyCoin)\n\ --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ @@ -331,7 +333,8 @@ Options:\n\ --time-limit maximum time [s] to mine before exiting the program.\n\ -T, --timeout=N network timeout, in seconds (default: 300)\n\ -s, --scantime=N upper bound on time spent scanning current work when\n\ - long polling is unavailable, in seconds (default: 10)\n" + long polling is unavailable, in seconds (default: 10)\n\ + --submit-stale ignore stale jobs checks, may create more rejected shares\n\" #ifndef ORG "\ --segwit Agree with Segwit (Solo Mining only)\n" @@ -455,7 +458,8 @@ struct option options[] = { { "retries", 1, NULL, 'r' }, { "retry-pause", 1, NULL, 'R' }, { "scantime", 1, NULL, 's' }, - { "show-diff", 0, NULL, 1013 }, + { "show-diff", 0, NULL, 1013 }, // deprecated + { "submit-stale", 0, NULL, 1015 }, { "hide-diff", 0, NULL, 1014 }, { "statsavg", 1, NULL, 'N' }, { "gpu-clock", 1, NULL, 1070 }, @@ -927,7 +931,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) /* discard if a newer block was received */ stale_work = work->height && work->height < g_work.height; - if (have_stratum && !stale_work && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) { + if (have_stratum && !stale_work && !opt_submit_stale && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) { pthread_mutex_lock(&g_work_lock); if (strlen(work->job_id + 8)) stale_work = strncmp(work->job_id + 8, g_work.job_id + 8, sizeof(g_work.job_id) - 8); @@ -1049,7 +1053,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) applog(LOG_DEBUG, "share diff: %.5f (x %.1f)", stratum.sharediff, work->shareratio[idnonce]); 
- if (opt_vote) { // ALGO_HEAVY + if (opt_vote) { // ALGO_HEAVY ALGO_DECRED nvotestr = bin2hex((const uchar*)(&nvote), 2); sprintf(s, "{\"method\": \"mining.submit\", \"params\": [" "\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":%u}", @@ -2684,7 +2688,9 @@ static void *miner_thread(void *userdata) case ALGO_HEAVY: case ALGO_JACKPOT: case ALGO_JHA: + case ALGO_HSR: case ALGO_LYRA2v2: + case ALGO_PHI: case ALGO_S3: case ALGO_SKUNK: case ALGO_TIMETRAVEL: @@ -2789,6 +2795,8 @@ static void *miner_thread(void *userdata) rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done); break; case ALGO_DECRED: + //applog(LOG_BLUE, "version %x, nbits %x, ntime %x extra %x", + // work.data[0], work.data[29], work.data[34], work.data[38]); rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done); break; case ALGO_DEEP: @@ -2815,6 +2823,9 @@ static void *miner_thread(void *userdata) case ALGO_HMQ1725: rc = scanhash_hmq17(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_HSR: + rc = scanhash_hsr(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_HEAVY: rc = scanhash_heavy(thr_id, &work, max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ); @@ -2864,6 +2875,9 @@ static void *miner_thread(void *userdata) case ALGO_PENTABLAKE: rc = scanhash_pentablake(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_PHI: + rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_SCRYPT: rc = scanhash_scrypt(thr_id, &work, max_nonce, &hashes_done, NULL, &tv_start, &tv_end); @@ -4118,6 +4132,9 @@ void parse_arg(int key, char *arg) case 1014: opt_showdiff = false; break; + case 1015: + opt_submit_stale = true; + break; case 'S': case 1018: applog(LOG_INFO, "Now logging to syslog..."); @@ -4186,10 +4203,10 @@ void parse_arg(int key, char *arg) { int device_thr[MAX_GPUS] = { 0 }; int ngpus = cuda_num_devices(); - char * pch = strtok (arg,","); + char* pch = strtok(arg,","); opt_n_threads = 0; while (pch != NULL && opt_n_threads < 
MAX_GPUS) { - if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0') + if (pch[0] >= '0' && pch[0] <= '9' && strlen(pch) <= 2) { if (atoi(pch) < ngpus) device_map[opt_n_threads++] = atoi(pch); @@ -4440,7 +4457,7 @@ int main(int argc, char *argv[]) #endif CUDART_VERSION/1000, (CUDART_VERSION % 1000)/10, arch); printf(" Originally based on Christian Buchner and Christian H. project\n"); - printf(" Include some algos from alexis78, djm34, sp, tsiv and klausT.\n\n"); + printf(" Include some kernels from alexis78, djm34, djEzo, tsiv and krnlx.\n\n"); printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)\n\n"); } diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 01d1aff5f8..f242c59d6f 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -331,6 +331,7 @@ + @@ -539,7 +540,11 @@ 64 - + + + compute_50,sm_50;compute_52,sm_52 + + @@ -556,8 +561,10 @@ 64 + + @@ -571,6 +578,8 @@ + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 37372b42ab..87e18010bc 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -109,6 +109,9 @@ {031afae7-2a78-4e32-9738-4b589b6f7ff3} + + {1e548d79-c217-4203-989a-a592fe2b2de3} + @@ -222,6 +225,9 @@ Source Files\sph + + Source Files\sph + Source Files @@ -721,6 +727,12 @@ Source Files\CUDA\x13 + + Source Files\CUDA\x13 + + + Source Files\CUDA\x13 + Source Files\CUDA @@ -763,8 +775,17 @@ Source Files\CUDA\skunk - - Source Files\CUDA + + Source Files\CUDA\tribus + + + Source Files\CUDA\tribus + + + Source Files\CUDA\tribus + + + Source Files\CUDA\x11 Source Files\CUDA\x11 @@ -772,6 +793,9 @@ Source Files\CUDA\x11 + + Source Files\CUDA\x11 + Source Files\CUDA\x11 diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index d07e736d21..7c280727b7 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -1,188 +1,188 @@ -/* CONFIG ONLY FOR MS VC++ BUILD */ - -/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP - systems. 
This function is required for `alloca.c' support on those systems. - */ -/* #undef CRAY_STACKSEG_END */ - -/* Define to 1 if using `alloca.c'. */ -/* #undef C_ALLOCA */ - -/* Define to 1 if you have `alloca', as a function or macro. */ -#define HAVE_ALLOCA 1 - -/* Define to 1 if you have and it should be used (not on Ultrix). - */ -#define HAVE_ALLOCA_H 1 - -/* Define to 1 if you have the declaration of `be32dec', and to 0 if you - don't. */ -#define HAVE_DECL_BE32DEC 0 - -/* Define to 1 if you have the declaration of `be32enc', and to 0 if you - don't. */ -#define HAVE_DECL_BE32ENC 0 - -/* Define to 1 if you have the declaration of `le32dec', and to 0 if you - don't. */ -#define HAVE_DECL_LE32DEC 0 - -/* Define to 1 if you have the declaration of `le32enc', and to 0 if you - don't. */ -#define HAVE_DECL_LE32ENC 0 - -/* Define to 1 if you have the `getopt_long' function. */ -#define HAVE_GETOPT_LONG 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_INTTYPES_H 1 - -/* Define to 1 if you have the `crypto' library (-lcrypto). */ -#define HAVE_LIBCRYPTO 1 - -/* Define to 1 if you have a functional curl library. */ -#define HAVE_LIBCURL 1 - -/* Define to 1 if you have the `ssl' library (-lssl). */ -#define HAVE_LIBSSL 1 - -/* Define to 1 if you have the `z' library (-lz). */ -#define HAVE_LIBZ 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_MEMORY_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDINT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDLIB_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRINGS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRING_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYSLOG_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_ENDIAN_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_PARAM_H 1 - -/* Define to 1 if you have the header file. 
*/ -#define HAVE_SYS_STAT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_SYSCTL_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 - -/* Defined if libcurl supports AsynchDNS */ -/* #undef LIBCURL_FEATURE_ASYNCHDNS */ - -/* Defined if libcurl supports IDN */ -#define LIBCURL_FEATURE_IDN 1 - -/* Defined if libcurl supports IPv6 */ -#define LIBCURL_FEATURE_IPV6 1 - -/* Defined if libcurl supports KRB4 */ -/* #undef LIBCURL_FEATURE_KRB4 */ - -/* Defined if libcurl supports libz */ -#define LIBCURL_FEATURE_LIBZ 1 - -/* Defined if libcurl supports NTLM */ -#define LIBCURL_FEATURE_NTLM 1 - -/* Defined if libcurl supports SSL */ -#define LIBCURL_FEATURE_SSL 1 - -/* Defined if libcurl supports SSPI */ -/* #undef LIBCURL_FEATURE_SSPI */ - -/* Defined if libcurl supports DICT */ -/* #undef LIBCURL_PROTOCOL_DICT */ - -/* Defined if libcurl supports FILE */ -#define LIBCURL_PROTOCOL_FILE 1 - -/* Defined if libcurl supports FTP */ -#define LIBCURL_PROTOCOL_FTP 1 - -/* Defined if libcurl supports FTPS */ -#define LIBCURL_PROTOCOL_FTPS 1 - -/* Defined if libcurl supports HTTP */ -#define LIBCURL_PROTOCOL_HTTP 1 - -/* Defined if libcurl supports HTTPS */ -#define LIBCURL_PROTOCOL_HTTPS 1 - -/* Defined if libcurl supports IMAP */ -/* #undef LIBCURL_PROTOCOL_IMAP */ - -/* Defined if libcurl supports LDAP */ -/* #undef LIBCURL_PROTOCOL_LDAP */ - -/* Defined if libcurl supports POP3 */ -/* #undef LIBCURL_PROTOCOL_POP3 */ - -/* Defined if libcurl supports RTSP */ -/* #undef LIBCURL_PROTOCOL_RTSP */ - -/* Defined if libcurl supports SMTP */ -/* #undef LIBCURL_PROTOCOL_SMTP */ - -/* Defined if libcurl supports TELNET */ -/* #undef LIBCURL_PROTOCOL_TELNET */ - -/* Defined if libcurl supports TFTP */ -/* #undef LIBCURL_PROTOCOL_TFTP */ - -/* Define to 1 if your C compiler doesn't accept -c and -o together. 
*/ -/* #undef NO_MINUS_C_MINUS_O */ - -/* Name of package */ -#define PACKAGE "ccminer" - -/* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "" - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "ccminer" - -/* Define to the home page for this package. */ -#define PACKAGE_URL "http://github.com/tpruvot/ccminer" - -/* Define to the version of this package. */ -#define PACKAGE_VERSION "2.2" - -/* If using the C implementation of alloca, define if you know the - direction of stack growth for your system; otherwise it will be - automatically deduced at runtime. - STACK_DIRECTION > 0 => grows toward higher addresses - STACK_DIRECTION < 0 => grows toward lower addresses - STACK_DIRECTION = 0 => direction of growth unknown */ -/* #undef STACK_DIRECTION */ - -/* Define to 1 if you have the ANSI C header files. */ -#define STDC_HEADERS 1 - -/* Define curl_free() as free() if our version of curl lacks curl_free. */ -/* #undef curl_free */ - -/* Define to `unsigned int' if does not define. */ -//#define size_t unsigned int - -#if !defined(HAVE_STRUCT_TIMESPEC) && _MSC_VER >= 1900 -#define HAVE_STRUCT_TIMESPEC -#endif +/* CONFIG ONLY FOR MS VC++ BUILD */ + +/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP + systems. This function is required for `alloca.c' support on those systems. + */ +/* #undef CRAY_STACKSEG_END */ + +/* Define to 1 if using `alloca.c'. */ +/* #undef C_ALLOCA */ + +/* Define to 1 if you have `alloca', as a function or macro. */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if you have the declaration of `be32dec', and to 0 if you + don't. */ +#define HAVE_DECL_BE32DEC 0 + +/* Define to 1 if you have the declaration of `be32enc', and to 0 if you + don't. 
*/ +#define HAVE_DECL_BE32ENC 0 + +/* Define to 1 if you have the declaration of `le32dec', and to 0 if you + don't. */ +#define HAVE_DECL_LE32DEC 0 + +/* Define to 1 if you have the declaration of `le32enc', and to 0 if you + don't. */ +#define HAVE_DECL_LE32ENC 0 + +/* Define to 1 if you have the `getopt_long' function. */ +#define HAVE_GETOPT_LONG 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the `crypto' library (-lcrypto). */ +#define HAVE_LIBCRYPTO 1 + +/* Define to 1 if you have a functional curl library. */ +#define HAVE_LIBCURL 1 + +/* Define to 1 if you have the `ssl' library (-lssl). */ +#define HAVE_LIBSSL 1 + +/* Define to 1 if you have the `z' library (-lz). */ +#define HAVE_LIBZ 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYSLOG_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ENDIAN_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSCTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_UNISTD_H 1 + +/* Defined if libcurl supports AsynchDNS */ +/* #undef LIBCURL_FEATURE_ASYNCHDNS */ + +/* Defined if libcurl supports IDN */ +#define LIBCURL_FEATURE_IDN 1 + +/* Defined if libcurl supports IPv6 */ +#define LIBCURL_FEATURE_IPV6 1 + +/* Defined if libcurl supports KRB4 */ +/* #undef LIBCURL_FEATURE_KRB4 */ + +/* Defined if libcurl supports libz */ +#define LIBCURL_FEATURE_LIBZ 1 + +/* Defined if libcurl supports NTLM */ +#define LIBCURL_FEATURE_NTLM 1 + +/* Defined if libcurl supports SSL */ +#define LIBCURL_FEATURE_SSL 1 + +/* Defined if libcurl supports SSPI */ +/* #undef LIBCURL_FEATURE_SSPI */ + +/* Defined if libcurl supports DICT */ +/* #undef LIBCURL_PROTOCOL_DICT */ + +/* Defined if libcurl supports FILE */ +#define LIBCURL_PROTOCOL_FILE 1 + +/* Defined if libcurl supports FTP */ +#define LIBCURL_PROTOCOL_FTP 1 + +/* Defined if libcurl supports FTPS */ +#define LIBCURL_PROTOCOL_FTPS 1 + +/* Defined if libcurl supports HTTP */ +#define LIBCURL_PROTOCOL_HTTP 1 + +/* Defined if libcurl supports HTTPS */ +#define LIBCURL_PROTOCOL_HTTPS 1 + +/* Defined if libcurl supports IMAP */ +/* #undef LIBCURL_PROTOCOL_IMAP */ + +/* Defined if libcurl supports LDAP */ +/* #undef LIBCURL_PROTOCOL_LDAP */ + +/* Defined if libcurl supports POP3 */ +/* #undef LIBCURL_PROTOCOL_POP3 */ + +/* Defined if libcurl supports RTSP */ +/* #undef LIBCURL_PROTOCOL_RTSP */ + +/* Defined if libcurl supports SMTP */ +/* #undef LIBCURL_PROTOCOL_SMTP */ + +/* Defined if libcurl supports TELNET */ +/* #undef LIBCURL_PROTOCOL_TELNET */ + +/* Defined if libcurl supports TFTP */ +/* #undef LIBCURL_PROTOCOL_TFTP */ + +/* Define to 1 if your C compiler doesn't accept -c and -o together. */ +/* #undef NO_MINUS_C_MINUS_O */ + +/* Name of package */ +#define PACKAGE "ccminer" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "" + +/* Define to the full name of this package. 
*/ +#define PACKAGE_NAME "ccminer" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "http://github.com/tpruvot/ccminer" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "2.2.2" + +/* If using the C implementation of alloca, define if you know the + direction of stack growth for your system; otherwise it will be + automatically deduced at runtime. + STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown */ +/* #undef STACK_DIRECTION */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define curl_free() as free() if our version of curl lacks curl_free. */ +/* #undef curl_free */ + +/* Define to `unsigned int' if does not define. */ +//#define size_t unsigned int + +#if !defined(HAVE_STRUCT_TIMESPEC) && _MSC_VER >= 1900 +#define HAVE_STRUCT_TIMESPEC +#endif diff --git a/configure.ac b/configure.ac index c369201a86..061dfdd70e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.2], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.2.2], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/crypto/xmr-rpc.cpp b/crypto/xmr-rpc.cpp index e50b28a96e..82b7845f27 100644 --- a/crypto/xmr-rpc.cpp +++ b/crypto/xmr-rpc.cpp @@ -30,6 +30,10 @@ #define MADV_HUGEPAGE 0 #endif +#ifndef MADV_HUGEPAGE +#define MADV_HUGEPAGE 0 +#endif + #ifndef PRIu64 #define PRIu64 "I64u" #endif diff --git a/equi/cuda_equi.cu b/equi/cuda_equi.cu index 6fc864d307..b7bcbb5910 100644 --- a/equi/cuda_equi.cu +++ b/equi/cuda_equi.cu @@ -2081,7 +2081,7 @@ __host__ void eq_cuda_context::solve(const char *t // destructor template __host__ -eq_cuda_context::~eq_cuda_context() +void eq_cuda_context::freemem() { if (solutions) free(solutions); @@ -2104,6 +2104,12 @@ eq_cuda_context::~eq_cuda_context() } } +template 
+__host__ +eq_cuda_context::~eq_cuda_context() +{ + freemem(); +} #ifdef CONFIG_MODE_1 template class eq_cuda_context; diff --git a/equi/eqcuda.hpp b/equi/eqcuda.hpp index 68bdaf0265..7cb10c9630 100644 --- a/equi/eqcuda.hpp +++ b/equi/eqcuda.hpp @@ -90,7 +90,7 @@ template struct equi; struct eq_cuda_context_interface { - virtual ~eq_cuda_context_interface(); + //virtual ~eq_cuda_context_interface(); virtual void solve(const char *tequihash_header, unsigned int tequihash_header_len, @@ -99,6 +99,7 @@ struct eq_cuda_context_interface fn_cancel cancelf, fn_solution solutionf, fn_hashdone hashdonef); + public: int thread_id; int device_id; @@ -125,9 +126,9 @@ class eq_cuda_context : public eq_cuda_context_interface fn_cancel cancelf, fn_solution solutionf, fn_hashdone hashdonef); - public: eq_cuda_context(int thr_id, int dev_id); + void freemem(); ~eq_cuda_context(); }; diff --git a/equi/equihash.cpp b/equi/equihash.cpp index 2a6e5141fe..c9ac1fcf30 100644 --- a/equi/equihash.cpp +++ b/equi/equihash.cpp @@ -281,7 +281,11 @@ void free_equihash(int thr_id) if (!init[thr_id]) return; - delete(solvers[thr_id]); + // assume config 1 was used... 
interface destructor seems bad + eq_cuda_context* ptr = dynamic_cast*>(solvers[thr_id]); + ptr->freemem(); + ptr = NULL; + solvers[thr_id] = NULL; init[thr_id] = false; @@ -291,4 +295,3 @@ void free_equihash(int thr_id) void eq_cuda_context_interface::solve(const char *tequihash_header, unsigned int tequihash_header_len, const char* nonce, unsigned int nonce_len, fn_cancel cancelf, fn_solution solutionf, fn_hashdone hashdonef) { } -eq_cuda_context_interface::~eq_cuda_context_interface() { } diff --git a/miner.h b/miner.h index 382016a547..be5e34bf87 100644 --- a/miner.h +++ b/miner.h @@ -289,6 +289,7 @@ extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, extern int scanhash_groestlcoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_hmq17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_heavy(int thr_id,struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen); +extern int scanhash_hsr(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_jha(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_jackpot(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); // quark method extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -300,6 +301,7 @@ extern int scanhash_myriad(int thr_id, struct work* work, uint32_t max_nonce, un extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned 
long *hashes_done); extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sha256d(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -350,6 +352,7 @@ extern void free_fugue256(int thr_id); extern void free_groestlcoin(int thr_id); extern void free_heavy(int thr_id); extern void free_hmq17(int thr_id); +extern void free_hsr(int thr_id); extern void free_jackpot(int thr_id); extern void free_jha(int thr_id); extern void free_lbry(int thr_id); @@ -361,6 +364,7 @@ extern void free_myriad(int thr_id); extern void free_neoscrypt(int thr_id); extern void free_nist5(int thr_id); extern void free_pentablake(int thr_id); +extern void free_phi(int thr_id); extern void free_quark(int thr_id); extern void free_qubit(int thr_id); extern void free_sha256d(int thr_id); @@ -897,6 +901,7 @@ void fresh_hash(void *state, const void *input); void fugue256_hash(unsigned char* output, const unsigned char* input, int len); void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); void hmq17hash(void *output, const void *input); +void hsr_hash(void *output, const void *input); void keccak256_hash(void *state, const void *input); void jackpothash(void *state, const void *input); void groestlhash(void *state, const void *input); @@ -909,6 +914,7 @@ void myriadhash(void *state, const void *input); void neoscrypt(uchar *output, const uchar *input, uint32_t profile); void nist5hash(void *state, const void *input); void pentablakehash(void *output, const void *input); +void phihash(void *output, const void *input); void quarkhash(void *state, const void *input); void qubithash(void *state, const void *input); void scrypthash(void* output, const void* input); diff --git a/res/ccminer.rc b/res/ccminer.rc index d7cf1c2f5e..84be50d618 100644 --- a/res/ccminer.rc +++ 
b/res/ccminer.rc @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" // VS_VERSION_INFO VERSIONINFO - FILEVERSION 2,2,0,0 - PRODUCTVERSION 2,2,0,0 + FILEVERSION 2,2,2,0 + PRODUCTVERSION 2,2,2,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x21L @@ -76,10 +76,10 @@ BEGIN BEGIN BLOCK "040904e4" BEGIN - VALUE "FileVersion", "2.2" + VALUE "FileVersion", "2.2.2" VALUE "LegalCopyright", "Copyright (C) 2017" VALUE "ProductName", "ccminer" - VALUE "ProductVersion", "2.2" + VALUE "ProductVersion", "2.2.2" END END BLOCK "VarFileInfo" diff --git a/skunk/cuda_skunk_streebog.cu b/skunk/cuda_skunk_streebog.cu index c38de11d46..36ec7923c0 100644 --- a/skunk/cuda_skunk_streebog.cu +++ b/skunk/cuda_skunk_streebog.cu @@ -18,7 +18,7 @@ #include #include -#include "skunk/streebog_arrays.cuh" +#include "x11/streebog_arrays.cuh" //#define FULL_UNROLL __device__ __forceinline__ @@ -204,7 +204,7 @@ static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uin __constant__ uint64_t target64[4]; __host__ -void skunk_set_target(uint32_t* ptarget) +void skunk_streebog_set_target(uint32_t* ptarget) { cudaMemcpyToSymbol(target64, ptarget, 4*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); } diff --git a/skunk/skunk.cu b/skunk/skunk.cu index f89c5fb458..c1add50303 100644 --- a/skunk/skunk.cu +++ b/skunk/skunk.cu @@ -23,12 +23,12 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_fugue512_cpu_free(int thr_id); -extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); -extern void streebog_set_target(const uint32_t* ptarget); +extern void streebog_sm3_set_target(uint32_t* ptarget); +extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* 
d_resNonce); // krnlx merged kernel (for high-end cards only) extern void skunk_cpu_init(int thr_id, uint32_t threads); -extern void skunk_set_target(uint32_t* ptarget); +extern void skunk_streebog_set_target(uint32_t* ptarget); extern void skunk_setBlock_80(int thr_id, void *pdata); extern void skunk_cuda_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); @@ -117,10 +117,10 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce, cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); if (use_compat_kernels[thr_id]) { skein512_cpu_setBlock_80(endiandata); - streebog_set_target(ptarget); + streebog_sm3_set_target(ptarget); } else { skunk_setBlock_80(thr_id, endiandata); - skunk_set_target(ptarget); + skunk_streebog_set_target(ptarget); } do { @@ -129,7 +129,7 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce, skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); } else { skunk_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); diff --git a/tribus/cuda_echo512_aes.cuh b/tribus/cuda_echo512_aes.cuh new file mode 100644 index 0000000000..ff205aaeb7 --- /dev/null +++ b/tribus/cuda_echo512_aes.cuh @@ -0,0 +1,318 @@ +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#define __CUDA_ARCH__ 520 +#include +#endif + +#undef ROL8 +#undef ROR8 +#undef ROL16 + +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ +uint32_t 
ROL8(const uint32_t a) { + return __byte_perm(a, 0, 0x2103); +} +__device__ __forceinline__ +uint32_t ROR8(const uint32_t a) { + return __byte_perm(a, 0, 0x0321); +} +__device__ __forceinline__ +uint32_t ROL16(const uint32_t a) { + return __byte_perm(a, 0, 0x1032); +} +#else +#define ROL8(u) ROTL32(u, 8) +#define ROR8(u) ROTR32(u, 8) +#define ROL16(u) ROTL32(u,16) +#endif + +__device__ uint32_t d_AES0[256] = { + 0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591, + 0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC, + 0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB, + 0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B, + 0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83, + 0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A, + 0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F, + 0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA, + 0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B, + 0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413, + 0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6, + 0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85, + 0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511, + 0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B, + 0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1, + 0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF, + 0x4CCDCD81, 
0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E, + 0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6, + 0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B, + 0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD, + 0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8, + 0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2, + 0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949, + 0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810, + 0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697, + 0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F, + 0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C, + 0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27, + 0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433, + 0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5, + 0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0, + 0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C +}; + +__device__ uint32_t d_AES3[256] = { + 0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5, + 0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676, + 0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0, + 0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0, + 
0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC, + 0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515, + 0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A, + 0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 0x4E692727, 0x7FCDB2B2, 0xEA9F7575, + 0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0, + 0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484, + 0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B, + 0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF, + 0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585, + 0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8, + 0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5, + 0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2, + 0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717, + 0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373, + 0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888, + 0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB, + 0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C, + 0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979, + 0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9, + 0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808, + 0x6FD5BABA, 0xF0887878, 0x4A6F2525, 
0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6, + 0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 0x0F858A8A, + 0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E, + 0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E, + 0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494, + 0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF, + 0x038F8C8C, 0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868, + 0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616 +}; + +__device__ __forceinline__ +void aes_gpu_init_mt_256(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 256) { + uint32_t temp = __ldg(&d_AES0[threadIdx.x]); + sharedMemory[0][threadIdx.x] = temp; + sharedMemory[1][threadIdx.x] = ROL8(temp); + sharedMemory[2][threadIdx.x] = ROL16(temp); + sharedMemory[3][threadIdx.x] = ROR8(temp); + } +} + +__device__ __forceinline__ +void aes_gpu_init256(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill a uint32 */ + uint32_t temp = __ldg(&d_AES0[threadIdx.x]); + sharedMemory[0][threadIdx.x] = temp; + sharedMemory[1][threadIdx.x] = ROL8(temp); + sharedMemory[2][threadIdx.x] = ROL16(temp); + sharedMemory[3][threadIdx.x] = ROR8(temp); +} + +__device__ __forceinline__ +void aes_gpu_init128(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill 2 uint32 */ + uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); + + sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x; + sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y; + sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x); + sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y); + sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x); + sharedMemory[2][(threadIdx.x 
<< 1) + 1] = ROL16(temp.y); + sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x); + sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y); +} + +__device__ __forceinline__ +void aes_gpu_init_lt_256(uint32_t sharedMemory[4][256]) +{ + if (threadIdx.x < 128) { + /* each thread startup will fill 2 uint32 */ + uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); + + sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x; + sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y; + sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x); + sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y); + sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x); + sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y); + sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x); + sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y); + } +} + +__device__ __forceinline__ +static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, + const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); + y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; + y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; + y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); + + y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; + y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; + y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; +#ifdef INTENSIVE_GMF + y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); +#else + y2 ^= sharedMemory[3][__byte_perm(x1, 0, 0x4443)]; +#endif + + y0 ^= k0; + + y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); + y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; + y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; + y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + + y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)]; + y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; + y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; + y0 ^= 
__ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +} + +__device__ __forceinline__ +static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, + const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); + y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; + y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; + y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); + + y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; + y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; + y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; + y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); + + y0 ^= k0; + + y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); + y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; + y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; + y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + + y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); + y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; + y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; + y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +} + +__device__ __forceinline__ +static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, + uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)]; + y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; + y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; + y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); + +#ifdef INTENSIVE_GMF + y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); +#else + y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; +#endif + y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; + y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; + y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); + + y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)]; + y1 ^= sharedMemory[1][__byte_perm(x2, 0, 
0x4441)]; + y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; + y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + + y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)]; + y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; + y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; + y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +} + +__device__ __forceinline__ +static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, + uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)]; + y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; + y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; + y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); + + y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); + y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; + y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; + y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); + + y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)]; + y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; + y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; + y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + + y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)]; + y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; + y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; + y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +} + +__device__ __forceinline__ +static void AES_2ROUND(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0) +{ + uint32_t y0, y1, y2, y3; + + aes_round(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3); + + aes_round(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3); + + // hier werden wir ein carry brauchen (oder auch nicht) + k0++; +} + +__device__ __forceinline__ +static void AES_2ROUND_LDG(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0) +{ + uint32_t y0, y1, y2, y3; + + 
aes_round_LDG(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3); + + aes_round_LDG(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3); + + // hier werden wir ein carry brauchen (oder auch nicht) + k0++; +} + +__device__ __forceinline__ +static void AES_ROUND_NOKEY(const uint32_t sharedMemory[4][256], uint4* x) +{ + uint32_t y0, y1, y2, y3; + aes_round(sharedMemory, x->x, x->y, x->z, x->w, y0, y1, y2, y3); + + x->x = y0; + x->y = y1; + x->z = y2; + x->w = y3; +} + +__device__ __forceinline__ +static void KEY_EXPAND_ELT(const uint32_t sharedMemory[4][256], uint32_t *k) +{ + uint32_t y0, y1, y2, y3; + aes_round(sharedMemory, k[0], k[1], k[2], k[3], y0, y1, y2, y3); + + k[0] = y1; + k[1] = y2; + k[2] = y3; + k[3] = y0; +} diff --git a/tribus/cuda_echo512_final.cu b/tribus/cuda_echo512_final.cu new file mode 100644 index 0000000000..b68a9c6ea5 --- /dev/null +++ b/tribus/cuda_echo512_final.cu @@ -0,0 +1,285 @@ +/** + * Based on Provos Alexis work - 2016 FOR SM 5+ + * + * final touch by tpruvot for tribus - 09 2017 + */ +#include +#include +#include + +#define INTENSIVE_GMF +#include "tribus/cuda_echo512_aes.cuh" + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#define atomicExch(p,y) (*p) = y +#endif + +__device__ +static void echo_round(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0) +{ + // Big Sub Words + #pragma unroll 16 + for (int idx = 0; idx < 16; idx++) + AES_2ROUND(sharedMemory,W[(idx<<2) + 0], W[(idx<<2) + 1], W[(idx<<2) + 2], W[(idx<<2) + 3], k0); + + // Shift Rows + #pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t t[4]; + /// 1, 5, 9, 13 + t[0] = W[i + 4]; + t[1] = W[i + 8]; + t[2] = W[i + 24]; + t[3] = W[i + 60]; + + W[i + 4] = W[i + 20]; + W[i + 8] = W[i + 40]; + W[i + 24] = W[i + 56]; + W[i + 60] = W[i + 44]; + + W[i + 20] = W[i + 36]; + W[i + 40] = t[1]; + W[i + 56] = t[2]; + W[i + 44] = W[i + 28]; + + W[i + 28] = W[i + 12]; + W[i + 12] = t[3]; + W[i + 36] = W[i + 52]; + W[i + 52] = t[0]; + } + // Mix Columns + #pragma 
unroll 4 + for (int i = 0; i < 4; i++) + { + #pragma unroll 4 + for (int idx = 0; idx < 64; idx += 16) + { + uint32_t a[4]; + a[0] = W[idx + i]; + a[1] = W[idx + i + 4]; + a[2] = W[idx + i + 8]; + a[3] = W[idx + i +12]; + + uint32_t ab = a[0] ^ a[1]; + uint32_t bc = a[1] ^ a[2]; + uint32_t cd = a[2] ^ a[3]; + + uint32_t t, t2, t3; + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[idx + i] = bc ^ a[3] ^ abx; + W[idx + i + 4] = a[0] ^ cd ^ bcx; + W[idx + i + 8] = ab ^ a[3] ^ cdx; + W[idx + i +12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx); + } + } +} + +__global__ __launch_bounds__(256, 3) /* will force 80 registers */ +static void tribus_echo512_gpu_final(uint32_t threads, uint64_t *g_hash, uint32_t* resNonce, const uint64_t target) +{ + __shared__ uint32_t sharedMemory[4][256]; + + aes_gpu_init256(sharedMemory); + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //8-12 + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //21-25 + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751,0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //34-38 + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7,0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + //58-61 + }; + uint32_t k0; + uint32_t h[16]; + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t *hash = (uint32_t*)&g_hash[thread<<3]; + + *(uint2x4*)&h[0] = __ldg4((uint2x4*)&hash[0]); + *(uint2x4*)&h[8] = __ldg4((uint2x4*)&hash[8]); + + uint64_t backup = *(uint64_t*)&h[6]; + + k0 = 512 + 8; + 
+ #pragma unroll 4 + for (uint32_t idx = 0; idx < 16; idx += 4) + AES_2ROUND(sharedMemory,h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); + + k0 += 4; + + uint32_t W[64]; + + #pragma unroll 4 + for (uint32_t i = 0; i < 4; i++) + { + uint32_t a = P[i]; + uint32_t b = P[i + 4]; + uint32_t c = h[i + 8]; + uint32_t d = P[i + 8]; + + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; + + uint32_t t = ((a ^ b) & 0x80808080); + uint32_t t2 = ((b ^ c) & 0x80808080); + uint32_t t3 = ((c ^ d) & 0x80808080); + + uint32_t abx = ((t >> 7) * 27U) ^ ((ab^t) << 1); + uint32_t bcx = ((t2 >> 7) * 27U) ^ ((bc^t2) << 1); + uint32_t cdx = ((t3 >> 7) * 27U) ^ ((cd^t3) << 1); + + W[0 + i] = bc ^ d ^ abx; + W[4 + i] = a ^ cd ^ bcx; + W[8 + i] = ab ^ d ^ cdx; + W[12+ i] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[12 + i]; + b = h[i + 4]; + c = P[12 + i + 4]; + d = P[12 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[16 + i] = abx ^ bc ^ d; + W[16 + i + 4] = bcx ^ a ^ cd; + W[16 + i + 8] = cdx ^ ab ^ d; + W[16 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = h[i]; + b = P[24 + i]; + c = P[24 + i + 4]; + d = P[24 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[32 + i] = abx ^ bc ^ d; + W[32 + i + 4] = bcx ^ a ^ cd; + W[32 + i + 8] = cdx ^ ab ^ d; + W[32 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[36 + i ]; + b = P[36 + i + 4]; + c = P[36 + i + 8]; + d = h[i + 12]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ 
((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[48 + i] = abx ^ bc ^ d; + W[48 + i + 4] = bcx ^ a ^ cd; + W[48 + i + 8] = cdx ^ ab ^ d; + W[48 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c; + } + + for (int k = 1; k < 9; k++) + echo_round(sharedMemory,W,k0); + + // Big Sub Words + uint32_t y0, y1, y2, y3; +// AES_2ROUND(sharedMemory,W[ 0], W[ 1], W[ 2], W[ 3], k0); + aes_round(sharedMemory, W[ 0], W[ 1], W[ 2], W[ 3], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[ 0], W[ 1], W[ 2], W[ 3]); + + aes_round(sharedMemory, W[ 4], W[ 5], W[ 6], W[ 7], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[ 4], W[ 5], W[ 6], W[ 7]); + aes_round(sharedMemory, W[ 8], W[ 9], W[10], W[11], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[ 8], W[ 9], W[10], W[11]); + + aes_round(sharedMemory, W[20], W[21], W[22], W[23], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[20], W[21], W[22], W[23]); + aes_round(sharedMemory, W[28], W[29], W[30], W[31], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[28], W[29], W[30], W[31]); + + aes_round(sharedMemory, W[32], W[33], W[34], W[35], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[32], W[33], W[34], W[35]); + aes_round(sharedMemory, W[40], W[41], W[42], W[43], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[40], W[41], W[42], W[43]); + + aes_round(sharedMemory, W[52], W[53], W[54], W[55], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[52], W[53], W[54], W[55]); + aes_round(sharedMemory, W[60], W[61], W[62], W[63], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[60], W[61], W[62], W[63]); + + uint32_t bc = W[22] ^ W[42]; + uint32_t t2 = (bc & 0x80808080); + W[ 6] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + bc = W[23] ^ W[43]; + t2 = (bc & 0x80808080); + W[ 7] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + bc = W[10] ^ W[54]; + t2 = (bc & 0x80808080); + W[38] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + 
bc = W[11] ^ W[55]; + t2 = (bc & 0x80808080); + W[39] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + uint64_t check = backup ^ *(uint64_t*)&W[2] ^ *(uint64_t*)&W[6] ^ *(uint64_t*)&W[10] ^ *(uint64_t*)&W[30] + ^ *(uint64_t*)&W[34] ^ *(uint64_t*)&W[38] ^ *(uint64_t*)&W[42] ^ *(uint64_t*)&W[62]; + + if(check <= target){ + uint32_t tmp = atomicExch(&resNonce[0], thread); + if (tmp != UINT32_MAX) + resNonce[1] = tmp; + } + } +} + +__host__ +void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + tribus_echo512_gpu_final <<>> (threads, (uint64_t*)d_hash, d_resNonce, target); +} diff --git a/tribus.cu b/tribus/tribus.cu similarity index 71% rename from tribus.cu rename to tribus/tribus.cu index ed82850085..4516e7d69c 100644 --- a/tribus.cu +++ b/tribus/tribus.cu @@ -1,7 +1,7 @@ /** * Tribus Algo for Denarius * - * tpruvot@github 06 2017 - GPLv3 + * tpruvot@github 09 2017 - GPLv3 * */ extern "C" { @@ -16,9 +16,10 @@ extern "C" { void jh512_setBlock_80(int thr_id, uint32_t *endiandata); void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); +void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target); static uint32_t *d_hash[MAX_GPUS]; - +static uint32_t *d_resNonce[MAX_GPUS]; // cpu hash @@ -46,6 +47,7 @@ extern "C" void tribus_hash(void *state, const void *input) } static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) { @@ -63,7 +65,8 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + int dev_id = device_map[thr_id]; + 
cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage @@ -74,10 +77,15 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce quark_jh512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); // char[64] work space for hashes results CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); cuda_check_cpu_init(thr_id, throughput); init[thr_id] = true; @@ -87,33 +95,43 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce be32enc(&endiandata[k], pdata[k]); jh512_setBlock_80(thr_id, endiandata); - cuda_check_cpu_setTarget(ptarget); + if (use_compat_kernels[thr_id]) + cuda_check_cpu_setTarget(ptarget); + else + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); work->valid_nonces = 0; do { int order = 1; - - // Hash with CUDA jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + if (use_compat_kernels[thr_id]) { + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + work->nonces[1] = UINT32_MAX; + } else { + tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6])); + cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + } *hashes_done = pdata[19] - first_nonce + throughput; - work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], 
d_hash[thr_id]); if (work->nonces[0] != UINT32_MAX) { - const uint32_t Htarg = ptarget[7]; uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + const uint32_t startNounce = pdata[19]; + if (!use_compat_kernels[thr_id]) work->nonces[0] += startNounce; be32enc(&endiandata[19], work->nonces[0]); tribus_hash(vhash, endiandata); if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { work->valid_nonces = 1; work_set_target_ratio(work, vhash); - work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); - if (work->nonces[1] != 0) { + if (work->nonces[1] != UINT32_MAX) { + work->nonces[1] += startNounce; be32enc(&endiandata[19], work->nonces[1]); tribus_hash(vhash, endiandata); bn_set_target_ratio(work, vhash, 1); @@ -127,7 +145,8 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce else if (vhash[7] > Htarg) { gpu_increment_reject(thr_id); if (!opt_quiet) - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); pdata[19] = work->nonces[0] + 1; continue; } @@ -144,7 +163,6 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce out: // *hashes_done = pdata[19] - first_nonce; - return work->valid_nonces; } @@ -157,8 +175,8 @@ extern "C" void free_tribus(int thr_id) cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); + cudaFree(d_resNonce[thr_id]); - quark_groestl512_cpu_free(thr_id); cuda_check_cpu_free(thr_id); init[thr_id] = false; diff --git a/util.cpp b/util.cpp index 2c6a1e9469..5b87d9652a 100644 --- a/util.cpp +++ b/util.cpp @@ -2211,6 +2211,9 @@ void print_hash_tests(void) hmq17hash(&hash[0], &buf[0]); printpfx("hmq1725", hash); + hsr_hash(&hash[0], &buf[0]); + printpfx("hsr", hash); + jha_hash(&hash[0], &buf[0]); printpfx("jha", hash); @@ -2245,6 +2248,9 @@ void 
print_hash_tests(void) pentablakehash(&hash[0], &buf[0]); printpfx("pentablake", hash); + phihash(&hash[0], &buf[0]); + printpfx("phi", hash); + quarkhash(&hash[0], &buf[0]); printpfx("quark", hash); diff --git a/x11/c11.cu b/x11/c11.cu index 5dee17455a..8f8f6663b2 100644 --- a/x11/c11.cu +++ b/x11/c11.cu @@ -18,10 +18,13 @@ extern "C" #include "cuda_helper.h" #include "cuda_x11.h" +void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target); + #include #include static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_resNonce[MAX_GPUS]; // Flax/Chaincoin C11 CPU Hash extern "C" void c11hash(void *output, const void *input) @@ -103,6 +106,7 @@ extern "C" void c11hash(void *output, const void *input) #endif static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { @@ -118,7 +122,8 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage @@ -127,6 +132,9 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + quark_blake512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); @@ -135,11 +143,13 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u quark_jh512_cpu_init(thr_id, throughput); x11_luffaCubehash512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); + 
if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); if (x11_simd512_cpu_init(thr_id, throughput) != 0) { return 0; } - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ? + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); cuda_check_cpu_init(thr_id, throughput); @@ -151,7 +161,10 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u be32enc(&endiandata[k], pdata[k]); quark_blake512_cpu_setBlock_80(thr_id, endiandata); - cuda_check_cpu_setTarget(ptarget); + if (use_compat_kernels[thr_id]) + cuda_check_cpu_setTarget(ptarget); + else + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); do { int order = 0; @@ -175,24 +188,32 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u TRACE("shavite:"); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("simd :"); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("echo => "); + + if (use_compat_kernels[thr_id]) { + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + work->nonces[1] = UINT32_MAX; + } else { + tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6])); + cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + } *hashes_done = pdata[19] - first_nonce + throughput; - work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (work->nonces[0] != UINT32_MAX) { - const uint32_t Htarg = ptarget[7]; uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + const uint32_t startNounce = pdata[19]; + if (!use_compat_kernels[thr_id]) work->nonces[0] += startNounce; be32enc(&endiandata[19], 
work->nonces[0]); c11hash(vhash, endiandata); if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { work->valid_nonces = 1; work_set_target_ratio(work, vhash); - work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); - if (work->nonces[1] != 0) { + if (work->nonces[1] != UINT32_MAX) { + work->nonces[1] += startNounce; be32enc(&endiandata[19], work->nonces[1]); c11hash(vhash, endiandata); bn_set_target_ratio(work, vhash, 1); @@ -206,7 +227,8 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u else if (vhash[7] > Htarg) { gpu_increment_reject(thr_id); if (!opt_quiet) - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); pdata[19] = work->nonces[0] + 1; continue; } @@ -234,6 +256,8 @@ extern "C" void free_c11(int thr_id) cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); + cudaFree(d_resNonce[thr_id]); + quark_blake512_cpu_free(thr_id); quark_groestl512_cpu_free(thr_id); x11_simd512_cpu_free(thr_id); diff --git a/x11/cuda_streebog.cu b/x11/cuda_streebog.cu index 228c691312..d6e3685e62 100644 --- a/x11/cuda_streebog.cu +++ b/x11/cuda_streebog.cu @@ -806,10 +806,11 @@ void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) #define T6(x) shared[6][x] #define T7(x) shared[7][x] +// Streebog final for Veltor and skunk on SM 3.x __constant__ uint64_t target64[4]; __host__ -void streebog_set_target(const uint32_t* ptarget) +void streebog_sm3_set_target(uint32_t* ptarget) { cudaMemcpyToSymbol(target64,ptarget,4*sizeof(uint64_t),0,cudaMemcpyHostToDevice); } @@ -995,7 +996,7 @@ void streebog_gpu_hash_64_final(uint64_t *g_hash, uint32_t* resNonce) } __host__ -void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce) +void streebog_sm3_hash_64_final(int thr_id, 
uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce) { dim3 grid((threads + TPB-1) / TPB); dim3 block(TPB); diff --git a/x11/cuda_streebog_maxwell.cu b/x11/cuda_streebog_maxwell.cu new file mode 100644 index 0000000000..6a06332933 --- /dev/null +++ b/x11/cuda_streebog_maxwell.cu @@ -0,0 +1,309 @@ +/* + * Streebog GOST R 34.10-2012 CUDA implementation. + * + * https://tools.ietf.org/html/rfc6986 + * https://en.wikipedia.org/wiki/Streebog + * + * ==========================(LICENSE BEGIN)============================ + * + * @author Tanguy Pruvot - 2015 + * @author Alexis Provos - 2016 + */ + +// Further improved with shared memory partial utilization +// Tested under CUDA7.5 toolkit for cp 5.0/5.2 + +//#include +#include +#include +#include + +#include "streebog_arrays.cuh" + +//#define FULL_UNROLL +__device__ __forceinline__ +static void GOST_FS(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state) +{ + return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)]) + ^ shared[1][__byte_perm(state[6].x,0,0x44440)] + ^ shared[2][__byte_perm(state[5].x,0,0x44440)] + ^ shared[3][__byte_perm(state[4].x,0,0x44440)] + ^ shared[4][__byte_perm(state[3].x,0,0x44440)] + ^ shared[5][__byte_perm(state[2].x,0,0x44440)] + ^ shared[6][__byte_perm(state[1].x,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]); + + return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44441)] + ^ shared[3][__byte_perm(state[4].x,0,0x44441)] + ^ shared[4][__byte_perm(state[3].x,0,0x44441)] + ^ shared[5][__byte_perm(state[2].x,0,0x44441)] + ^ shared[6][__byte_perm(state[1].x,0,0x44441)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)]); + + return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44442)] + ^ shared[3][__byte_perm(state[4].x,0,0x44442)] + 
^ shared[4][__byte_perm(state[3].x,0,0x44442)] + ^ shared[5][__byte_perm(state[2].x,0,0x44442)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)]) + ^ shared[6][__byte_perm(state[1].x,0,0x44442)]; + + return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)]) + ^ shared[1][__byte_perm(state[6].x,0,0x44443)] + ^ shared[2][__byte_perm(state[5].x,0,0x44443)] + ^ shared[3][__byte_perm(state[4].x,0,0x44443)] + ^ __ldg(&T42[__byte_perm(state[3].x,0,0x44443)]) + ^ shared[5][__byte_perm(state[2].x,0,0x44443)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)]) + ^ shared[6][__byte_perm(state[1].x,0,0x44443)]; + + return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44440)] + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44440)] + ^ shared[4][__byte_perm(state[3].y,0,0x44440)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)]) + ^ shared[5][__byte_perm(state[2].y,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)]); + + return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].y,0,0x44441)] + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44441)] + ^ shared[4][__byte_perm(state[3].y,0,0x44441)] + ^ shared[5][__byte_perm(state[2].y,0,0x44441)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)]) + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)]); + + return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44442)] + ^ shared[2][__byte_perm(state[5].y,0,0x44442)] + ^ shared[3][__byte_perm(state[4].y,0,0x44442)] + ^ shared[4][__byte_perm(state[3].y,0,0x44442)] + ^ shared[5][__byte_perm(state[2].y,0,0x44442)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)]) + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)]); + + return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)]) + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44443)]) + ^ 
shared[2][__byte_perm(state[5].y,0,0x44443)] + ^ shared[3][__byte_perm(state[4].y,0,0x44443)] + ^ shared[4][__byte_perm(state[3].y,0,0x44443)] + ^ shared[5][__byte_perm(state[2].y,0,0x44443)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)]) + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)]); +} + +__device__ __forceinline__ +static void GOST_FS_LDG(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state) +{ + return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44440)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44440)] + ^ shared[3][__byte_perm(state[4].x,0,0x44440)] + ^ shared[4][__byte_perm(state[3].x,0,0x44440)] + ^ shared[5][__byte_perm(state[2].x,0,0x44440)] + ^ shared[6][__byte_perm(state[1].x,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]); + + return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44441)] + ^ shared[3][__byte_perm(state[4].x,0,0x44441)] + ^ shared[4][__byte_perm(state[3].x,0,0x44441)] + ^ shared[5][__byte_perm(state[2].x,0,0x44441)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)]) + ^ shared[6][__byte_perm(state[1].x,0,0x44441)]; + + return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44442)] + ^ shared[3][__byte_perm(state[4].x,0,0x44442)] + ^ shared[4][__byte_perm(state[3].x,0,0x44442)] + ^ shared[5][__byte_perm(state[2].x,0,0x44442)] + ^ shared[6][__byte_perm(state[1].x,0,0x44442)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)]); + + return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44443)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44443)] + ^ shared[3][__byte_perm(state[4].x,0,0x44443)] + ^ shared[4][__byte_perm(state[3].x,0,0x44443)] + ^ 
shared[5][__byte_perm(state[2].x,0,0x44443)] + ^ shared[6][__byte_perm(state[1].x,0,0x44443)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)]); + + return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44440)] + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44440)] + ^ shared[4][__byte_perm(state[3].y,0,0x44440)] + ^ shared[5][__byte_perm(state[2].y,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)]); + + return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)]) + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].y,0,0x44441)] + ^ shared[3][__byte_perm(state[4].y,0,0x44441)] + ^ shared[4][__byte_perm(state[3].y,0,0x44441)] + ^ shared[5][__byte_perm(state[2].y,0,0x44441)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)]); + + return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)]) + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44442)]) + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44442)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44442)] + ^ shared[4][__byte_perm(state[3].y,0,0x44442)] + ^ shared[5][__byte_perm(state[2].y,0,0x44442)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)]); + + return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44443)] + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44443)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44443)] + ^ shared[4][__byte_perm(state[3].y,0,0x44443)] + ^ shared[5][__byte_perm(state[2].y,0,0x44443)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)]); +} + +__device__ __forceinline__ +static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uint2 *const __restrict__ state) +{ + uint2 t[8]; + for(int i=0; 
i<12; i++){ + GOST_FS(shared,state, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + K[ j] ^= *(uint2*)&CC[i][j]; + + #pragma unroll 8 + for(int j=0;j<8;j++) + state[ j] = t[ j]; + + GOST_FS_LDG(shared,K, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + state[ j]^= t[ j]; + + #pragma unroll 8 + for(int j=0;j<8;j++) + K[ j] = t[ j]; + } +} + +#define TPB 256 +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(TPB, 3) +#else +__launch_bounds__(TPB, 3) +#endif +void streebog_gpu_hash_64_maxwell(uint64_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint2 buf[8], t[8], temp[8], K0[8], hash[8]; + + __shared__ uint2 shared[8][256]; + shared[0][threadIdx.x] = __ldg(&T02[threadIdx.x]); + shared[1][threadIdx.x] = __ldg(&T12[threadIdx.x]); + shared[2][threadIdx.x] = __ldg(&T22[threadIdx.x]); + shared[3][threadIdx.x] = __ldg(&T32[threadIdx.x]); + shared[4][threadIdx.x] = __ldg(&T42[threadIdx.x]); + shared[5][threadIdx.x] = __ldg(&T52[threadIdx.x]); + shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]); + shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]); + + uint64_t* inout = &g_hash[thread<<3]; + + *(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]); + *(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]); + + __threadfence_block(); + + K0[0] = vectorize(0x74a5d4ce2efc83b3); + + #pragma unroll 8 + for(int i=0;i<8;i++){ + buf[ i] = K0[ 0] ^ hash[ i]; + } + + for(int i=0; i<12; i++){ + GOST_FS(shared, buf, temp); + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + buf[ j] = temp[ j] ^ *(uint2*)&precomputed_values[i][j]; + } + } + #pragma unroll 8 + for(int j=0;j<8;j++){ + buf[ j]^= hash[ j]; + } + #pragma unroll 8 + for(int j=0;j<8;j++){ + K0[ j] = buf[ j]; + } + + K0[7].y ^= 0x00020000; + + GOST_FS(shared, K0, t); + + #pragma unroll 8 + for(int i=0;i<8;i++) + K0[ i] = t[ i]; + + t[7].y ^= 0x01000000; + + GOST_E12(shared, K0, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + buf[ j] ^= t[ j]; + + buf[7].y ^= 0x01000000; + + GOST_FS(shared, 
buf,K0); + + buf[7].y ^= 0x00020000; + + #pragma unroll 8 + for(int j=0;j<8;j++) + t[ j] = K0[ j]; + + t[7].y ^= 0x00020000; + + GOST_E12(shared, K0, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + buf[ j] ^= t[ j]; + + GOST_FS(shared, buf,K0); // K = F(h) + + hash[7]+= vectorize(0x0100000000000000); + + #pragma unroll 8 + for(int j=0;j<8;j++) + t[ j] = K0[ j] ^ hash[ j]; + + GOST_E12(shared, K0, t); + + *(uint2x4*)&inout[0] = *(uint2x4*)&t[0] ^ *(uint2x4*)&hash[0] ^ *(uint2x4*)&buf[0]; + *(uint2x4*)&inout[4] = *(uint2x4*)&t[4] ^ *(uint2x4*)&hash[4] ^ *(uint2x4*)&buf[4]; +} + +__host__ +void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash) +{ + dim3 grid((threads + TPB-1) / TPB); + dim3 block(TPB); + streebog_gpu_hash_64_maxwell <<>> ((uint64_t*)d_hash); +} diff --git a/x11/phi.cu b/x11/phi.cu new file mode 100644 index 0000000000..ab1f30833c --- /dev/null +++ b/x11/phi.cu @@ -0,0 +1,223 @@ +// +// +// PHI1612 algo +// Skein + JH + CubeHash + Fugue + Gost + Echo +// +// Implemented by anorganix @ bitcointalk on 01.10.2017 +// Feel free to send some satoshis to 1Bitcoin8tfbtGAQNFxDRUVUfFgFWKoWi9 +// +// + +extern "C" { +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_fugue.h" +#include "sph/sph_streebog.h" +#include "sph/sph_echo.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x11.h" + +extern void skein512_cpu_setBlock_80(void *pdata); +extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap); +extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int 
thr_id); + +extern void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target); + +#include +#include + +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_resNonce[MAX_GPUS]; + +extern "C" void phihash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[128] = { 0 }; + + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_cubehash512_context ctx_cubehash; + sph_fugue512_context ctx_fugue; + sph_gost512_context ctx_gost; + sph_echo512_context ctx_echo; + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, input, 80); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_gost512_init(&ctx_gost); + sph_gost512(&ctx_gost, (const void*)hash, 64); + sph_gost512_close(&ctx_gost, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + memcpy(output, hash, 32); +} + +#define _DEBUG_PREFIX "phi" +#include "cuda_debug.cuh" + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 
19 : 18; // 2^18 = 262144 cuda threads + if (device_sm[dev_id] >= 600) intensity = 20; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0xf; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput), -1); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); + + cuda_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[20]; + + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + skein512_cpu_setBlock_80((void*)endiandata); + if (use_compat_kernels[thr_id]) + cuda_check_cpu_setTarget(ptarget); + else + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); + + do { + int order = 0; + + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (use_compat_kernels[thr_id]) { + streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], 
NULL, d_hash[thr_id], order++); + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + } else { + streebog_hash_64_maxwell(thr_id, throughput, d_hash[thr_id]); + tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6])); + cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + } + + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + const uint32_t startNonce = pdata[19]; + uint32_t _ALIGN(64) vhash[8]; + if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce; + be32enc(&endiandata[19], work->nonces[0]); + phihash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + *hashes_done = pdata[19] - first_nonce + throughput; + //work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + //if (work->nonces[1] != 0) { + if (work->nonces[1] != UINT32_MAX) { + work->nonces[1] += startNonce; + be32enc(&endiandata[19], work->nonces[1]); + phihash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } + else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_phi(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + 
cudaFree(d_hash[thr_id]); + cudaFree(d_resNonce[thr_id]); + x13_fugue512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x11/sib.cu b/x11/sib.cu index 158f85e287..c437523d03 100644 --- a/x11/sib.cu +++ b/x11/sib.cu @@ -18,6 +18,7 @@ extern "C" { #include "cuda_x11.h" extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash); #include #include @@ -98,6 +99,7 @@ extern "C" void sibhash(void *output, const void *input) #include "cuda_debug.cuh" static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { @@ -124,6 +126,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + quark_blake512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); @@ -166,7 +171,10 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u TRACE("jh512 :"); quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("keccak :"); - streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + if (use_compat_kernels[thr_id]) + streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + else + streebog_hash_64_maxwell(thr_id, throughput, d_hash[thr_id]); TRACE("gost :"); x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); TRACE("luffa+c:"); diff --git a/skunk/streebog_arrays.cuh b/x11/streebog_arrays.cuh similarity index 100% rename from skunk/streebog_arrays.cuh rename to x11/streebog_arrays.cuh diff --git 
a/x11/veltor.cu b/x11/veltor.cu index be05f5a7d5..7bc1e18dab 100644 --- a/x11/veltor.cu +++ b/x11/veltor.cu @@ -10,11 +10,17 @@ extern "C" { #include "cuda_x11.h" extern void skein512_cpu_setBlock_80(void *pdata); -extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); -extern void streebog_set_target(const uint32_t* ptarget); + +// for SM3.x +extern void streebog_sm3_set_target(uint32_t* ptarget); +extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); + +// for latest cards only +extern void skunk_cpu_init(int thr_id, uint32_t threads); +extern void skunk_streebog_set_target(uint32_t* ptarget); +extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); #include #include @@ -23,7 +29,7 @@ extern void streebog_set_target(const uint32_t* ptarget); static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_resNonce[MAX_GPUS]; -// veltorcoin CPU Hash +// veltor CPU Hash extern "C" void veltorhash(void *output, const void *input) { unsigned char _ALIGN(128) hash[128] = { 0 }; @@ -53,6 +59,7 @@ extern "C" void veltorhash(void *output, const void *input) } static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { @@ -80,7 +87,9 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); - quark_skein512_cpu_init(thr_id, 
throughput); + skunk_cpu_init(thr_id, throughput); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + x11_shavite512_cpu_init(thr_id, throughput); CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); @@ -97,14 +106,20 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce skein512_cpu_setBlock_80(endiandata); cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); - streebog_set_target(ptarget); + if(use_compat_kernels[thr_id]) + streebog_sm3_set_target(ptarget); + else + skunk_streebog_set_target(ptarget); do { int order = 0; skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + if(use_compat_kernels[thr_id]) + streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + else + skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost); diff --git a/x13/cuda_hsr_sm3.cu b/x13/cuda_hsr_sm3.cu new file mode 100644 index 0000000000..5ce018626e --- /dev/null +++ b/x13/cuda_hsr_sm3.cu @@ -0,0 +1,139 @@ +#include +#include +#include + +#include +#include + +#define F(x, y, z) (((x) ^ (y) ^ (z))) +#define FF(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) +#define GG(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) + +#define P0(x) x ^ ROTL32(x, 9) ^ ROTL32(x, 17) +#define P1(x) x ^ ROTL32(x, 15) ^ ROTL32(x, 23) + +static __forceinline__ __device__ +void sm3_compress2(uint32_t digest[8], const uint32_t pblock[16]) +{ + uint32_t tt1, tt2, i, t, ss1, ss2, x, y; + uint32_t w[68]; + uint32_t a = digest[0]; + uint32_t b = digest[1]; + uint32_t c = digest[2]; + uint32_t d = digest[3]; + uint32_t e = 
digest[4]; + uint32_t f = digest[5]; + uint32_t g = digest[6]; + uint32_t h = digest[7]; + + #pragma unroll + for (i = 0; i<16; i++) { + w[i] = cuda_swab32(pblock[i]); + } + + for (i = 16; i<68; i++) { + x = ROTL32(w[i - 3], 15); + y = ROTL32(w[i - 13], 7); + + x ^= w[i - 16]; + x ^= w[i - 9]; + y ^= w[i - 6]; + + w[i] = P1(x) ^ y; + } + + for (i = 0; i<64; i++) { + + t = (i < 16) ? 0x79cc4519 : 0x7a879d8a; + + ss2 = ROTL32(a, 12); + ss1 = ROTL32(ss2 + e + ROTL32(t, i), 7); + ss2 ^= ss1; + + tt1 = d + ss2 + (w[i] ^ w[i + 4]); + tt2 = h + ss1 + w[i]; + + if (i < 16) { + tt1 += F(a, b, c); + tt2 += F(e, f, g); + } + else { + tt1 += FF(a, b, c); + tt2 += GG(e, f, g); + } + d = c; + c = ROTL32(b, 9); + b = a; + a = tt1; + h = g; + g = ROTL32(f, 19); + f = e; + e = P0(tt2); + } + + digest[0] ^= a; + digest[1] ^= b; + digest[2] ^= c; + digest[3] ^= d; + digest[4] ^= e; + digest[5] ^= f; + digest[6] ^= g; + digest[7] ^= h; +} + +/***************************************************/ +// GPU Hash Function +__global__ +void sm3_gpu_hash_64(const uint32_t threads, uint32_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads) + { + const size_t hashPosition = thread; + + uint32_t digest[8]; + digest[0] = 0x7380166F; + digest[1] = 0x4914B2B9; + digest[2] = 0x172442D7; + digest[3] = 0xDA8A0600; + digest[4] = 0xA96F30BC; + digest[5] = 0x163138AA; + digest[6] = 0xE38DEE4D; + digest[7] = 0xB0FB0E4E; + + uint32_t *pHash = &g_hash[hashPosition << 4]; + sm3_compress2(digest, pHash); + + uint32_t block[16]; + block[0] = 0x80; + + #pragma unroll + for (int i = 1; i < 14; i++) + block[i] = 0; + + // count + block[14] = cuda_swab32(1 >> 23); + block[15] = cuda_swab32((1 << 9) + (0 << 3)); + + sm3_compress2(digest, block); + + for (int i = 0; i < 8; i++) + pHash[i] = cuda_swab32(digest[i]); + + for (int i = 8; i < 16; i++) + pHash[i] = 0; + } +} + +__host__ +void sm3_cuda_hash_64(int thr_id, uint32_t threads, uint32_t *g_hash, int order) 
+{
+	const uint32_t threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	sm3_gpu_hash_64 <<<grid, block>>>(threads, g_hash);
+	//MyStreamSynchronize(NULL, order, thr_id);
+}
diff --git a/x13/hsr.cu b/x13/hsr.cu
new file mode 100644
index 0000000000..e86444628d
--- /dev/null
+++ b/x13/hsr.cu
@@ -0,0 +1,265 @@
+/*
+ * X13 algorithm
+ */
+extern "C"
+{
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+}
+#include "sm3.h"
+
+#include "miner.h"
+
+#include "cuda_helper.h"
+#include "x11/cuda_x11.h"
+
+static uint32_t *d_hash[MAX_GPUS];
+
+extern void sm3_cuda_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order);
+
+extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);
+
+// HSR CPU Hash
+extern "C" void hsr_hash(void *output, const void *input)
+{
+	// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13
+
+	sph_blake512_context ctx_blake;
+	sph_bmw512_context ctx_bmw;
+	sph_groestl512_context ctx_groestl;
+	sph_jh512_context ctx_jh;
+	sph_keccak512_context ctx_keccak;
+	sph_skein512_context ctx_skein;
+	sph_luffa512_context ctx_luffa;
+	sph_cubehash512_context ctx_cubehash;
+	sph_shavite512_context ctx_shavite;
+	sph_simd512_context 
ctx_simd; + sph_echo512_context ctx_echo; + sm3_ctx_t ctx_sm3; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + + uint32_t hash[32]; + memset(hash, 0, sizeof hash); + + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*) hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx_bmw, (void*) hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, (const void*) hash, 64); + sph_luffa512_close (&ctx_luffa, (void*) hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*) hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, (const void*) hash, 64); + sph_shavite512_close(&ctx_shavite, (void*) hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, (const void*) hash, 64); + sph_simd512_close(&ctx_simd, (void*) hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); + + sm3_init(&ctx_sm3); + sm3_update(&ctx_sm3, (const unsigned char*) hash, 64); + memset(hash, 0, sizeof hash); + sm3_close(&ctx_sm3, (void*) hash); + + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, (const void*) hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*) hash); + + sph_fugue512_init(&ctx_fugue); + 
sph_fugue512(&ctx_fugue, (const void*) hash, 64); + sph_fugue512_close(&ctx_fugue, (void*) hash); + + memcpy(output, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_hsr(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = 19; // (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x000f; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + if (x11_simd512_cpu_init(thr_id, throughput) != 0) { + return 0; + } + x11_echo512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + 
cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + sm3_cuda_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + CUDA_LOG_ERROR(); + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + hsr_hash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + hsr_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; 
+ } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + CUDA_LOG_ERROR(); + + return 0; +} + +// cleanup +extern "C" void free_hsr(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + CUDA_LOG_ERROR(); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x13/sm3.c b/x13/sm3.c new file mode 100644 index 0000000000..295ba15086 --- /dev/null +++ b/x13/sm3.c @@ -0,0 +1,220 @@ +/* ==================================================================== + * Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the GmSSL Project. + * (http://gmssl.org/)" + * + * 4. 
The name "GmSSL Project" must not be used to endorse or promote + * products derived from this software without prior written + * permission. For written permission, please contact + * guanzhi1980@gmail.com. + * + * 5. Products derived from this software may not be called "GmSSL" + * nor may "GmSSL" appear in their names without prior written + * permission of the GmSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the GmSSL Project + * (http://gmssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * ====================================================================
+ */
+
+#include <string.h>
+
+#include "sm3.h"
+
+void sm3_init(sm3_ctx_t *ctx)
+{
+	ctx->digest[0] = 0x7380166F;
+	ctx->digest[1] = 0x4914B2B9;
+	ctx->digest[2] = 0x172442D7;
+	ctx->digest[3] = 0xDA8A0600;
+	ctx->digest[4] = 0xA96F30BC;
+	ctx->digest[5] = 0x163138AA;
+	ctx->digest[6] = 0xE38DEE4D;
+	ctx->digest[7] = 0xB0FB0E4E;
+
+	ctx->nblocks = 0;
+	ctx->num = 0;
+}
+
+void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len)
+{
+	if (ctx->num) {
+		unsigned int left = SM3_BLOCK_SIZE - ctx->num;
+		if (data_len < left) {
+			memcpy(ctx->block + ctx->num, data, data_len);
+			ctx->num += data_len;
+			return;
+		} else {
+			memcpy(ctx->block + ctx->num, data, left);
+			sm3_compress(ctx->digest, ctx->block);
+			ctx->nblocks++;
+			data += left;
+			data_len -= left;
+		}
+	}
+	while (data_len >= SM3_BLOCK_SIZE) {
+		sm3_compress(ctx->digest, data);
+		ctx->nblocks++;
+		data += SM3_BLOCK_SIZE;
+		data_len -= SM3_BLOCK_SIZE;
+	}
+	ctx->num = data_len;
+	if (data_len) {
+		memcpy(ctx->block, data, data_len);
+	}
+}
+
+void sm3_close(void *cc, void *dst)
+{
+	sm3_final(cc, dst);
+	memset(cc, 0, sizeof(sm3_ctx_t));
+}
+
+void sm3_final(sm3_ctx_t *ctx, unsigned char *digest)
+{
+	int i;
+	uint32_t *pdigest = (uint32_t *)digest;
+	uint32_t *count = (uint32_t *)(ctx->block + SM3_BLOCK_SIZE - 8);
+
+	ctx->block[ctx->num] = 0x80;
+
+	if (ctx->num + 9 <= SM3_BLOCK_SIZE) {
+		memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - ctx->num - 9);
+	} else {
+		memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - ctx->num - 1);
+		sm3_compress(ctx->digest, ctx->block);
+		memset(ctx->block, 0, SM3_BLOCK_SIZE - 8);
+	}
+
+	count[0] = cpu_to_be32((ctx->nblocks) >> 23);
+	count[1] = cpu_to_be32((ctx->nblocks << 9) + (ctx->num << 3));
+
+	sm3_compress(ctx->digest, ctx->block);
+	for (i = 0; i < sizeof(ctx->digest)/sizeof(ctx->digest[0]); i++) {
+		pdigest[i] = cpu_to_be32(ctx->digest[i]);
+	}
+}
+
+#define ROTATELEFT(X,n) 
(((X)<<(n)) | ((X)>>(32-(n)))) + +#define P0(x) ((x) ^ ROTATELEFT((x),9) ^ ROTATELEFT((x),17)) +#define P1(x) ((x) ^ ROTATELEFT((x),15) ^ ROTATELEFT((x),23)) + +#define FF0(x,y,z) ( (x) ^ (y) ^ (z)) +#define FF1(x,y,z) (((x) & (y)) | ( (x) & (z)) | ( (y) & (z))) + +#define GG0(x,y,z) ( (x) ^ (y) ^ (z)) +#define GG1(x,y,z) (((x) & (y)) | ( (~(x)) & (z)) ) + + +void sm3_compress(uint32_t digest[8], const unsigned char block[64]) +{ + int j; + uint32_t W[68], W1[64]; + const uint32_t *pblock = (const uint32_t *)block; + + uint32_t A = digest[0]; + uint32_t B = digest[1]; + uint32_t C = digest[2]; + uint32_t D = digest[3]; + uint32_t E = digest[4]; + uint32_t F = digest[5]; + uint32_t G = digest[6]; + uint32_t H = digest[7]; + uint32_t SS1,SS2,TT1,TT2,T[64]; + + for (j = 0; j < 16; j++) { + W[j] = cpu_to_be32(pblock[j]); + } + for (j = 16; j < 68; j++) { + W[j] = P1( W[j-16] ^ W[j-9] ^ ROTATELEFT(W[j-3],15)) ^ ROTATELEFT(W[j - 13],7 ) ^ W[j-6];; + } + for( j = 0; j < 64; j++) { + W1[j] = W[j] ^ W[j+4]; + } + + for(j =0; j < 16; j++) { + + T[j] = 0x79CC4519; + SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7); + SS2 = SS1 ^ ROTATELEFT(A,12); + TT1 = FF0(A,B,C) + D + SS2 + W1[j]; + TT2 = GG0(E,F,G) + H + SS1 + W[j]; + D = C; + C = ROTATELEFT(B,9); + B = A; + A = TT1; + H = G; + G = ROTATELEFT(F,19); + F = E; + E = P0(TT2); + } + + for(j =16; j < 64; j++) { + + T[j] = 0x7A879D8A; + SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7); + SS2 = SS1 ^ ROTATELEFT(A,12); + TT1 = FF1(A,B,C) + D + SS2 + W1[j]; + TT2 = GG1(E,F,G) + H + SS1 + W[j]; + D = C; + C = ROTATELEFT(B,9); + B = A; + A = TT1; + H = G; + G = ROTATELEFT(F,19); + F = E; + E = P0(TT2); + } + + digest[0] ^= A; + digest[1] ^= B; + digest[2] ^= C; + digest[3] ^= D; + digest[4] ^= E; + digest[5] ^= F; + digest[6] ^= G; + digest[7] ^= H; +} + +void sm3(const unsigned char *msg, size_t msglen, + unsigned char dgst[SM3_DIGEST_LENGTH]) +{ + sm3_ctx_t ctx; + + sm3_init(&ctx); + 
sm3_update(&ctx, msg, msglen); + sm3_final(&ctx, dgst); + + memset(&ctx, 0, sizeof(sm3_ctx_t)); +} diff --git a/x13/sm3.h b/x13/sm3.h new file mode 100644 index 0000000000..05c6595d98 --- /dev/null +++ b/x13/sm3.h @@ -0,0 +1,109 @@ +/* ==================================================================== + * Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved. + * Copyright (c) 2017 - YiiMP (cleaned hmac dead stuff) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the GmSSL Project. + * (http://gmssl.org/)" + * + * 4. The name "GmSSL Project" must not be used to endorse or promote + * products derived from this software without prior written + * permission. For written permission, please contact + * guanzhi1980@gmail.com. + * + * 5. Products derived from this software may not be called "GmSSL" + * nor may "GmSSL" appear in their names without prior written + * permission of the GmSSL Project. + * + * 6. 
Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the GmSSL Project
+ *     (http://gmssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#ifndef _SM3_H
+#define _SM3_H
+
+#define SM3_DIGEST_LENGTH	32
+#define SM3_BLOCK_SIZE		64
+#define SM3_CBLOCK		(SM3_BLOCK_SIZE)
+#define SM3_HMAC_SIZE		(SM3_DIGEST_LENGTH)
+
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef struct {
+	uint32_t digest[8];
+	int nblocks;
+	unsigned char block[64];
+	int num;
+} sm3_ctx_t;
+
+void sm3_init(sm3_ctx_t *ctx);
+void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len);
+void sm3_close(void *cc, void *dst);
+
+void sm3_final(sm3_ctx_t *ctx, unsigned char digest[SM3_DIGEST_LENGTH]);
+void sm3_compress(uint32_t digest[8], const unsigned char block[SM3_BLOCK_SIZE]);
+void sm3(const unsigned char *data, size_t datalen,
+	unsigned char digest[SM3_DIGEST_LENGTH]);
+
+#ifdef CPU_BIGENDIAN
+
+#define cpu_to_be16(v) (v)
+#define cpu_to_be32(v) (v)
+#define be16_to_cpu(v) (v)
+#define be32_to_cpu(v) (v)
+
+#else
+
+#define cpu_to_le16(v) (v) 
+#define cpu_to_le32(v) (v) +#define le16_to_cpu(v) (v) +#define le32_to_cpu(v) (v) + +#define cpu_to_be16(v) (((v)<< 8) | ((v)>>8)) +#define cpu_to_be32(v) (((v)>>24) | (((v)>>8)&0xff00) | (((v)<<8)&0xff0000) | ((v)<<24)) +#define be16_to_cpu(v) cpu_to_be16(v) +#define be32_to_cpu(v) cpu_to_be32(v) + +#endif + +#ifdef __cplusplus +} +#endif +#endif