Skip to content

Commit

Permalink
Cuda fixes for Centos and compile from source (#10)
Browse files Browse the repository at this point in the history
* cudnn upgrade to 6.0

* syntax error

* kernel-devel

* minor

* cudnn fixes

* disable clang - tensorflow/tensorflow#11043

* clang/gcc problems

* update building instrs

* remove expect configure scripts

* enabling sse-4.x for bazel build

* enabling sse-4.x for bazel build

* clang/gcc problems

* break up compile and pip install for gpu

* retry bazel build

* more python libs

* gpu compilation still not fixed

* fix default username attr

* install cuda for centos with yum

* rpm installer fixes

* yum fixes for cuda

* remove rpms after correct install

* only download runfile for cuda for ubuntu

* hotfix

* cuda rpm

* simplify for centos

* rename infiniband attr to rdma
  • Loading branch information
Jim Dowling committed Aug 1, 2017
1 parent 59731fc commit 2ea406b
Show file tree
Hide file tree
Showing 9 changed files with 191 additions and 178 deletions.
14 changes: 8 additions & 6 deletions attributes/default.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
default.tensorflow.base_version = "1.2.0"
default.tensorflow.version = node.tensorflow.base_version

#default.tensorflow.install = "src" # or 'dist'
default.tensorflow.install = "dist" # or 'src'
default.tensorflow.install = "dist" # or 'src' or 'custom'

# URL of a prebuilt ("custom") TensorFlow wheel.
# Expected file name: tensorflow-<version>-<platform>-<gcc_version>-<python_version>.whl,
# e.g. tensorflow-1.2.1-debian-5.4-2.7.whl (the "5.4-2.7" suffix is hard-coded below).
# download_url is read from the node object, as the cuda/cudnn URLs in this file do,
# instead of relying on a bare identifier being resolved implicitly.
default.tensorflow.custom_url = "#{node['download_url']}/tensorflow-#{node['tensorflow']['version']}-#{node['platform']}-5.4-2.7.whl"

default.tensorflow.git_url = "https://github.com/tensorflow/tensorflow"
default.tensorflow.python_url = "http://snurran.sics.se/hops/Python.zip"
Expand All @@ -22,7 +24,7 @@


default.tensorflow.mpi = "false"
default.tensorflow.infiniband = "false"
default.tensorflow.rdma = "false"
default.tensorflow.mkl = "false"

default.cuda.major_version = "8.0"
Expand All @@ -38,8 +40,8 @@
default.cuda.url_patch = "#{node.download_url}/cuda_#{node.cuda.version_patch}_linux.run"


default.cudnn.major_version = "5"
default.cudnn.minor_version = "1"
default.cudnn.major_version = "6"
default.cudnn.minor_version = "0"
default.cudnn.version = node.cudnn.major_version + "." + node.cudnn.minor_version
default.cudnn.url = "#{node.download_url}/cudnn-#{node.cuda.major_version}-linux-x64-v#{node.cudnn.version}.tgz"

Expand All @@ -56,7 +58,7 @@
default.tensorflow.need_cuda = 0
default.tensorflow.need_mpi = 0
default.tensorflow.need_mkl = 0
default.tensorflow.need_infiniband = 0
default.tensorflow.need_rdma = 0

# https://github.com/bazelbuild/bazel/releases/download/0.5.2/bazel-0.5.2-installer-linux-x86_64.sh
default.bazel.major_version = "0.5"
Expand Down
4 changes: 2 additions & 2 deletions metadata.rb
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@
:description => "'true' to install Intel MKL support, 'false' (default) for no support. ",
:type => "string"

attribute "tensorflow/infiniband",
:description => "Used by TensorflowOnSpark. 'true' to install inifinband support, 'false' (default) for no infiniband support. ",
attribute "tensorflow/rdma",
:description => "Used by TensorflowOnSpark. 'true' to install rdma (infiniband) support, 'false' (default) for no rdma support. ",
:type => "string"

attribute "install/dir",
Expand Down
79 changes: 68 additions & 11 deletions providers/compile.rb
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,18 @@
action :tf do


# https://github.com/lakshayg/tensorflow-build


# Clone the TensorFlow sources (tag v<base_version>, with submodules) into the
# tensorflow user's home directory. Skipped once the checkout's ./configure exists.
bash "git_clone_tensorflow_server" do
  user node.tensorflow.user
  code <<-EOF
set -e
cd /home/#{node.tensorflow.user}
git clone --recurse-submodules --branch v#{node.tensorflow.base_version} #{node.tensorflow.git_url}
# cd tensorflow
# git checkout v#{node.tensorflow.base_version}
  EOF
  # File.exist? instead of File.exists?: the latter is deprecated and removed in Ruby 3.2.
  not_if { ::File.exist?( "/home/#{node.tensorflow.user}/tensorflow/configure" ) }
end
Expand Down Expand Up @@ -111,43 +116,96 @@

if node.cuda.enabled == "true"

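# Try to download and install a custom python wheel first; the intent is to fall back to building from source if that fails.
# NOTE(review): the begin/rescue only wraps the *declaration* of the resources below. Chef executes them later,
# at converge time, so a failed download or pip install is presumably not caught here - confirm the fallback really triggers.
# NOTE(review): File.basename below is not written as ::File like the other File calls in these providers - verify
# which constant it resolves to inside a provider.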
begin
wheel = File.basename("#{node['tensorflow']['custom_url']}")
# Download the custom TensorFlow wheel into Chef's file cache; only created if it is missing.
wheel_cache_path = "#{Chef::Config[:file_cache_path]}/#{wheel}"
remote_file wheel_cache_path do
  action :create_if_missing
  source node['tensorflow']['custom_url']
  mode '0755'
  owner node['tensorflow']['user']
  group node['tensorflow']['group']
end

# pip-install the cached custom wheel as root, with a UTF-8 locale exported for the shell.
bash "pip_install_custom_tensorflow" do
  code <<-SCRIPT
set -e
export LC_CTYPE=en_US.UTF-8
export LC_ALL=en_US.UTF-8
pip install --ignore-installed --upgrade #{Chef::Config[:file_cache_path]}/#{wheel}
  SCRIPT
  user 'root'
end

rescue


# https://github.com/bazelbuild/bazel/issues/739
bash "workaround_bazel_build" do
user "root"
code <<-EOF
set -e
chown -R #{node.tensorflow.user} /home/#{node.tensorflow.user}/tensorflow
rm -rf /home/#{node.tensorflow.user}/.cache/bazel
# rm -rf /home/#{node.tensorflow.user}/.cache/bazel
EOF
end




bash "build_install_tensorflow_server" do
# user node.tensorflow.user
user "root"
timeout 10800
timeout 30800
code <<-EOF
set -e
export LC_CTYPE=en_US.UTF-8
export LC_ALL=en_US.UTF-8
cd /home/#{node.tensorflow.user}/tensorflow
./#{config}
# PATH change needed for Centos
# Compile instructions - https://stackoverflow.com/questions/41293077/how-to-compile-tensorflow-with-sse4-2-and-avx-instructions
export PATH=$PATH:/usr/local/bin
bazel build -c opt --config=cuda //tensorflow/core/distributed_runtime/rpc:grpc_tensorflow_server
# Create the pip package and install
bazel build -c opt --config=cuda //tensorflow/tools/pip_package:build_pip_package
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
# bazel build -c opt --config=cuda //tensorflow/core/distributed_runtime/rpc:grpc_tensorflow_server
# This works for ubuntu but not for centos
# Build fails for centos: https://github.com/tensorflow/tensorflow/issues/10665
# bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" --config=cuda --copt=-mavx --copt=-mavx2 --copt=-mfma --copt=-mfpmath=both --copt=-msse4.1 --copt=-msse4.2 //tensorflow/tools/pip_package:build_pip_package
# This works
bazel build -c opt --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" --config=cuda //tensorflow/tools/pip_package:build_pip_package
pip install /tmp/tensorflow_pkg/tensorflow-#{node.tensorflow.base_version}-py2-none-any.whl
# --user
touch .installed
EOF
not_if { ::File.exists?( "/home/#{node.tensorflow.user}/tensorflow/.installed" ) }
end


# Package the bazel build output as a wheel under /tmp/tensorflow_pkg and pip-install it as root.
# Runs once: the .installed_pip marker written at the end of the script guards re-execution.
# NOTE(review): the wheel name used below ends in "py2-none-any", while the non-CUDA build in this
# provider installs "...-cp27-cp27mu-linux_x86_64.whl" - confirm which name build_pip_package produces.
bash "pip_install_tensorflow" do
  # user node.tensorflow.user
  user "root"
  timeout 30800
  code <<-EOF
set -e
export LC_CTYPE=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export PATH=$PATH:/usr/local/bin
cd /home/#{node.tensorflow.user}/tensorflow
#install -Dm755 bazel-bin/tensorflow/libtensorflow.so /usr/lib/
#install -Dm644 tensorflow/c/c_api.h /usr/include/tensorflow-cuda/c_api.h
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
pip install --ignore-installed --upgrade /tmp/tensorflow_pkg/tensorflow-#{node.tensorflow.base_version}-py2-none-any.whl
touch .installed_pip
  EOF
  # File.exist? instead of File.exists?: the latter is deprecated and removed in Ruby 3.2.
  not_if { ::File.exist?( "/home/#{node.tensorflow.user}/tensorflow/.installed_pip" ) }
end

end # End rescue

else

# https://github.com/bazelbuild/bazel/issues/739
Expand Down Expand Up @@ -180,8 +238,7 @@
# Needed for Centos
export PATH=$PATH:/usr/local/bin
# bazel build -c opt //tensorflow/tools/pip_package:build_pip_package
bazel build --config=mkl --copt="-DEIGEN_USE_VML" -c opt //tensorflow/tools/pip_package:build_pip_package
bazel build -c opt --copt=-mavx --copt=-mavx2 --copt=-mfma --copt=-mfpmath=both --copt=-msse4.1 --copt=-msse4.2 //tensorflow/tools/pip_package:build_pip_package
bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg
pip install /tmp/tensorflow_pkg/tensorflow-#{node.tensorflow.base_version}-cp27-cp27mu-linux_x86_64.whl
#--user
Expand Down
75 changes: 58 additions & 17 deletions providers/install.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,8 @@
cuda = ::File.basename(node.cuda.url)

case node.platform_family
#when "debian"
when "debian"

when "rhel"
bash "install_cuda_preliminaries" do
user "root"
timeout 72000
code <<-EOF
set -e
yum install kernel-devel-$(uname -r) kernel-headers-$(uname -r)
EOF
end
end

bash "install_cuda" do
user "root"
Expand All @@ -35,10 +25,64 @@
code <<-EOF
set -e
cd #{Chef::Config[:file_cache_path]}
./#{patch} --silent --accept-eula --verbose
./#{patch} --silent --accept-eula
EOF
not_if { ::File.exists?( "/usr/local/cuda/version.txt" ) }
end


when "rhel"

# Install the unversioned kernel-devel / kernel-headers packages and libglvnd-glx with yum,
# ahead of the CUDA RPM install. Skipped once /usr/local/cuda/version.txt exists.
bash "install_cuda_preliminaries" do
  user "root"
  code <<-EOF
set -e
# versioned header install doesnt work
# yum install -y kernel-devel-$(uname -r)
# yum install -y kernel-headers-$(uname -r)
yum install kernel-devel -y
yum install kernel-headers -y
yum install libglvnd-glx -y
  EOF
  # File.exist? instead of File.exists?: the latter is deprecated and removed in Ruby 3.2.
  not_if { ::File.exist?( "/usr/local/cuda/version.txt" ) }
end

# Download the local CUDA 8.0 repo RPM, register it, and install the `cuda` meta package with yum.
# Afterwards make sure /usr/lib64/libcuda.so exists (symlinked from /usr/lib64/nvidia if needed)
# and delete the downloaded RPM. Skipped once /usr/lib64/libcuda.so exists.
bash "install_cuda_rpm" do
  # Single source of truth for the RPM file name used by wget and rpm below.
  cuda_repo_rpm = "cuda-repo-rhel7-8-0-local-ga2-8.0.61-1.x86_64.rpm"
  user "root"
  timeout 72000
  code <<-EOF
set -e
cd #{Chef::Config[:file_cache_path]}
# -O overwrites a leftover file from an earlier failed run; plain wget would save the new
# download as "<name>.1" and rpm would then install the stale copy.
wget -O #{cuda_repo_rpm} #{node['download_url']}/#{cuda_repo_rpm}
rpm -ivh --replacepkgs #{cuda_repo_rpm}
yum clean expire-cache
yum install cuda -y
if [ ! -f /usr/lib64/libcuda.so ] ; then
ln -s /usr/lib64/nvidia/libcuda.so /usr/lib64
fi
rm -f cuda-repo-rhel*
  EOF
  # File.exist? instead of File.exists?: the latter is deprecated and removed in Ruby 3.2.
  not_if { ::File.exist?( "/usr/lib64/libcuda.so" ) }
end

# Download and register the local repo RPM for the CUDA 8.0 cuBLAS performance update, then
# delete the downloaded file. The yum steps that would install from that repo remain commented out.
bash "install_cuda_rpm_patch" do
  # Single source of truth for the RPM file name used by wget and rpm below.
  cuda_patch_rpm = "cuda-repo-rhel7-8-0-local-cublas-performance-update-8.0.61-1.x86_64.rpm"
  user "root"
  timeout 72000
  code <<-EOF
set -e
cd #{Chef::Config[:file_cache_path]}
# -O overwrites a leftover file from an earlier failed run instead of saving "<name>.1".
wget -O #{cuda_patch_rpm} #{node['download_url']}/#{cuda_patch_rpm}
rpm -ivh --replacepkgs #{cuda_patch_rpm}
#yum clean expire-cache
#yum inst cuda -y
rm -f cuda-repo-rhel*
  EOF
  #not_if { ::File.exists?( "/usr/lib64/libcuda.so" ) }
  # Without a guard this resource re-downloads and re-installs the RPM on every converge.
  # Skip it once the repo package is registered. NOTE(review): the package name is assumed to be
  # the RPM file name minus "-<version>-<release>.<arch>.rpm" - confirm with `rpm -qa | grep cuda-repo`.
  # If the name is wrong, `rpm -q` simply fails and the resource runs as it did before.
  not_if "rpm -q cuda-repo-rhel7-8-0-local-cublas-performance-update"
end


end


end

Expand All @@ -58,22 +102,19 @@
cd #{Chef::Config[:file_cache_path]}
tar zxf #{cached_cudnn_file}
cp -rf cuda/lib64 /usr
cp -rf cuda/lib64/* /usr/local/cuda/lib64/
cp -rf cuda/include/* /usr/include
chmod a+r /usr/include/cudnn.h /usr/lib64/libcudnn*
chmod a+r /usr/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
EOF
not_if { ::File.exists?( "/usr/include/cudnn.h" ) }
end



end




action :cpu do
0
if node.tensorflow.install == "dist"
bash "install_tf_cpu" do
user "root"
Expand Down

0 comments on commit 2ea406b

Please sign in to comment.