## System Basics

### Python

In [None]:
# Print the interpreter path of the running kernel, then the version of the
# `python` executable that the shell resolves (queried independently).
import sys
print(sys.executable)
!python --version

### Conda

In [None]:
# Print conda's installation / environment summary via the shell.
!conda info

### Memory Usage

In [None]:
# Memory statistics for the current kernel process, looked up by its own PID.
import psutil, os
process = psutil.Process(os.getpid())
process.memory_info()

### Memory Test

In [None]:
!pip install memory_profiler
%load_ext memory_profiler

In [None]:
# System-wide memory statistics as reported by psutil.
import psutil
psutil.virtual_memory()

In [None]:
# Measure the memory used while building a 100,000,000-character string.
# %memit is provided by the memory_profiler extension loaded earlier.
%memit some_str = ' ' * 100000000

### Numpy Config

In [None]:
# Print the build configuration NumPy reports for itself.
import numpy as np
np.show_config()

### MKL Support

In [None]:
# Ask TensorFlow whether this build was compiled with MKL enabled.
# NOTE(review): pywrap_tensorflow lives under tensorflow.python (a private
# namespace); its location may differ between TensorFlow versions — confirm
# against the installed version.
from tensorflow.python import pywrap_tensorflow
pywrap_tensorflow.IsMklEnabled()

In [None]:
# Report whether the installed PyTorch build has MKL available.
import torch
print(torch.backends.mkl.is_available())

# svd is only supported on MKL?
# torch.svd(torch.tensor(np.random.randn(10,10),  dtype=torch.float))

In [None]:
# Query MXNet's runtime feature list for the 'MKLDNN' feature flag.
import mxnet.runtime
fs=mxnet.runtime.Features()
fs.is_enabled('MKLDNN')

## Numpy Benchmark

In [None]:
# NumPy linear-algebra micro-benchmark.
# Each section repeats one operation N times and prints the mean wall-clock
# time per call. Inputs are released with `del` once a section is done
# (F and G are kept alive until the end of the cell).
import numpy as np
from time import time

print("Start")
# Let's take the randomness out of random numbers (for reproducibility)
np.random.seed(0)

# Base dimension; every test input below is derived from it.
size = 4096
A, B = np.random.random((size, size)), np.random.random((size, size))
C, D = np.random.random((size * 128,)), np.random.random((size * 128,))
E = np.random.random((int(size / 2), int(size / 4)))
F = np.random.random((int(size / 2), int(size / 2)))
# F . F^T is symmetric, which is the kind of input the Cholesky test expects.
F = np.dot(F, F.T)
G = np.random.random((int(size / 2), int(size / 2)))

# Matrix multiplication
N = 20
t = time()
for i in range(N):
    np.dot(A, B)
delta = time() - t
print('Dotted two %dx%d matrices in %0.2f s.' % (size, size, delta / N))
del A, B

# Vector multiplication (reported in milliseconds, hence the 1e3 factor)
N = 5000
t = time()
for i in range(N):
    np.dot(C, D)
delta = time() - t
print('Dotted two vectors of length %d in %0.2f ms.' % (size * 128, 1e3 * delta / N))
del C, D

# Singular Value Decomposition (SVD)
N = 3
t = time()
for i in range(N):
    np.linalg.svd(E, full_matrices = False)
delta = time() - t
print("SVD of a %dx%d matrix in %0.2f s." % (size / 2, size / 4, delta / N))
del E

# Cholesky Decomposition
N = 3
t = time()
for i in range(N):
    np.linalg.cholesky(F)
delta = time() - t
print("Cholesky decomposition of a %dx%d matrix in %0.2f s." % (size / 2, size / 2, delta / N))

# Eigendecomposition (reuses N = 3 from the Cholesky section)
t = time()
for i in range(N):
    np.linalg.eig(G)
delta = time() - t
print("Eigendecomposition of a %dx%d matrix in %0.2f s." % (size / 2, size / 2, delta / N))

## Numba Benchmark

In [None]:
from numba import jit
import numpy as np
import time

# 10x10 integer test matrix holding 0..99.
x = np.arange(100).reshape(10, 10)

@jit(nopython=True)
def go_fast(a):
    """Add the sum of tanh over the main diagonal of `a` to every element of `a`.

    Decorated with numba's nopython JIT, so the loop is compiled on first call.
    """
    diag_tanh_sum = 0
    n_rows = a.shape[0]
    for row in range(n_rows):
        diag_tanh_sum += np.tanh(a[row, row])
    return a + diag_tanh_sum

# First call: do not report this number as the execution time, because
# JIT compilation happens during this call and is included in it.
start = time.time()
go_fast(x)
end = time.time()
print("Elapsed (with compilation) = %s" % (end - start))

# Second call with the same argument: the function is already compiled,
# so this measures execution only.
start = time.time()
go_fast(x)
end = time.time()
print("Elapsed (after compilation) = %s" % (end - start))

## Jax Lib

In [None]:
import jax.numpy as np
from jax import jit

def slow_f(x):
  # Element-wise ops see a large benefit from fusion
  return x * x + x * 2.0

x = np.ones((5000, 5000))
fast_f = jit(slow_f)
%timeit -n10 -r3 fast_f(x)
%timeit -n10 -r3 slow_f(x) 

## Tensorflow / Keras

In [None]:
from __future__ import print_function
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
import time

# Rebind `tf` to the TF1 compatibility API and switch off TF2 behaviour;
# the benchmark below relies on tf.Session / tf.ConfigProto.
# NOTE(review): disable_v2_behavior() presumably affects the whole kernel,
# including later cells that import tensorflow again — confirm.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

def get_times(maximum_time, matrix_sizes=None):
    """Time square float16 matrix multiplications of increasing size per device.

    For every size in `matrix_sizes` and every device in the internal device
    table, two random (size, size) matrices are multiplied in a fresh TF1
    session and the wall-clock time of `session.run` is recorded.

    Args:
        maximum_time: Stop as soon as a single run takes longer than this
            many seconds.
        matrix_sizes: Iterable of matrix sizes to try. Defaults to
            ``range(500, 50000, 50)``.

    Returns:
        Tuple ``(device_times, matrix_sizes)``: a dict mapping device name to
        the list of measured times (in the order the sizes were tried), and
        the sizes that were iterated over. The tuple is returned both when
        the time budget is exceeded and when every size has been measured.
    """
    if matrix_sizes is None:
        matrix_sizes = range(500,50000,50)

    device_times = {
        # "/gpu:0":[],
        "/cpu:0":[]
    }

    for size in matrix_sizes:
        for device_name in device_times.keys():

            print("####### Calculating on the " + device_name + " #######")

            shape = (size,size)
            data_type = tf.float16
            # Build the ops pinned to the device under test.
            with tf.device(device_name):
                r1 = tf.random.uniform(shape=shape, minval=0, maxval=1, dtype=data_type)
                r2 = tf.random.uniform(shape=shape, minval=0, maxval=1, dtype=data_type)
                dot_operation = tf.matmul(r2, r1)

            # Only the session.run call is timed, not graph construction.
            with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as session:
                start_time = time.time()
                result = session.run(dot_operation)
                time_taken = time.time() - start_time
                print(result)
                device_times[device_name].append(time_taken)

            print(device_times)

            if time_taken > maximum_time:
                return device_times, matrix_sizes

    # Every size finished within the budget: still return the measurements
    # (previously the function fell off the end and returned None, which
    # broke tuple unpacking at the call site).
    return device_times, matrix_sizes


# Run the benchmark until a single matmul takes longer than 1.5 s.
device_times, matrix_sizes = get_times(1.5)

# The GPU curve is optional: it is only drawn when the device was benchmarked.
if "/gpu:0" in device_times:
    gpu_times = device_times["/gpu:0"]
    plt.plot(matrix_sizes[:len(gpu_times)], gpu_times,
             marker='o', linestyle='-', color='blue')

# One point per measured size; sizes are truncated to the runs that happened.
cpu_times = device_times["/cpu:0"]
plt.plot(matrix_sizes[:len(cpu_times)], cpu_times,
         marker='o', linestyle='-', color='green')

plt.xlabel('Matrix size')
plt.ylabel('Time')
plt.show()

In [None]:
import tensorflow as tf
mnist = tf.keras.datasets.mnist

# Load MNIST and scale the pixel values by 1/255.
(x_train, y_train),(x_test, y_test) = mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0

# Flatten -> Dense(512, relu) -> Dropout(0.2) -> Dense(10, softmax)
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(512, activation=tf.nn.relu))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for five epochs, then report loss/accuracy on the held-out split.
model.fit(x_train, y_train, epochs=5)
model.evaluate(x_test, y_test)

## Pytorch

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np

class Model(nn.Module):
    """A typical CNN text classification model.

    Token ids are embedded, passed through one Conv1d per kernel size,
    max-pooled over the sequence dimension, concatenated, regularised with
    dropout and projected to `num_classes` logits.

    Args:
        vocab_size: Number of rows of the embedding table.
        num_classes: Size of the output (logit) dimension.
        embedding_dim: Width of each token embedding.
        kernel_sizes: Iterable of Conv1d kernel widths; one convolution is
            created per entry.
        num_filters: Output channels of every convolution.
        dropout_prob: Dropout probability applied before the final layer.
    """
    def __init__(self, vocab_size, num_classes, embedding_dim=100,
                 kernel_sizes=(3, 4, 5), num_filters=64, dropout_prob=0.2):
        # The default for kernel_sizes is an immutable tuple (not a list) so
        # the default object can never be mutated and shared between instances.
        super(Model, self).__init__()
        kernel_sizes = tuple(kernel_sizes)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # embedding layer
        self.convs = nn.ModuleList([
            nn.Conv1d(embedding_dim, num_filters, k) for k in kernel_sizes])
        self.dropout = nn.Dropout(dropout_prob)  # a dropout layer
        self.fc1 = nn.Linear(len(kernel_sizes)*num_filters, num_classes)  # a dense layer for classification

    @staticmethod
    def conv_and_max_pool(x, conv):
        """Apply `conv` to `x`, take the max over the last dimension, then ReLU.

        With x of shape (batch, channels, seq_len) the result has shape
        (batch, conv.out_channels).
        """
        # Pooling over dim 2 directly is equivalent to permuting to
        # (batch, seq, channels) and pooling over dim 1.
        return F.relu(conv(x).max(2)[0])

    def forward(self, inputs):
        """Map a (batch, seq_len) tensor of token ids to (batch, num_classes) logits."""
        # Conv1d takes in (batch, channels, seq_len), but raw embedded is (batch, seq_len, channels)
        embedded = self.embedding(inputs).permute(0, 2, 1)
        x = [self.conv_and_max_pool(embedded, k) for k in self.convs]  # convolution and global max pooling
        x = self.fc1(self.dropout(torch.cat(x, 1)))  # concatenation and dropout
        return x

vocab_size = 1000
seq_length = 200
batch_size = 8

input = torch.tensor(np.random.randint(0, 1000, size=(batch_size, seq_length)))
model = Model(vocab_size, 10, num_filters=64, embedding_dim=128, dropout_prob=0.1)

# Should be a few milliseconds
%timeit model(input)

In [None]:
import torch

device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# Batch size, input width, hidden width and output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs/targets, followed by randomly initialised weights.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = torch.clamp(h, min=0)
    y_pred = h_relu @ w2

    # Sum-of-squares loss; a 0-dim tensor whose Python value is loss.item().
    loss = ((y_pred - y) ** 2).sum()
    print(t, loss.item())

    # Manual backward pass: gradients of the loss w.r.t. w2, then w1.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t() @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.t()
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = x.t() @ grad_h

    # Plain gradient-descent step, applied in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

## XGBoost

In [None]:
# Train a small XGBoost model (100 boosting rounds, learning rate 0.01) on the
# Boston dataset shipped with shap, and display the resulting booster.
# NOTE(review): the Boston dataset loader may be missing from newer shap
# releases — confirm against the installed version.
import shap
import xgboost
X,y = shap.datasets.boston()
model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X, label=y), 100)
model

## MXNet

In [None]:
# Minimal MXNet check: build a 2x3 array of ones, compute a*2+1 and convert
# the result to a NumPy array for display.
import mxnet as mx
a = mx.nd.ones((2, 3))
b = a * 2 + 1
b.asnumpy()

## Fastai

In [None]:
# fastai smoke test: download the MNIST sample, build an image data bunch and
# train a resnet18-based learner for one cycle.
# NOTE(review): the star imports supply untar_data, URLs, rand_pad,
# ImageDataBunch, imagenet_stats, cnn_learner, models and accuracy — presumably
# all from fastai.vision; confirm before narrowing the imports.
from fastai.script import *
from fastai.vision import *
from fastai.distributed import *

path = untar_data(URLs.MNIST_SAMPLE)
# Transform pair passed as ds_tfms: rand_pad(2, 28) first, an empty list second.
tfms = (rand_pad(2, 28), [])
data = ImageDataBunch.from_folder(path, ds_tfms=tfms, bs=64).normalize(imagenet_stats)
learn = cnn_learner(data, models.resnet18, metrics=accuracy)
learn.fit_one_cycle(1, 0.02)

In [None]:
# Print fastai's installation report, then its performance check.
import fastai.utils
fastai.utils.show_install(True)

# Temp workaround to fix missing PILLOW_VERSION on 7.0.0
# `import PIL.Image` (not just `import PIL`) is required: importing the
# package alone does not load the Image submodule, so `PIL.Image` would only
# resolve if some other import had already loaded it.
import PIL.Image
PIL.Image.PILLOW_VERSION = PIL.__version__

fastai.utils.check_perf()

## Theano

In [None]:
# Minimal Theano check: compile a function computing sum(x * W) for a
# symbolic float vector x and a shared weight vector W, then evaluate it.
import theano
import numpy
 
x = theano.tensor.fvector('x')
W = theano.shared(numpy.asarray([0.2, 0.7]), 'W')
y = (x * W).sum()
 
f = theano.function([x], y)
 
# With x = [1.0, 1.0] this evaluates 0.2 + 0.7.
output = f([1.0, 1.0])
output

## Fasttext

In [None]:
# fasttext via python api: import the package and show its built-in help text.
import fasttext
help(fasttext)

## Spacy

In [None]:
# spaCy smoke test: load the 'en' model, parse one sentence and print the
# per-token annotations.
# NOTE(review): the 'en' shortcut name may not resolve on newer spaCy
# releases — confirm against the installed version/model.
import spacy
spacy.prefer_gpu()
nlp = spacy.load('en')
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
for token in doc:
    # text, lemma, coarse POS, fine-grained tag, dependency label,
    # word shape, is-alphabetic flag, is-stop-word flag
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

## GPU Support

In [None]:
# Basic GPU test: run nvidia-smi through the shell and show its output.
!nvidia-smi

In [None]:
# CUDA version as reported by the nvcc compiler on the shell PATH.
!nvcc --version

In [None]:
import subprocess


def parse_nvidia_smi_query(text):
    """Extract GPU names and key/value pairs from `nvidia-smi -q` output.

    Only lines that contain exactly one ':' are treated as ``key : value``
    pairs; every other line (blank lines, section headers, values that
    themselves contain a colon) is skipped.

    Args:
        text: Decoded stdout of ``nvidia-smi -q`` (may be empty).

    Returns:
        Tuple ``(gpus, out_dict)``. `gpus` lists the value of every
        'Product Name' line in order of appearance. `out_dict` maps
        ``"<key>_<n>"`` to the value, where n is the number of
        'Product Name' lines seen up to and including that line.
    """
    gpus = []
    out_dict = {}
    count_gpu = 0
    for item in text.split('\n'):
        parts = item.split(':')
        if len(parts) != 2:
            # Not a simple "key : value" line.
            continue
        key, val = parts[0].strip(), parts[1].strip()
        if key == 'Product Name':
            count_gpu += 1
            gpus.append(val)
        out_dict[key + "_" + str(count_gpu)] = val
    return gpus, out_dict


# Run the query; a missing or non-executable nvidia-smi is reported and
# treated as "no GPUs" instead of aborting the cell with a traceback.
try:
    sp = subprocess.Popen(['nvidia-smi', '-q'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out_str = sp.communicate()
    smi_text = out_str[0].decode("utf-8")
except OSError as error:
    print("Could not run nvidia-smi:", error)
    smi_text = ""

gpus, out_dict = parse_nvidia_smi_query(smi_text)
gpus

### Test Theano GPU Support

In [None]:
import os
import sys

# Theano reads THEANO_FLAGS when the package is first imported. If an earlier
# cell already imported theano in this kernel, the setting below is too late
# and the test would silently run on the previously configured device.
if 'theano' in sys.modules:
    print('WARNING: theano is already imported in this kernel; the THEANO_FLAGS '
          'set below will be ignored. Restart the kernel and run this cell first.')
os.environ["THEANO_FLAGS"] = 'device=cuda,floatX=float32'
from theano import function, config, shared, tensor
import numpy
import time

vlen = 10 * 30 * 768  # 10 x #cores x # threads per core
iters = 1000

# Fixed seed so every run works on the same input vector.
rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([], tensor.exp(x))
print(f.maker.fgraph.toposort())
t0 = time.time()
for i in range(iters):
    r = f()
t1 = time.time()
print("Looping %d times took %f seconds" % (iters, t1 - t0))
print("Result is %s" % (r,))
# If any element-wise op in the compiled graph is not a 'Gpu*' op, the
# computation ran on the CPU. The loop variable is named `node` so it is not
# confused with the shared variable `x` above.
if any(isinstance(node.op, tensor.Elemwise) and
       ('Gpu' not in type(node.op).__name__)
       for node in f.maker.fgraph.toposort()):
    print('Used the cpu')
else:
    print('Used the gpu')

### Test Tensorflow GPU Support

In [None]:
# Print every local device (CPU/GPU) that TensorFlow can see.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
# Print whether TensorFlow reports a usable GPU, and the GPU device name.
import tensorflow as tf
print(tf.test.is_gpu_available())
print(tf.test.gpu_device_name())

### Test PyTorch GPU Support

In [None]:
import torch

# Print the number of CUDA devices PyTorch sees, then one name per device.
gpu_count = torch.cuda.device_count()
print(gpu_count)
for device_index in range(gpu_count):
    print(torch.cuda.get_device_name(device_index))

### Test Spacy GPU Support

In [None]:
# Ask spaCy to use the GPU; unlike prefer_gpu(), this variant is expected to
# fail when no GPU can be used (NOTE(review): confirm for the installed version).
import spacy
spacy.require_gpu()

### Test XGBoost GPU Support

In [None]:
# Train XGBoost with the 'gpu_hist' tree method through both of its APIs.
# NOTE(review): load_boston may be missing from newer scikit-learn releases —
# confirm against the installed version.
import xgboost as xgb
from sklearn.datasets import load_boston

boston = load_boston()

# XGBoost API example
params = {'tree_method': 'gpu_hist', 'max_depth': 3, 'learning_rate': 0.1}
dtrain = xgb.DMatrix(boston.data, boston.target)
# The training set doubles as the evaluation set, labelled "train".
xgb.train(params, dtrain, evals=[(dtrain, "train")])

# sklearn API example
gbm = xgb.XGBRegressor(silent=False, n_estimators=10, tree_method='gpu_hist')
gbm.fit(boston.data, boston.target, eval_set=[(boston.data, boston.target)])

### Test MXNet GPU

In [None]:
# Same arithmetic check as the CPU MXNet cell, but the array is created on
# the mx.gpu() context.
import mxnet as mx
a = mx.nd.ones((2, 3), mx.gpu())
b = a * 2 + 1
b.asnumpy()

### Test RAPIDS

#### cudf

In [None]:
# cudf smoke test: download the tips CSV over HTTP, load it into a GPU
# DataFrame and aggregate it. Requires network access.
import cudf, io, requests
from io import StringIO

url="https://github.com/plotly/datasets/raw/master/tips.csv"
content = requests.get(url).content.decode('utf-8')

tips_df = cudf.read_csv(StringIO(content))
# Tip as a percentage of the total bill.
tips_df['tip_percentage'] = tips_df['tip']/tips_df['total_bill']*100

# display average tip by dining party size
print(tips_df.groupby('size').tip_percentage.mean())

#### cuml

In [None]:
# cuml smoke test: cluster three 3-dimensional points with DBSCAN.
import cudf
from cuml.cluster import DBSCAN

# Create and populate a GPU DataFrame
gdf_float = cudf.DataFrame()
gdf_float['0'] = [1.0, 2.0, 5.0]
gdf_float['1'] = [4.0, 2.0, 1.0]
gdf_float['2'] = [4.0, 2.0, 1.0]

# Setup and fit clusters
dbscan_float = DBSCAN(eps=1.0, min_samples=1)
dbscan_float.fit(gdf_float)

# One cluster label per input row.
print(dbscan_float.labels_)

#### custrings

In [None]:
# nvstrings / nvcategory smoke test on the tips CSV. Requires network access.
import nvstrings, nvcategory
import requests

url="https://github.com/plotly/datasets/raw/master/tips.csv"
content = requests.get(url).content.decode('utf-8')

# split content into a list of lines, dropping the header row
host_lines = content.strip().split('\n')[1:]

# copy strings to gpu
gpu_lines = nvstrings.to_device(host_lines)

# split into columns on gpu; index 4 is used as the day-of-week column
gpu_columns = gpu_lines.split(',')
gpu_day_of_week = gpu_columns[4]

# use gpu `replace` to re-encode each day token as its list index (as a string)
for idx, day in enumerate(['Sun', 'Mon', 'Tues', 'Wed', 'Thur', 'Fri', 'Sat']):
    gpu_day_of_week = gpu_day_of_week.replace(day, str(idx))

# or, use nvcategory's builtin GPU categorization
cat = nvcategory.from_strings(gpu_columns[4])

# copy category keys to host and print
print(cat.keys())

# copy "cleaned" strings to host and print
print(gpu_day_of_week)

#### dask-cuda

In [None]:
# Start a local dask-cuda cluster and connect a distributed client to it.
# NOTE(review): local_dir presumably selects the workers' scratch directory,
# and newer dask releases may expect a different keyword — confirm.
from dask_cuda import LocalCUDACluster
from dask.distributed import Client

cluster = LocalCUDACluster(local_dir="/tmp/dask")
client = Client(cluster)

### Test PyCuda

In [None]:
# PyCUDA smoke test: allocate device memory for a 4x4 float32 array and copy
# the host array into it.
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
a = numpy.random.randn(4,4)
# Convert to float32 before sizing the device allocation from a.nbytes.
a = a.astype(numpy.float32)
a_gpu = cuda.mem_alloc(a.nbytes)
# Host -> device copy.
cuda.memcpy_htod(a_gpu, a)

### Test pynvml

In [None]:
from pynvml import *

# List every NVML-visible device by index and name.
# nvmlInit() is inside the try block so that an initialisation failure is
# reported through the same NVMLError handler as the device queries instead
# of aborting the cell with a traceback.
try:
    nvmlInit()
    deviceCount = nvmlDeviceGetCount()
    for i in range(deviceCount):
        handle = nvmlDeviceGetHandleByIndex(i)
        print("Device", i, ":", nvmlDeviceGetName(handle))
except NVMLError as error:
    print(error)

### Test py3nvml

In [None]:
# Query py3nvml for free GPUs and report when none is free, then print the
# py3smi status output.
import py3nvml
free_gpus = py3nvml.get_free_gpus()
# The check below treats free_gpus as a collection of booleans.
if True not in free_gpus:
    print('No free gpus found')
!py3smi

### Test gputil

In [None]:
# Print the GPU utilisation table produced by GPUtil.
import GPUtil
GPUtil.showUtilization()

### Test GPUStat

In [None]:
# Show GPU status via the gpustat command-line tool.
!gpustat

### Release GPU Memory

In [None]:
import torch
def consume_gpu(n):
    """Create an n x n tensor of ones and return its copy on the CUDA device."""
    return torch.ones((n,n)).cuda()
# gpustat is printed three times: before allocating, after allocating (the
# returned tensor is not kept in a variable), and after asking PyTorch to
# empty its CUDA cache.
!gpustat
consume_gpu(1000)
!gpustat
torch.cuda.empty_cache()
!gpustat

In [None]:
# Select GPU 0 through numba, close the CUDA context, select the device
# again and print the GPU status.
# NOTE(review): presumably this frees GPU memory held by this process's CUDA
# context — confirm its effect on other libraries' GPU state in the kernel.
from numba import cuda
cuda.select_device(0)
cuda.close()
cuda.select_device(0)
!gpustat

### Links & Resources
- https://docs.fast.ai/dev/gpu.html
- https://docs.fast.ai/troubleshoot.html#memory-leakage-on-exception

## Optional Installs

### Starspace

In [None]:
# Install starspace using the installer script under /resources/tools
!/resources/tools/starspace.sh --install
# Starspace command line instructions:
!starspace --help

### Fasttext

In [None]:
# Install fasttext using the installer script under /resources/tools
!/resources/tools/fasttext.sh --install
# Fasttext command line instructions:
!fasttext --help

### Pillow SIMD
Pillow-SIMD is a replacement for Pillow that provides faster image processing. More information: https://docs.fast.ai/performance.html#faster-image-processing

In [None]:
# Install pillow simd using the installer script under /resources/tools
!/resources/tools/pillow-simd.sh --install

In [None]:
# Check pillow install
# You most likely need to restart the kernel after pillow simd installation

# Temp workaround to fix missing PILLOW_VERSION on 7.0.0
# `import PIL.Image` (not just `import PIL`) is required here: on a freshly
# restarted kernel nothing has loaded the Image submodule yet, so a bare
# `import PIL` followed by `PIL.Image` raises AttributeError.
import PIL.Image
PIL.Image.PILLOW_VERSION = PIL.__version__

import fastai.utils
fastai.utils.check_perf()

### CNTK

_CNTK does not work with Python 3.7_

In [None]:
# Install cntk using the installer script under /resources/tools
!/resources/tools/cntk.sh --install

In [None]:
# CNTK smoke test: print the version and evaluate an element-wise
# subtraction of two 3-element lists.
import cntk
print(cntk.__version__)
cntk.minus([1, 2, 3], [4, 5, 6]).eval()

In [None]:
from __future__ import print_function
import numpy as np
import cntk as C
from cntk.learners import sgd
from cntk.logging import ProgressPrinter
from cntk.layers import Dense, Sequential

def generate_random_data(sample_size, feature_dim, num_classes):
    """Draw a synthetic, class-separable classification batch with NumPy.

    Args:
        sample_size: Number of samples (rows) to generate.
        feature_dim: Number of features per sample.
        num_classes: Number of distinct classes.

    Returns:
        Tuple ``(X, Y)`` of float32 arrays: features of shape
        (sample_size, feature_dim) and one-hot labels of shape
        (sample_size, num_classes).
    """
    # Integer class id per sample, kept as a column so it broadcasts below.
    class_ids = np.random.randint(low=0, high=num_classes, size=(sample_size, 1))

    # Gaussian noise shifted by 3 and scaled by (class id + 1) keeps the
    # classes apart.
    noise = np.random.randn(sample_size, feature_dim)
    features = ((noise + 3) * (class_ids + 1)).astype(np.float32)

    # One-hot encode: class 0 -> "1 0 0", class 1 -> "0 1 0", ...
    one_hot = (class_ids == np.arange(num_classes)).astype(np.float32)
    return features, one_hot

def ffnet():
    """Train and evaluate a small feed-forward classifier on synthetic data.

    Builds a Dense(50, sigmoid) -> Dense(2) network, trains it with SGD for
    1024 minibatches of 25 freshly generated samples each, then tests it on
    one more generated minibatch.

    Returns:
        Tuple ``(last_avg_error, avg_error)``: the sample-weighted mean of the
        per-minibatch training loss, and the value returned by
        ``trainer.test_minibatch`` for the unseen minibatch.
    """
    inputs = 2
    outputs = 2
    # NOTE(review): `layers` is not referenced anywhere else in this function.
    layers = 2
    hidden_dimension = 50

    # input variables denoting the features and label data
    features = C.input_variable((inputs), np.float32)
    label = C.input_variable((outputs), np.float32)

    # Instantiate the feedforward classification model
    my_model = Sequential ([
                    Dense(hidden_dimension, activation=C.sigmoid),
                    Dense(outputs)])
    z = my_model(features)

    # Training loss (ce) and evaluation metric (pe), both computed from z.
    ce = C.cross_entropy_with_softmax(z, label)
    pe = C.classification_error(z, label)

    # Instantiate the trainer object to drive the model training
    lr_per_minibatch = C.learning_parameter_schedule(0.125)
    progress_printer = ProgressPrinter(0)
    trainer = C.Trainer(z, (ce, pe), [sgd(z.parameters, lr=lr_per_minibatch)], [progress_printer])

    # Get minibatches of training data and perform model training
    minibatch_size = 25
    num_minibatches_to_train = 1024

    aggregate_loss = 0.0
    for i in range(num_minibatches_to_train):
        train_features, labels = generate_random_data(minibatch_size, inputs, outputs)
        # Specify the mapping of input variables in the model to actual minibatch data to be trained with
        trainer.train_minibatch({features : train_features, label : labels})
        sample_count = trainer.previous_minibatch_sample_count
        # Weight each minibatch's average loss by its sample count.
        aggregate_loss += trainer.previous_minibatch_loss_average * sample_count

    last_avg_error = aggregate_loss / trainer.total_number_of_samples_seen

    test_features, test_labels = generate_random_data(minibatch_size, inputs, outputs)
    avg_error = trainer.test_minibatch({features : test_features, label : test_labels})
    print(' error rate on an unseen minibatch: {}'.format(avg_error))
    return last_avg_error, avg_error

# Fixed seed so the generated data is the same on every run.
np.random.seed(98052)
ffnet()

#### Test CNTK GPU Support

In [None]:
# Try to make GPU 0 the default CNTK device; the call's return value is
# displayed as the cell output.
from cntk.device import try_set_default_device, gpu
try_set_default_device(gpu(0))

### Catboost

In [None]:
!pip install -U catboost

In [None]:
# CatBoost smoke test on a three-row toy dataset with categorical columns.
from catboost import CatBoostClassifier
# Initialize data
# Column indices 0, 1 and 2 are declared as categorical features.
cat_features = [0,1,2]
train_data = [["a","b",1,4,5,6],["a","b",4,5,6,7],["c","d",30,40,50,60]]
train_labels = [1,1,-1]
test_data = [["a","b",2,4,6,8],["a","d",1,4,50,60]]
# Initialize CatBoostClassifier
# Training artefacts are written under /tmp/catboost/.
model = CatBoostClassifier(iterations=2, 
                           learning_rate=1, 
                           depth=2, 
                           loss_function='Logloss', 
                           train_dir="/tmp/catboost/")
# Fit model
model.fit(train_data, train_labels, cat_features)
# Get predicted classes
preds_class = model.predict(test_data)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(test_data)
# Get predicted RawFormulaVal
preds_raw = model.predict(test_data, prediction_type='RawFormulaVal')

#### Test Catboost GPU Support

In [None]:
# CatBoost GPU smoke test: fit a classifier on a four-row toy dataset with
# task_type "GPU" on device '0'.
from catboost import CatBoostClassifier

train_data = [[0, 3],
              [4, 1],
              [8, 1],
              [9, 1]]
train_labels = [0, 0, 1, 1]

# Training artefacts are written under /tmp/catboost/.
model = CatBoostClassifier(iterations=1000, 
                           task_type = "GPU",
                           devices='0',
                           train_dir="/tmp/catboost/")
model.fit(train_data,
          train_labels,
          verbose = False)