## Import Libary

In [1]:
import numpy as np
import pandas as pd
from numba import njit, prange, jit
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import matplotlib
matplotlib.rc('figure', figsize=(10, 5))

## Read Data

In [2]:
df_data = pd.read_csv('../data/CC GENERAL.csv')
df_data = df_data.drop('CUST_ID', axis=1)
df_data_np = df_data.to_numpy()

In [3]:
def highlight_minimum_time(s):
    is_min = s == s.min()
    return ['background-color: yellow' if v else '' for v in is_min]

## Preprocessing

### Standard CPU

In [4]:
@njit(parallel=False)
def standard(df):
    n = df.shape[1]
    res = np.empty_like(df, dtype=np.float64)

    for i in range(n):
        data_i = df[:, i]
        res[:, i] = (data_i - np.mean(data_i)) / np.std(data_i)

    return res

### Standard CPU Parallel

In [5]:
@jit(nopython=True, parallel=True)
def standard_parallel(df):
    n = df.shape[1]
    res = np.empty_like(df, dtype=np.float64)

    for i in prange(n):
        data_i = df[:, i]
        res[:, i] = (data_i - np.mean(data_i)) / np.std(data_i)

    return res

### Expected result using Sklearn

In [6]:
expected = StandardScaler().fit_transform(df_data_np)

### Output result running by CPU

In [7]:
output = standard(df_data_np)

### Comparasion diff

In [8]:
output - expected

array([[-1.66533454e-15, -4.17166302e-14, -1.08246745e-14, ...,
                    nan, -1.66533454e-15, -6.66133815e-16],
       [-1.33226763e-15, -2.29816166e-14, -1.18238752e-14, ...,
                    nan, -4.63518113e-15, -6.66133815e-16],
       [-1.44328993e-15, -4.21884749e-15, -3.89965837e-15, ...,
                    nan, -1.66533454e-15, -6.66133815e-16],
       ...,
       [-1.77635684e-15, -3.86080057e-14, -1.03250741e-14, ...,
                    nan, -4.99600361e-15,  7.99360578e-15],
       [-1.66533454e-15, -3.86080057e-14, -1.18238752e-14, ...,
                    nan, -4.99600361e-15,  7.99360578e-15],
       [-1.66533454e-15, -7.30526750e-14, -6.17561557e-16, ...,
                    nan, -1.66533454e-15,  7.99360578e-15]])

### Output result running by CPU Parallel

In [9]:
output_parallel = standard_parallel(df_data_np)

In [10]:
#output_parallel.parallel_diagnostics(level=4)

AttributeError: 'numpy.ndarray' object has no attribute 'parallel_diagnostics'

### Comparasion diff

In [None]:
output_parallel - expected

## Measure running time with different strategies

### Measure time with running with increasing number of columns

In [None]:
res = []
multiples = range(1, 30, 4)

for idx, i in enumerate(multiples):
    data = np.tile(df_data_np, i)
    
    o_1 = %timeit -o -q StandardScaler().fit_transform(data)
    o_2 = %timeit -o -q standard(data)
    o_3 = %timeit -o -q standard_parallel(data)
    
    res.append((data.shape[1], o_1.best, o_2.best, o_3.best))
    print('{0} of {1} complete {2}'.format(idx + 1, len(multiples), data.shape))

In [None]:
df_viz = pd.DataFrame(res, columns = ['num_cols', 'sklearn', 'CPU', 'numba CPU parallel'])

df_viz = df_viz.set_index('num_cols')
df_viz = df_viz.apply(lambda x: 1000 * x)

ax = df_viz.plot()
ax.set_title('Standard scale: by n columns')
ax.set_xlabel('Number of columns')
ax.set_ylabel('Time (ms)')
plt.legend(prop={'size': 14})

### Table measuring strategies check time

In [None]:
df_viz.style.apply(highlight_minimum_time, axis=1)

### Measure time with running with increasing number of row

In [None]:
res = []

for idx, i in enumerate(multiples):
    data = np.tile(df_data_np.T, i).T
    o_1 = %timeit -o -q StandardScaler().fit_transform(data)
    o_2 = %timeit -o -q standard(data)
    o_3 = %timeit -o -q standard_parallel(data)
    
    res.append((data.shape[0], o_1.best, o_2.best, o_3.best))
    print('{0} of {1} complete {2}'.format(idx + 1, len(multiples), data.shape))

In [None]:
df_viz = pd.DataFrame(res, columns = ['num_rows', 'sklearn', 'numba CPU', 'numba CPU parallel'])
df_viz = df_viz.set_index('num_rows')
df_viz = df_viz.apply(lambda x: 1000 * x)

In [None]:
ax = df_viz.plot()
ax.set_title('Standard scale: n rows by 17 columns')
ax.set_xlabel('Number of rows')
ax.set_ylabel('Time (ms)')
plt.legend(prop={'size': 14})

### Table measuring strategies check time

In [None]:
df_viz.style.apply(highlight_minimum_time, axis=1)

In [None]:
df_viz.parallel_diagnostics(level=4)