In [17]:
import pandas as pd
import numpy as np 

## Simple Vectorization

In [18]:
# Build one large square matrix of small integers, reused by the timing cells that follow.
# Values are drawn from [0, 20) and stored as int64.
n_rows = n_cols = 10_000
a = np.random.randint(low=0, high=20, size=(n_rows, n_cols), dtype=np.int64)
# Wrap the same values in a DataFrame so pandas reductions can be timed against NumPy.
df = pd.DataFrame(data=a)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,18,6,15,14,11,10,10,9,6,18,...,16,12,12,10,11,1,7,17,3,9
1,12,4,8,9,0,7,15,3,14,3,...,10,14,14,16,19,17,1,19,7,3
2,3,13,9,5,3,6,4,6,2,13,...,11,14,9,14,16,17,10,12,13,1
3,9,7,9,17,18,8,0,1,10,18,...,19,8,14,16,3,12,14,10,13,13
4,5,16,16,11,11,1,10,19,10,8,...,3,19,16,2,17,5,8,19,0,14


In [19]:
%%time

# Worst-case scenario: a nested pure-Python loop that visits every element one at a time.
# It performs the same n_rows * n_cols additions as the vectorized versions, but each
# addition goes through the interpreter instead of a compiled reduction.
# The accumulator is named `total` rather than `sum` so the built-in sum() is not
# shadowed for the rest of the notebook session.
total = 0
for row in a:
    for value in row:
        total += value
total

CPU times: user 8.06 s, sys: 66 ms, total: 8.13 s
Wall time: 8.13 s


950031982

In [20]:
%%time
# pandas' vectorized reduction: the first sum() collapses every column to its total
# (one value per column), and the second sum() adds those column totals together.
df.sum(axis=0).sum()

CPU times: user 43 ms, sys: 1.37 ms, total: 44.4 ms
Wall time: 43.6 ms


950031982

In [21]:
%%time
# Same grand total, reduced directly on the NumPy array without going through pandas.
np.sum(a)

CPU times: user 18.4 ms, sys: 630 µs, total: 19.1 ms
Wall time: 18.4 ms


950031982

## Column and Row Major

In [22]:
%%time 
# Column-wise sum: np.sum is applied to each 1-D slice taken along axis 0 (one slice
# per column), then the per-column results are added up.
np.apply_along_axis(func1d=np.sum, axis=0, arr=a).sum()

CPU times: user 264 ms, sys: 1.88 ms, total: 266 ms
Wall time: 266 ms


950031982

In [23]:
%%time
# Row-wise sum: np.sum is applied to each 1-D slice taken along axis 1 (one slice
# per row), then the per-row results are added up.
np.apply_along_axis(func1d=np.sum, axis=1, arr=a).sum()

CPU times: user 45.8 ms, sys: 828 µs, total: 46.6 ms
Wall time: 45.9 ms


950031982

In [24]:
# Re-lay the same values out in Fortran (column-major) order, then confirm the layout flag.
a_col_major = np.asfortranarray(a)
a_col_major.flags.f_contiguous

True

In [25]:
%%time
# Column-wise sum on the Fortran-ordered copy: each slice along axis 0 is one column.
np.apply_along_axis(func1d=np.sum, axis=0, arr=a_col_major).sum()

CPU times: user 65.9 ms, sys: 34.9 ms, total: 101 ms
Wall time: 100 ms


950031982

In [26]:
%%time
# Row-wise sum on the Fortran-ordered copy: each slice along axis 1 is one row.
np.apply_along_axis(func1d=np.sum, axis=1, arr=a_col_major).sum()

CPU times: user 264 ms, sys: 1.83 ms, total: 266 ms
Wall time: 265 ms


950031982

In [27]:
# Two-column frame of random integers in {1, 2, 3}, used to compare row-wise strategies.
N = 100_000
A_list = np.random.randint(low=1, high=4, size=N)
B_list = np.random.randint(low=1, high=4, size=N)
df_small = pd.DataFrame(data={'A': A_list, 'B': B_list})
df_small

Unnamed: 0,A,B
0,1,2
1,1,3
2,3,2
3,1,1
4,2,2
...,...,...
99995,1,3
99996,1,1
99997,2,3
99998,1,2


In [28]:
def divide_without_nan(row):
    """Return the ratio of the first two entries of ``row`` as a float.

    When both entries compare equal the result is defined as ``1.0`` and no
    division is performed; otherwise the result is ``row[0] / row[1]``.

    Parameters
    ----------
    row : sequence, numpy.ndarray or pandas.Series
        Container whose first two positions hold the numerator and the
        denominator.

    Returns
    -------
    float
        Always a float, on both branches. This matters for
        ``np.apply_along_axis``, which allocates its output array using the
        dtype of the first call's result: returning an int from the "equal"
        branch would make the whole output integer whenever the first row
        has equal entries, truncating every later quotient.
    """
    # Use explicit positional access for pandas Series (via .iloc); integer
    # keys on a label-indexed Series are not reliably positional. Plain
    # sequences and ndarrays are indexed directly.
    positional = getattr(row, "iloc", row)
    numerator, denominator = positional[0], positional[1]
    if numerator == denominator:
        return 1.0
    return float(numerator / denominator)

In [29]:
%%time
# Row-wise pandas apply: the helper is called once per row (axis=1).
new = df_small.apply(divide_without_nan, axis=1)
assert not new.isna().any()

CPU times: user 383 ms, sys: 5.65 ms, total: 389 ms
Wall time: 388 ms


In [30]:
%%time
# Same helper, driven by NumPy over the frame's underlying 2-D array instead of pandas.
# NOTE: apply_along_axis allocates its output with the dtype of the first call's result,
# so the helper should return the same type on every branch.
new = np.apply_along_axis(func1d=divide_without_nan, axis=1, arr=df_small.to_numpy())
assert not np.isnan(new).any()

CPU times: user 88.4 ms, sys: 1.25 ms, total: 89.6 ms
Wall time: 89.2 ms


In [31]:
%%time
# Fully vectorized alternative: no per-row Python call. np.where picks 1 where the two
# columns match and the element-wise quotient A / B everywhere else.
new = np.where(df_small['A'].eq(df_small['B']), 1, df_small['A'].div(df_small['B']))
np.isnan(new).sum() == 0

CPU times: user 692 µs, sys: 521 µs, total: 1.21 ms
Wall time: 859 µs


True