In [2]:
import pandas as pd
import numpy as np 

## Simple Vectorization

In [3]:
# Build one large integer matrix that the timing cells below reuse.
n_rows = 10000
n_cols = 10000
# 10000 x 10000 int64 values = 1e8 * 8 bytes, i.e. roughly 800 MB of RAM.
# `high` is exclusive, so the entries lie in 0..19.
# No seed is set, so the grand totals printed further down differ on every fresh run.
# NOTE(review): randint is expected to return a C-ordered (row-major) array here, which
# is what the row-major / column-major timing cells rely on - confirm via a.flags.
a = np.random.randint(low=0, high=20, size=(n_rows,n_cols), dtype=np.int64)
# The same values wrapped in a DataFrame so a pandas reduction can be timed as well.
df = pd.DataFrame(a)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
0,2,0,17,10,12,7,0,4,16,18,...,0,7,15,12,11,8,18,15,10,6
1,6,8,15,8,11,18,11,15,6,19,...,4,2,19,7,19,4,15,8,6,12
2,4,2,6,10,5,3,16,2,10,0,...,2,3,0,7,16,0,15,10,10,18
3,17,14,19,0,9,6,12,18,12,8,...,7,19,11,8,17,13,2,19,14,11
4,14,2,7,12,4,11,14,1,19,3,...,16,6,3,14,2,3,9,1,8,4


In [4]:
%%time

# Worst case: two nested pure-Python loops. The amount of work is linear in the number
# of elements (n_rows * n_cols); it is slow because every single addition is executed
# by the interpreter instead of by compiled code.
# The accumulator is called `total` rather than `sum` so that the builtin `sum`
# is not shadowed for the rest of the kernel session.
total = 0
for row_id in range(n_rows):
    for value in a[row_id, :]:
        total = total + value
total

CPU times: user 7.37 s, sys: 91 ms, total: 7.46 s
Wall time: 7.47 s


949941316

In [5]:
%%time
# Vectorized pandas reduction: the first sum() collapses every column to its total,
# the second sum() adds those per-column totals into a single grand total.
df.sum(axis=0).sum()

CPU times: user 43.2 ms, sys: 1.1 ms, total: 44.3 ms
Wall time: 45.2 ms


949941316

In [6]:
%%time
# Single NumPy reduction over every element of the array.
np.sum(a)

CPU times: user 17.7 ms, sys: 940 µs, total: 18.7 ms
Wall time: 17.8 ms


949941316

## Column and Row Major

In [7]:
%%time 
# Column-wise reduction on `a`: np.sum is handed one column (a 1-D slice along axis 0)
# at a time, then the per-column totals are added together.
np.apply_along_axis(func1d=np.sum, axis=0, arr=a).sum()

CPU times: user 262 ms, sys: 1.92 ms, total: 264 ms
Wall time: 263 ms


949941316

In [8]:
%%time
# Row-wise reduction on `a`: np.sum is handed one row (a 1-D slice along axis 1)
# at a time, then the per-row totals are added together.
np.apply_along_axis(func1d=np.sum, axis=1, arr=a).sum()

CPU times: user 44 ms, sys: 817 µs, total: 44.8 ms
Wall time: 44.2 ms


949941316

In [16]:
# Re-lay out `a` in Fortran (column-major) memory order, then check the layout flag.
a_col_major = np.asfortranarray(a)
a_col_major.flags.f_contiguous

True

In [14]:
%%time
# Column-wise reduction, this time on the column-major copy:
# np.sum receives one column of `a_col_major` per call.
np.apply_along_axis(func1d=np.sum, axis=0, arr=a_col_major).sum()

CPU times: user 65.1 ms, sys: 36.7 ms, total: 102 ms
Wall time: 101 ms


949941316

In [15]:
%%time
# Row-wise reduction on the column-major copy: np.sum receives one row per call.
np.apply_along_axis(func1d=np.sum, axis=1, arr=a_col_major).sum()

CPU times: user 261 ms, sys: 1.53 ms, total: 262 ms
Wall time: 261 ms


949941316

In [26]:
# Two independently drawn columns of random integers; the upper bound is exclusive,
# so every value is 1, 2 or 3.
N = 10 ** 5
A_list = np.random.randint(low=1, high=4, size=N)
B_list = np.random.randint(low=1, high=4, size=N)
df_small = pd.DataFrame({'A': A_list, 'B': B_list})
df_small

Unnamed: 0,A,B
0,1,2
1,3,3
2,3,3
3,3,3
4,2,2
...,...,...
99995,1,3
99996,1,2
99997,3,3
99998,3,1


In [31]:
def divide_without_nan(row):
    """Divide the first entry of ``row`` by the second, returning 1.0 when they are equal.

    The equality shortcut means a 0/0 pair yields 1.0 instead of NaN.

    Parameters
    ----------
    row : pandas.Series, numpy.ndarray or sequence
        Only the first two positions are read.

    Returns
    -------
    float
        Always a float, on both branches.
    """
    # Read by position through a NumPy view so the same code works for a Series with
    # non-integer labels (e.g. 'A'/'B') and for a plain ndarray row; integer keys on a
    # label-indexed Series are a deprecated positional fallback in pandas.
    values = np.asarray(row)
    numerator, denominator = values[0], values[1]
    if numerator == denominator:
        # Must be a float: np.apply_along_axis allocates its output with the dtype of
        # the FIRST result, so an int here would truncate every later quotient.
        return 1.0
    return float(numerator / denominator)

In [32]:
%%time
# Row-wise pandas apply: one Python-level call of divide_without_nan per row.
# The function is passed directly (no lambda wrapper) so this cell pays the same
# per-row call overhead as the np.apply_along_axis version it is compared against.
new = df_small.apply(divide_without_nan, axis=1)
assert new.isna().sum() == 0

CPU times: user 382 ms, sys: 5.15 ms, total: 387 ms
Wall time: 387 ms


In [33]:
%%time
# Same per-row function, but driven by NumPy over the frame's underlying 2-D array
# instead of by pandas.
new = np.apply_along_axis(func1d=divide_without_nan, axis=1, arr=df_small.values)
assert not np.isnan(new).any()

CPU times: user 83.9 ms, sys: 1.09 ms, total: 85 ms
Wall time: 84.2 ms


In [34]:
%%time
# Fully vectorized: the comparison and the division each run once over whole columns,
# and np.where picks 1 wherever the two columns are equal.
new = np.where(df_small['A'].eq(df_small['B']), 1, df_small['A'].div(df_small['B']))
np.isnan(new).sum() == 0

CPU times: user 710 µs, sys: 310 µs, total: 1.02 ms
Wall time: 659 µs


True