1. For loop (baseline)
2. iterrows - pandas
3. apply - pandas
4. cut - pandas

In [14]:
import seaborn as sns
import pandas as pd
import numpy as np
import time

In [99]:
# Iris dataset loaded through seaborn; the benchmarks below read its
# 'petal_length' column.
data = sns.load_dataset('iris')

def compute_class(petal_length):
    """Map a petal length to its class number.

    Returns 1 for lengths up to and including 2, 2 for lengths strictly
    between 2 and 5, and 3 otherwise.
    """
    if petal_length <= 2:
        return 1
    # Anything reaching this point already failed the ``<= 2`` check, so only
    # the upper bound still needs testing.
    if petal_length < 5:
        return 2
    return 3
    
def get_avg_time(fn, iter=5):
    """Run ``fn`` several times and report its average elapsed time.

    Args:
        fn: Zero-argument callable to benchmark. Its ``__name__`` (or its
            ``repr`` when it has none) is used in the printed report.
        iter: Number of timed runs; must be at least 1. The name shadows the
            builtin ``iter`` but is kept so keyword callers keep working.

    Returns:
        Tuple ``(avg_runtime, classlist)``: the mean elapsed seconds per run
        rounded to 3 decimals, and the value returned by the last run.

    Raises:
        ValueError: If ``iter`` is less than 1.
    """
    if iter < 1:
        raise ValueError("iter must be at least 1, got {}".format(iter))

    runtime = []
    classlist = None
    for _ in range(iter):
        # perf_counter() is the high-resolution clock meant for measuring
        # short durations; time.time() can be too coarse for millisecond runs.
        start = time.perf_counter()
        classlist = fn()
        runtime.append(time.perf_counter() - start)

    avg_runtime = round(np.mean(runtime), 3)

    # Objects such as functools.partial have no __name__; fall back to repr.
    fn_name = getattr(fn, "__name__", repr(fn))
    print("{} average run time over {} iterations = {:.3f}s".format(fn_name, iter, avg_runtime))

    return (avg_runtime, classlist)

def get_speedup(baseline, curr):
    """Print and return how many times faster ``curr`` is than ``baseline``.

    Args:
        baseline: Runtime of the reference implementation.
        curr: Runtime of the implementation being compared.

    Returns:
        ``baseline / curr`` rounded to 2 decimals. When ``curr`` is 0 (a
        runtime that rounded down to zero) the result is ``inf``, or ``nan``
        if ``baseline`` is 0 as well, instead of raising ZeroDivisionError.
    """
    if curr == 0:
        # Follow IEEE float division so plain Python floats behave like
        # numpy floats here rather than raising.
        speedup = float("nan") if baseline == 0 else float("inf")
    else:
        speedup = round(baseline / curr, 2)
    print("Speedup of {:.2f}X".format(speedup))
    return speedup

# Benchmark results kept as parallel lists: each approach appends one entry
# to every column.
result = {'fn':[], 'runtime':[], 'speedup':[]}

### 1. For loop

In [100]:
def for_loop():
    """Baseline: classify each row through a positional lookup, one at a time.

    Returns:
        List with the ``compute_class()`` result for every row of ``data``.
    """
    # The per-row ``iloc`` access is kept on purpose: this is the reference
    # the other approaches are measured against.
    return [
        compute_class(data.iloc[row_idx]['petal_length'])
        for row_idx in range(len(data))
    ]

# Time the baseline first, then record all three columns together, so a
# failed run cannot leave `result` with columns of different lengths.
baseline, classlist = get_avg_time(for_loop)

result['fn'].append('for_loop')
result['runtime'].append(baseline)
result['speedup'].append(1)  # the baseline is, by definition, 1X

for_loop average run time over 5 iterations = 0.030s


### 2. iterrows()

The **.iterrows()** method from Pandas is implemented as a generator that yields one row of the DataFrame on each iteration. More precisely, **.iterrows()** yields **(index, Series)** tuples, one for each row in the DataFrame. This is similar to using something like **enumerate()** in plain Python, and it runs noticeably faster than the index-based loop above.

In [101]:
def iterrows():
    """Classify each row while walking the frame with ``DataFrame.iterrows()``.

    Returns:
        List with the ``compute_class()`` result for every row of ``data``.
    """
    # iterrows() yields (index, row) pairs; the index is not needed here.
    return [
        compute_class(row['petal_length'])
        for _, row in data.iterrows()
    ]

# Time the iterrows() version and keep its output for comparison.
runtime, classlist2 = get_avg_time(iterrows)
# Sanity check: the classes must match the for-loop baseline.
assert classlist == classlist2
speedup = get_speedup(baseline, runtime)

# Record this approach in the results table.
result['fn'].append('iterrows')
result['runtime'].append(runtime)
result['speedup'].append(speedup)

iterrows average run time over 5 iterations = 0.014s
Speedup of 2.14X


### 3. apply()

In [102]:
def apply():
    """Classify every row with ``DataFrame.apply`` along ``axis=1``.

    Returns:
        List with the ``compute_class()`` result for every row of ``data``.
    """
    def classify_row(row):
        # With axis=1 each call receives a single row of the frame.
        return compute_class(row['petal_length'])

    per_row_classes = data.apply(classify_row, axis=1)
    return list(per_row_classes)

# Time the apply() version and keep its output for comparison.
runtime, classlist3 = get_avg_time(apply)
# Sanity check: the classes must match the for-loop baseline.
assert classlist == classlist3
speedup = get_speedup(baseline, runtime)

# Record this approach in the results table.
result['fn'].append('apply')
result['runtime'].append(runtime)
result['speedup'].append(speedup)

apply average run time over 5 iterations = 0.005s
Speedup of 6.00X


### 4. cut()

The **.cut()** function from Pandas takes as input a set of **bins** that define the ranges of our if/else chain and a set of **labels** that give the value to return for each range. It then performs the same classification we wrote manually in the compute_class() function, over the whole column in a single call, provided the bin edges and their open/closed sides match the comparisons in compute_class().

In [103]:
def cut():
    """Classify every petal length with a single ``pd.cut`` call.

    The bin edges reproduce ``compute_class()``:
    ``x <= 2`` -> 1, ``2 < x < 5`` -> 2, ``x >= 5`` -> 3.
    The outer edges are infinite so that no finite value falls outside the
    bins (an unbinned value would break the integer cast).

    Returns:
        List of integer class numbers, one per row of ``data``.
    """
    # With right=False every bin is [low, high). compute_class() puts exactly
    # 2 in class 1, so the first inner edge is the next float above 2; a plain
    # edge of 2 would send 2.0 to class 2 instead.
    just_above_two = np.nextafter(2.0, np.inf)
    class_list = pd.cut(
        data['petal_length'],
        bins=[-np.inf, just_above_two, 5, np.inf],
        right=False,  # bins are closed on the left, open on the right
        labels=[1, 2, 3],
    ).astype(int)
    return list(class_list)

# Time the cut() version and keep its output for comparison.
runtime, classlist4 = get_avg_time(cut)
# Sanity check: the classes must match the for-loop baseline.
assert classlist == classlist4
speedup = get_speedup(baseline, runtime)

# Record this approach in the results table.
result['fn'].append('cut')
result['runtime'].append(runtime)
result['speedup'].append(speedup)

cut average run time over 5 iterations = 0.003s
Speedup of 10.00X


In [104]:
# Turn the collected results into a table with one row per approach.
df = pd.DataFrame(result)
df.head()

Unnamed: 0,fn,runtime,speedup
0,for_loop,0.03,1.0
1,iterrows,0.014,2.14
2,apply,0.005,6.0
3,cut,0.003,10.0


<hr>