In [1]:
import pandas as pd
import numpy as np
from math import *

Original Blog: https://engineering.upside.com/a-beginners-guide-to-optimizing-pandas-code-for-speed-c09ef2c6a4d6

Load Data

In [2]:
df = pd.read_csv('new_york_hotels.csv', encoding='cp1252')

In [3]:
df.dtypes

ean_hotel_id        int64
name               object
address1           object
city               object
state_province     object
postal_code        object
latitude          float64
longitude         float64
star_rating       float64
high_rate         float64
low_rate          float64
dtype: object

In [4]:
# Define a basic Haversine distance formula
def haversine(lat1, lon1, lat2, lon2):
    MILES = 3959
    lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    total_miles = MILES * c
    return total_miles

In [5]:
df_loop = df.copy()

Crude Loop

In [6]:
# Define a function to manually loop over all rows and return a series of distances
def haversine_looping(df):
    distance_list = []
    for i in range(0, len(df)):
        d = haversine(40.671, -73.985, df.iloc[i]['latitude'], df.iloc[i]['longitude'])
        distance_list.append(d)
    return distance_list

In [7]:
%%timeit
# Run the haversine looping function
df_loop['distance'] = haversine_looping(df_loop)

475 ms ± 25.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Iterrows

In [8]:
%%timeit
# Haversine applied on rows via iteration
haversine_series = []
for index, row in df.iterrows():
    haversine_series.append(haversine(40.671, -73.985, row['latitude'], row['longitude']))

df_loop['distance'] = haversine_series

159 ms ± 1.95 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


apply

In [9]:
%%timeit 
df_loop['distance'] = df_loop.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)

51.1 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
# Load the profiler into your Jupyter notebook
%load_ext line_profiler

In [11]:
%lprun -f haversine \
df_loop.apply(lambda row: haversine(40.671, -73.985, row['latitude'], row['longitude']), axis=1)
"""
Timer unit: 1e-06 s

Total time: 0.034127 s
File: <ipython-input-4-1f441d863f0f>
Function: haversine at line 2

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     2                                           def haversine(lat1, lon1, lat2, lon2):
     3      1631        775.0      0.5      2.3      MILES = 3959
     4      1631       8232.0      5.0     24.1      lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
     5      1631        980.0      0.6      2.9      dlat = lat2 - lat1 
     6      1631        733.0      0.4      2.1      dlon = lon2 - lon1 
     7      1631      15243.0      9.3     44.7      a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
     8      1631       6392.0      3.9     18.7      c = 2 * np.arcsin(np.sqrt(a)) 
     9      1631       1169.0      0.7      3.4      total_miles = MILES * c
    10      1631        603.0      0.4      1.8      return total_miles
"""



#### Vectorization on Pandas Series

In [12]:
%%timeit 
# Vectorized implementation of Haversine applied on Pandas series
df['distance'] = haversine(40.671, -73.985, df['latitude'], df['longitude'])

2.4 ms ± 502 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
# Vectorized implementation profile
%lprun -f haversine haversine(40.671, -73.985,\
                              df['latitude'], df['longitude'])
"""
Timer unit: 1e-06 s

Total time: 0.009673 s
File: <ipython-input-4-1f441d863f0f>
Function: haversine at line 2

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     2                                           def haversine(lat1, lon1, lat2, lon2):
     3         1          2.0      2.0      0.0      MILES = 3959
     4         1       2787.0   2787.0     28.8      lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
     5         1       1644.0   1644.0     17.0      dlat = lat2 - lat1 
     6         1        319.0    319.0      3.3      dlon = lon2 - lon1 
     7         1       3824.0   3824.0     39.5      a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
     8         1        800.0    800.0      8.3      c = 2 * np.arcsin(np.sqrt(a)) 
     9         1        296.0    296.0      3.1      total_miles = MILES * c
    10         1          1.0      1.0      0.0      return total_miles
"""



#### Vectorization on NumPy arrays

In [14]:
%%timeit
# See how long it takes to convert pandas arrays to NumPy ndarrays
np_lat = df['latitude'].values
np_lon = df['longitude'].values

4.87 µs ± 677 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [15]:
# Vectorized implementation of Haversine applied on NumPy arrays
%timeit df['distance'] = haversine(40.671, -73.985,\
                         df['latitude'].values, df['longitude'].values)

212 µs ± 14.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [16]:
%lprun -f haversine df['distance'] = haversine(40.671, -73.985,\
                        df['latitude'].values, df['longitude'].values)
"""
Timer unit: 1e-06 s

Total time: 0.000779 s
File: <ipython-input-4-1f441d863f0f>
Function: haversine at line 2

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     2                                           def haversine(lat1, lon1, lat2, lon2):
     3         1          2.0      2.0      0.3      MILES = 3959
     4         1         33.0     33.0      4.2      lat1, lon1, lat2, lon2 = map(np.deg2rad, [lat1, lon1, lat2, lon2])
     5         1         11.0     11.0      1.4      dlat = lat2 - lat1 
     6         1          4.0      4.0      0.5      dlon = lon2 - lon1 
     7         1        697.0    697.0     89.5      a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
     8         1         29.0     29.0      3.7      c = 2 * np.arcsin(np.sqrt(a)) 
     9         1          3.0      3.0      0.4      total_miles = MILES * c
    10         1          0.0      0.0      0.0      return total_miles
"""

