# Profiling different methods for getting column or index values

In [1]:
import pandas as pd
import obsplus
import obspy
import obspy.core.event as ev
from obsplus.utils.pd import loc_by_name

# Load the 'bingham_test' dataset through obsplus and fetch events from its event client.
ds = obsplus.load_dataset('bingham_test')
cat = ds.event_client.get_events()

# Build a new catalog from the event sequence repeated 1000 times
# (presumably to get enough rows for meaningful timings).
new_cat = obspy.Catalog(cat.events * 1000)

# Convert the enlarged catalog with obsplus.events_to_df; the result is used as a DataFrame below.
df = obsplus.events_to_df(new_cat)

In [2]:
# Stack 100 copies of df row-wise and renumber the rows from 0.
# NOTE: df is rebuilt from itself, so re-running this cell multiplies the row count by 100 again.
df = pd.concat([df] * 100, axis=0, ignore_index=True)

In [3]:
import numpy as np
# add new indices
# Fixed seed so the random 'index' column (and every timing that filters on it)
# is the same after each kernel restart.
rng = np.random.default_rng(42)

# Each id column holds the string form of a newly constructed ResourceIdentifier per row.
for id_column in ('resource_id', 'parent_id', 'scope_id'):
    df[id_column] = [str(ev.ResourceIdentifier()) for _ in range(len(df))]
# Integers drawn from [1, 10) -- the same range the unseeded np.random.randint(1, 10) produced.
df['index'] = rng.integers(1, 10, size=len(df))
# Constant column: every row carries the same value.
df['attr'] = 'something'

# Profile indexing

In [4]:
# Lookup keys shared by the benchmarks below: every 20th value of the resource_id column.
sub_query = df['resource_id'].values[::20]

In [5]:
# no index: select rows with a boolean mask built by isin on the plain resource_id column
%timeit df[df['resource_id'].isin(sub_query)]

110 ms ± 625 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# single index: new frame whose row index is resource_id (df itself is not reassigned here)
df_ = df.set_index('resource_id')

In [7]:
# Label-based lookup of the sampled ids through .loc.
# NOTE(review): df_ is rebound to a five-level frame further down, so this timing only
# measures the single-index case if that later cell has not been run yet.
%timeit df_.loc[sub_query]

58.2 ms ± 765 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
# Three-level index with resource_id as the first level.
# NOTE(review): unlike the five-level frame built in another cell, this index is not passed
# through sort_index(); an unsorted MultiIndex is the likely reason the lookup on this frame
# is so slow -- confirm before drawing conclusions from that timing.
df__ = df.set_index(['resource_id', 'attr', 'index'])

In [23]:
# Select the sampled ids on the first index level and everything (slice(None)) on the other two.
%timeit df__.loc[(sub_query, slice(None), slice(None))]

32.6 s ± 14.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# multiindex, resource_id last; the result is sorted with sort_index()
# NOTE: this rebinds df_, replacing the frame indexed by resource_id alone.
df_ = df.set_index(['scope_id', 'parent_id', 'index', 'attr', 'resource_id']).sort_index()

In [17]:
# Set built from the sampled ids.
# NOTE(review): `something` is not referenced by any other visible cell -- possibly a leftover.
something = set(sub_query)

In [20]:
# Filter with a query string on attr and index (no resource_id condition here).
# NOTE(review): df has a column literally named 'index'; pandas' query is documented to give a
# same-named column precedence over the row index -- confirm for the installed pandas version.
%timeit df.query("attr=='something' & index==1")

69.1 ms ± 1.14 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
# no index, three conditions: one isin mask per column (resource_id, attr, index), and-ed together
%timeit df[(df['resource_id'].isin(sub_query)) & (df['attr'].isin(['something'])) & (df['index'].isin([1]))]

107 ms ± 693 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Load the snakeviz IPython extension; its %%snakeviz cell magic is used in the next cell.
%load_ext snakeviz


In [None]:
%%snakeviz
# Profile obsplus' loc_by_name looking up the sampled ids via the resource_id keyword
# (presumably matched against the index level of that name -- confirm in obsplus.utils.pd).
# NOTE(review): df_ is bound twice in this notebook (single index, then five-level index);
# which frame is profiled depends on the order the cells were executed.
loc_by_name(df_, resource_id=sub_query)