In [52]:
import pandas as pd
import numpy as np
import requests

In [2]:
def get_dataset(url, fname, ext, timeout=30):
    """Download `url` and save the response body to ``{fname}.{ext}``.

    Parameters
    ----------
    url : str
        Address of the file to download.
    fname : str
        Target file name without extension.
    ext : str
        Target file extension without the leading dot.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight
        to ``requests.get``). Without a timeout a stalled connection would
        block the notebook cell indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    target_path = f'{fname}.{ext}'
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently saving an error page to disk.
    response.raise_for_status()
    with open(target_path, 'wb') as f:
        f.write(response.content)
    print('Download ready.')

In [3]:
# Raw CSV hosted in the realpython/materials GitHub repository (see URL);
# get_dataset writes it to the working directory as 'electricity.csv'.
url = 'https://raw.githubusercontent.com/realpython/materials/master/pandas-fast-flexible-intuitive/tutorial/demand_profile.csv'
get_dataset(url, 'electricity', 'csv')

Download ready.


In [4]:
pd.__version__

'1.0.5'

In [5]:
# Load the downloaded CSV with default settings (no date parsing requested)
# and preview the first rows.
df = pd.read_csv('electricity.csv')
df.head()

Unnamed: 0,date_time,energy_kwh
0,1/1/13 0:00,0.586
1,1/1/13 1:00,0.58
2,1/1/13 2:00,0.572
3,1/1/13 3:00,0.596
4,1/1/13 4:00,0.592


In [6]:
df.dtypes

date_time      object
energy_kwh    float64
dtype: object

The first issue we run into is that the DataFrame created the times as the generic 'object' data type. 

In [7]:
type(df.iat[0,0])

str

In [8]:
# Parse the string column into datetimes. No explicit `format=` is given,
# so pandas has to work out the layout of the strings on its own.
df['date_time'] = pd.to_datetime(df['date_time'])
df['date_time'].dtype

dtype('<M8[ns]')

In [9]:
df.head()

Unnamed: 0,date_time,energy_kwh
0,2013-01-01 00:00:00,0.586
1,2013-01-01 01:00:00,0.58
2,2013-01-01 02:00:00,0.572
3,2013-01-01 03:00:00,0.596
4,2013-01-01 04:00:00,0.592


In [14]:
@timeit(repeat=3, number=10)
def convert(df, column_name):
    """Return column `column_name` of `df` parsed as datetimes.

    No format string is supplied, so pandas determines the layout of the
    date strings itself.
    """
    raw_values = df[column_name]
    parsed = pd.to_datetime(raw_values)
    return parsed

In [15]:
df = pd.read_csv('electricity.csv')

In [16]:
df['date_time'] = convert(df, 'date_time')

Best of 3 trials with 10 function calls per trial:
Function `convert` ran in average of 0.620 seconds.



We can do better than waiting 0.62 seconds for about 8,000 rows of data to be converted. Instead of letting pandas infer the layout of every string, we'll tell `to_datetime` exactly what format the incoming date strings are in.

In [27]:
@timeit(repeat=3, number=100)
def convert_with_format(df, column_name):
    """Return column `column_name` of `df` parsed as datetimes.

    The layout of the input strings is passed explicitly via `format`,
    so pandas does not have to infer it.
    """
    raw_values = df[column_name]
    parsed = pd.to_datetime(raw_values, format='%d/%m/%y %H:%M')
    return parsed

In [28]:
# Re-read the CSV so 'date_time' holds the raw strings again before the
# timed conversion runs on it.
df = pd.read_csv('electricity.csv')
df['date_time'] = convert_with_format(df, 'date_time')

Best of 3 trials with 100 function calls per trial:
Function `convert_with_format` ran in average of 0.019 seconds.



Formatting sped up the operation by over 32 times.

# Simple Looping

We want to calculate the electricity cost, and the cost varies throughout the day. The non-Pythonic way to perform this calculation would be to write a loop.

In [29]:
def apply_tariff(kwh, hour):
    """Return the cost in cents of `kwh` of energy consumed during `hour`.

    Tariff bands (cents per kWh):
        00:00-06:59 -> 12
        07:00-16:59 -> 20
        17:00-23:59 -> 28

    Parameters
    ----------
    kwh : float
        Energy used, in kWh.
    hour : int
        Hour of day, 0 <= hour < 24.

    Returns
    -------
    float
        ``rate * kwh`` for the band that `hour` falls into.

    Raises
    ------
    ValueError
        If `hour` is outside the range 0-23.
    """
    if 0 <= hour < 7:
        rate = 12
    elif 7 <= hour < 17:
        rate = 20
    elif 17 <= hour < 24:
        rate = 28
    else:
        raise ValueError(f'Invalid hour: {hour}')
    # Without this return every caller would store None as the cost.
    return rate * kwh

In [32]:
# Non-Pythonic version
@timeit(repeat=3, number=100)
def apply_tariff_loop(df):
    """Fill df['cost_cents'] by walking the frame one position at a time.

    Each row is fetched with ``df.iloc`` twice (once per field), then
    priced with `apply_tariff`. This is the deliberately slow baseline.
    """
    costs = []
    for position in range(len(df)):
        costs.append(
            apply_tariff(
                df.iloc[position]['energy_kwh'],
                df.iloc[position]['date_time'].hour,
            )
        )
    df['cost_cents'] = costs

This would take too long to run, so I won't call it.

## Looping with ```itertuples()``` and ```iterrows()```

These two functions are generators that yield one row at a time. ```itertuples()``` yields a tuple with the index as the first entry.

In [36]:
@timeit(repeat=3, number=100)
def apply_tariff_iterrows(df):
    """Fill df['cost_cents'] by pricing each row yielded by ``df.iterrows()``."""
    df['cost_cents'] = [
        apply_tariff(record['energy_kwh'], record['date_time'].hour)
        for _, record in df.iterrows()
    ]

In [37]:
apply_tariff_iterrows(df)

Best of 3 trials with 100 function calls per trial:
Function `apply_tariff_iterrows` ran in average of 0.722 seconds.



This is much faster, but there's more we can do.

## Using pandas' ```apply()``` method

In [38]:
@timeit(repeat=3, number=100)
def apply_tariff_withapply(df):
    """Fill df['cost_cents'] by running `apply_tariff` on every row via ``df.apply``."""

    def _row_cost(row):
        # Price a single row from its energy reading and timestamp hour.
        return apply_tariff(kwh=row['energy_kwh'], hour=row['date_time'].hour)

    df['cost_cents'] = df.apply(_row_cost, axis=1)

In [39]:
apply_tariff_withapply(df)

Best of 3 trials with 100 function calls per trial:
Function `apply_tariff_withapply` ran in average of 0.198 seconds.



## Using vectorized operations with ```isin()```

In [44]:
# Move 'date_time' from a column into the index so the functions below can
# use df.index.hour. NOTE: this mutates df in place and is not idempotent:
# once it has run, 'date_time' is no longer a column, so running this cell
# a second time on the same df will fail.
df.set_index('date_time', inplace=True)
@timeit(repeat=3, number=100)
def apply_tariff_isin(df):
    """Fill df['cost_cents'] band by band using Boolean masks on the index hour.

    Expects the datetimes to be the index of `df` (it reads ``df.index.hour``).
    """
    index_hours = df.index.hour

    # (hours covered, cents per kWh) for peak, shoulder and off-peak bands.
    tariff_bands = (
        (range(17, 24), 28),
        (range(7, 17), 20),
        (range(0, 7), 12),
    )

    for band_hours, cents in tariff_bands:
        in_band = index_hours.isin(band_hours)
        df.loc[in_band, 'cost_cents'] = df.loc[in_band, 'energy_kwh'] * cents

In [45]:
apply_tariff_isin(df)

Best of 3 trials with 100 function calls per trial:
Function `apply_tariff_isin` ran in average of 0.004 seconds.



Much, much faster. Now there are no conditional statements for the code to evaluate row by row. Selecting rows with a Boolean array is a vectorized operation, as is the multiplication of the energy consumption by the tariff.

## Using pandas' ```cut()``` method

In [50]:
@timeit(repeat=3, number=100)
def apply_tariff_cut(df):
    """Fill df['cost_cents'] by binning the index hour into tariff bands.

    Expects the datetimes to be the index of `df` (it reads ``df.index.hour``).

    The bins are left-closed: [0, 7) -> 12, [7, 17) -> 20, [17, 24) -> 28
    cents per kWh, matching the ``0 <= hour < 7`` style boundaries used by
    `apply_tariff`.
    """
    # right=False is essential: pd.cut defaults to right-closed bins
    # ((0, 7], (7, 17], (17, 24]), which would bill hour 7 at the off-peak
    # rate and hour 17 at the shoulder rate.
    cents_per_kwh = pd.cut(x=df.index.hour,
                           bins=[0, 7, 17, 24],
                           right=False,
                           labels=[12, 20, 28]).astype(int)
    df['cost_cents'] = cents_per_kwh * df['energy_kwh']

In [51]:
apply_tariff_cut(df)

Best of 3 trials with 100 function calls per trial:
Function `apply_tariff_cut` ran in average of 0.001 seconds.



## Using NumPy

In [53]:
@timeit(repeat=3, number=100)
def apply_tariff_digitize(df):
    """Fill df['cost_cents'] using NumPy binning of the index hour.

    Expects the datetimes to be the index of `df` (it reads ``df.index.hour``).
    """
    hour_of_day = df.index.hour.values
    # Bin index 0, 1 or 2 per row, used below to look up the tariff.
    band_index = np.digitize(hour_of_day, bins=[7, 17, 24])
    tariff_cents = np.array([12, 20, 28])
    df['cost_cents'] = tariff_cents[band_index] * df['energy_kwh'].values

In [54]:
apply_tariff_digitize(df)

Best of 3 trials with 100 function calls per trial:
Function `apply_tariff_digitize` ran in average of 0.000 seconds.



In [59]:
# Show the distinct bin indices np.digitize assigns to the index's hour
# values for these bin edges.
bins = np.digitize(df.index.hour.values, bins=[7, 17, 24])
np.unique(bins)

array([0, 1, 2])

# Saving Preprocessed Data in HDF5

Just like saving pretrained machine learning models, it would be inconvenient to have to reprocess the pandas DataFrame each time you wanted to use it. The datetime conversion can take a significant amount of time, for instance. If you saved it as a CSV, the conversion would need to occur every time the file is loaded. Storing it in HDF5 format avoids this.

In [60]:
# Open (or create) the HDF5 file 'processed_data.h5'. Using the store as a
# context manager closes the file even if the write below raises.
with pd.HDFStore('processed_data.h5') as data_store:
    # Put dataframe into the store, setting the key as 'preprocessed_df'
    data_store['preprocessed_df'] = df

Now, the dataframe does not need to be reprocessed if the notebook kernel is shut down or if the computer is rebooted.

In [61]:
# Access the data store. The context manager closes the file even if the
# key lookup fails.
with pd.HDFStore('processed_data.h5') as data_store:
    preprocessed_df = data_store['preprocessed_df']

In [62]:
preprocessed_df.dtypes

energy_kwh    float64
cost_cents    float64
dtype: object

In [13]:
import functools
import gc
import itertools
import sys
from timeit import default_timer as _timer


def timeit(_func=None, *, repeat=3, number=1000, file=sys.stdout):
    """Decorator: prints time from best of `repeat` trials.

    Mimics `timeit.repeat()`, but avg. time is printed.
    Returns function result and prints time.

    You can decorate with or without parentheses, as in
    Python's @dataclass class decorator.

    Parameters
    ----------
    repeat : int
        Number of trials (outer loop).
    number : int
        Number of calls to the decorated function per trial (inner loop).
    file : file-like
        Stream that both report lines are written to; passed to `print()`.

    The decorated function is called ``repeat * number`` times in total;
    the value returned is the result of the last call.

    >>> @timeit
    ... def f():
    ...     return "-".join(str(n) for n in range(100))
    ...
    >>> @timeit(number=100000)
    ... def g():
    ...     return "-".join(str(n) for n in range(10))
    ...
    """

    _repeat = functools.partial(itertools.repeat, None)

    def wrap(func):
        @functools.wraps(func)
        def _timeit(*args, **kwargs):
            # Temporarily turn off garbage collection during the timing.
            # Makes independent timings more comparable.
            # If it was originally enabled, switch it back on afterwards.
            gcold = gc.isenabled()
            gc.disable()

            try:
                # Outer loop - the number of repeats.
                trials = []
                for _ in _repeat(repeat):
                    # Inner loop - the number of calls within each repeat.
                    total = 0
                    for _ in _repeat(number):
                        start = _timer()
                        result = func(*args, **kwargs)
                        end = _timer()
                        total += end - start
                    trials.append(total)

                # We want the *average time* from the *best* trial.
                # For more on this methodology, see the docs for
                # Python's `timeit` module.
                #
                # "In a typical case, the lowest value gives a lower bound
                # for how fast your machine can run the given code snippet;
                # higher values in the result vector are typically not
                # caused by variability in Python's speed, but by other
                # processes interfering with your timing accuracy."
                best = min(trials) / number
                # Both report lines go to `file`, so redirecting the output
                # captures the whole report rather than just the second line.
                print(
                    "Best of {} trials with {} function"
                    " calls per trial:".format(repeat, number),
                    file=file,
                )
                print(
                    "Function `{}` ran in average"
                    " of {:0.3f} seconds.".format(func.__name__, best),
                    end="\n\n",
                    file=file,
                )
            finally:
                if gcold:
                    gc.enable()
            # Result is returned *only once*
            return result

        return _timeit

    # Syntax trick from Python @dataclass
    if _func is None:
        return wrap
    else:
        return wrap(_func)