In [2]:
# Imports
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [58]:
# Reading data
data_pd = pd.read_csv('data/Avocado.csv')

# Cleaning up region names to avoid duplicates: strip '/', ' ' and '.' from
# every region label.
# regex=False forces each pattern to be matched literally. This matters for
# '.', which is the "any character" wildcard under regex semantics, and the
# default value of `regex` differs between pandas versions.
data_pd['region'] = (data_pd['region'].str.replace('/', '', regex=False)
                                      .str.replace(' ', '', regex=False)
                                      .str.replace('.', '', regex=False))

# Ideas to explore
- Estimating the price elasticity of demand for avocados
- Demand forecasting
- Apply double machine learning (EconML or CausalML) to both of the above (elasticity estimation and demand forecasting)
- Given external data with the unit cost of avocados over the same timeframe, can we determine what the optimal price would have been and optimize pricing for the future?
- Is there a round-number effect in prices?

## Data processing

In [59]:
# Parse 'Date' into datetimes, then put the rows in chronological order
data_pd = (data_pd.assign(Date=pd.to_datetime(data_pd['Date']))
                  .sort_values(by='Date'))

In [80]:
# Calculate the PED for each region and organic/regular separately
def calculate_ped(dataframe):
    """Return the row-to-row price elasticity of demand (PED).

    For every row the PED is the fractional change in 'Total Volume' divided
    by the fractional change in 'AveragePrice', both measured against the
    previous row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'AveragePrice' and 'Total Volume'. Rows are
        compared in their existing order, so the caller is expected to pass
        them already sorted (presumably one region/type, ordered by date).
        The frame is NOT modified.

    Returns
    -------
    pandas.Series
        PED per row, on the same index as ``dataframe``. The first row is NaN
        (no previous row), and rows where the price did not change are NaN
        as well (division by zero).
    """
    # Work on local Series instead of writing helper columns into the input:
    # the input is usually a filtered slice, and assigning to it both mutates
    # the caller's data and triggers SettingWithCopyWarning.
    price_change = dataframe['AveragePrice'].pct_change()
    volume_change = dataframe['Total Volume'].pct_change()

    # DeltaQ / DeltaP
    ped_series = volume_change / price_change

    # Get rid of infinite values (zero price change) so that a later
    # mean() skips them like any other NaN
    return ped_series.replace([np.inf, -np.inf], np.nan)

In [83]:
regions = data_pd['region'].unique()
ped_dict = {}

# Output column label -> value of the 'type' column it is computed from
ped_types = {'PED Conventional': 'conventional', 'PED Organic': 'organic'}

# For each region, calculate the mean Price Elasticity of Demand (Volume),
# separately for conventional and organic avocados.
for region in regions:
    data_region_pd = data_pd[data_pd['region'] == region]

    region_ped = {}
    for ped_label, avocado_type in ped_types.items():
        # Take an explicit copy of the filtered rows: calculate_ped may write
        # helper columns into its argument, and doing that on a filtered slice
        # is what raises SettingWithCopyWarning. With a copy there is no need
        # to switch the warning off globally.
        data_type_pd = data_region_pd[data_region_pd['type'] == avocado_type].copy()

        # Series.mean() skips NaN, so the undefined first row and the rows
        # with zero price change do not affect the average.
        region_ped[ped_label] = calculate_ped(data_type_pd).mean()

    # Store values
    ped_dict[region] = region_ped

# Convert dict to dataframe
ped_df = pd.DataFrame.from_dict(ped_dict, orient='index').reset_index(names='Region')

In [78]:
# Labels treated as collective (aggregate) regions rather than local ones
collective_regions = [
    'TotalUS', 'South', 'Southeast', 'Northeast', 'Midsouth',
    'NorthernNewEngland', 'SouthCentral', 'West', 'Plains',
]

# One membership mask, used to split the PED table into its two parts
is_collective = ped_df['Region'].isin(collective_regions)
ped_local_df = ped_df[~is_collective]
ped_regional_df = ped_df[is_collective]

# Result of basic PED calculation
As seen below, we get a realistic result of around -1.5 for the individual (non-collective) regions.

In practice, this means that volume declines by roughly 1.5% when the price increases by 1%.

In [76]:
# Average PED over the local regions (the text 'Region' column is excluded)
ped_local_df.mean(numeric_only=True)

PED Conventional   -1.532641
PED Organic        -1.518723
dtype: float64

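The collective regions are noisier for some reason, especially for conventional avocados: as the table below shows, the conventional average is dominated by a few extreme values such as TotalUS (-18.4) and Northeast (+6.5). I don't have time to look into it, so from now on we will just look at the individual regions.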

In [81]:
# Average PED over the collective regions (the text 'Region' column is excluded)
ped_regional_df.mean(numeric_only=True)

PED Conventional   -3.234559
PED Organic        -1.418591
dtype: float64

In [82]:
# Show the per-region PED values for the collective regions
ped_regional_df

Unnamed: 0,Region,PED Conventional,PED Organic
1,Northeast,6.451289,-2.127297
6,Midsouth,-2.428041,-2.15998
11,NorthernNewEngland,-1.815592,-1.109736
32,Southeast,-5.097503,-1.898865
33,SouthCentral,-1.21719,-1.088508
46,Plains,-1.622128,-1.491515
51,TotalUS,-18.406334,0.444309
53,West,-1.740973,-1.917137
