**Notebook description:**

This notebook can be used to update [this][1] dataset by merging it with the new data downloaded from [the Hass Avocado Board website][2].

[1]: https://www.kaggle.com/timmate/avocado-prices-2020
[2]: https://hassavocadoboard.com/category-data/

In [None]:
import pandas as pd

In [None]:
# Directory that is scanned for the new data: put the csv files with the new
# data into it before running the merging cell below.
DATASETS_DIR = 'new_avocado_data'

## Read in the original (updated up to 2020, not Justin's) dataset

In [None]:
# Load the dataset that is going to be updated; its `date` column is parsed
# as datetimes and becomes the index of the dataframe.
original_dataset_path = 'avocado-updated-2020.csv'
original_df = pd.read_csv(
    original_dataset_path,
    index_col='date',
    parse_dates=['date'],
)

## Read in the new data and merge it with the original dataset

In [None]:
import os

# Define a dictionary for renaming names of columns of the new datasets.
# The keys are the new datasets' column names *after* they have been
# lower-cased and had their spaces replaced with underscores.
RENAMED_COLS_DICT = {
    'asp_current_year': 'average_price',
    'total_bulk_and_bags_units': 'total_volume',
    '4046_units': '4046',
    '4225_units': '4225',
    '4770_units': '4770',
    'totalbagged_units': 'total_bags',
    'smlbagged_units': 'small_bags',
    'lrgbagged_units': 'large_bags',
    'x-lrgbagged_units': 'xlarge_bags'
}


def _normalize_new_dataset(raw_df):
    """Reshape a freshly read new dataset to the layout of the original one.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        A new dataset indexed by its parsed 'Current Year Week Ending' column.
        It must contain the 'Timeframe' and 'Geography' columns and a column
        whose lower-cased name is 'type'.

    Returns
    -------
    pandas.DataFrame
        A new dataframe (``raw_df`` is left untouched) without the 'Timeframe'
        column, sorted by date and geography, with the columns renamed
        according to ``RENAMED_COLS_DICT``, the index named 'date', the
        avocado types stripped and lower-cased, and a 'year' column added.

    Raises
    ------
    AssertionError
        If the dataset does not contain exactly 2 avocado types.
    """
    new_df = (
        raw_df
        .drop('Timeframe', axis=1)
        .sort_values(['Current Year Week Ending', 'Geography'])
        # Lower the columns' names and replace spaces with underscores.
        .rename(lambda col_name: col_name.lower().replace(' ', '_'), axis=1)
        .rename(RENAMED_COLS_DICT, axis=1)
    )
    new_df.index.name = 'date'

    # Some values come with stray whitespace (e.g., 'Conventional '). Assign
    # the cleaned column back explicitly: an in-place `replace` on a selected
    # column is a chained modification that does not reliably reach the
    # dataframe itself.
    new_df['type'] = new_df['type'].str.strip()
    assert new_df['type'].nunique() == 2, 'dataset` must contain only 2 types of avocados'

    new_df['type'] = new_df['type'].str.lower()
    new_df['year'] = new_df.index.year

    return new_df


print("original dataset's shape:", original_df.shape)
print()

# Define an accumulator of the number of entries. This accumulator is used
# for the sanity check right after the merging.
n_total_entries_accumulator = original_df.shape[0]

filenames = sorted(os.listdir(DATASETS_DIR))

# The new datasets are collected here and concatenated once after the loop,
# which avoids re-copying the growing merged dataframe on every iteration.
new_dfs = []

for filename in filenames:
    # `splitext` copes with names that contain no dot or several dots, for
    # which unpacking `filename.split('.')` into two names raises ValueError.
    extension = os.path.splitext(filename)[1]

    # Skip everything that is not a csv file (the check is case-insensitive).
    if extension.lower() != '.csv':
        continue

    print('processing', filename, '...')

    file_path = os.path.join(DATASETS_DIR, filename)
    raw_df = pd.read_csv(file_path,
                         parse_dates=['Current Year Week Ending'],
                         index_col='Current Year Week Ending')

    print("dataset's shape:", raw_df.shape)

    tmp_df = _normalize_new_dataset(raw_df)

    print('adding', tmp_df.shape[0], 'entries to the original dataset...')
    print()

    new_dfs.append(tmp_df)
    n_total_entries_accumulator += tmp_df.shape[0]

# A df holding the concatenated (i.e., merged) data. `pd.concat` returns a new
# dataframe, so `original_df` itself is not modified.
cat_df = pd.concat([original_df, *new_dfs], axis=0)

print("final merged dataset's shape:", cat_df.shape)
print('number of entries in the merged dataset should be:', n_total_entries_accumulator)

assert cat_df.shape[0] == n_total_entries_accumulator, \
    'merged dataset must contain all the entries of the original and the new datasets'
assert cat_df['geography'].nunique() == 54, 'merged dataset must have 54 unique geographical names'

# Drop all duplicates (if any).
print()
print('dropping duplicates...')

n_entries_before = cat_df.shape[0]
# `drop_duplicates` ignores the index, so the dates are moved into a regular
# column for the comparison; otherwise entries that differ only by their date
# would be treated as duplicates and dropped.
cat_df = cat_df.reset_index().drop_duplicates().set_index('date')
n_entries_after = cat_df.shape[0]
n_entries_dropped = n_entries_before - n_entries_after

print(f'dropped {n_entries_dropped} duplicates.')
print("merged dataset's shape after dropping duplicates:", cat_df.shape)

# Sort the merged dataset just in case (not really needed as the data should be
# already sorted).
cat_df = cat_df.sort_values(['date', 'geography'])

original dataset's shape: (30021, 12)

processing 2020-plu-total-hab-data.csv ...
dataset's shape: (2160, 12)
adding 2160 entries to the original dataset...

final merged dataset's shape: (32181, 12)
number of entries in the merged dataset should be: 32181

dropping duplicates...
dropped 2160 duplicates.
merged dataset's shape after dropping duplicates: (30021, 12)


## Save the updated (merged) dataset

In [None]:
# The dates live in the index, so the index is written out explicitly
# together with the rest of the columns.
cat_df.to_csv('avocado-updated-again-2020.csv', index=True)