# Data Processing for Explanatory Variables

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Daily calendar for the analysis window (both end dates inclusive).
# `combined_df` starts as a single 'date' column; later cells attach their
# variables to it, keyed on that column.
date_range = pd.date_range("2009-12-01", "2011-12-09", freq='D')
combined_df = date_range.to_frame(index=False, name='date')

In [3]:
# Google Trends interest for the search term "gift ideas" (United Kingdom).
# The first line of the CSV is skipped; the remaining rows are weekly.
google = (
    pd.read_csv("data/google_trend_gift_ideas.csv", skiprows=1)
    .rename(columns={'gift ideas: (United Kingdom)': 'gift_ideas'})
)
google['Week'] = pd.to_datetime(google['Week'])

# Stretch each weekly row over seven consecutive days: the 'Week' date plus
# offsets of 0..6 days, each day carrying that week's value.
week_starts = google['Week'].repeat(7)
day_offsets = pd.to_timedelta(np.tile(range(7), len(google)), unit='D')
google_expanded = pd.DataFrame({
    'date': week_starts + day_offsets,
    'gift_ideas': google['gift_ideas'].repeat(7).values,
})

# Keep only days inside the calendar window, then attach to the daily frame.
inside_window = google_expanded['date'].isin(date_range)
google_expanded = google_expanded[inside_window]
combined_df = combined_df.merge(google_expanded, on='date', how='left')

In [4]:
# Retail Sales Index
# The CSV is in long format: a 'mmm-yy' month label (e.g. "Jan-10"), a
# 'type-of-prices' category, and the observation in 'v4_1'.
retail = pd.read_csv("data/retail-sales-index.csv")
# '%b-%y' carries no day component, so each label parses to the 1st of its month.
retail['mmm-yy'] = pd.to_datetime(retail['mmm-yy'], format='%b-%y')
# One row per month, one column per price type; aggfunc='first' keeps the
# first value if a (month, price type) pair occurs more than once.
retail = retail.pivot_table(index='mmm-yy',
                            columns='type-of-prices',
                            values='v4_1',
                            aggfunc='first')
# Forward-fill the monthly observations directly onto the daily calendar.
# Each day takes the most recent observation on or before it. Days earlier
# than the first observation come out as NaN rather than raising KeyError,
# which the previous "reindex from the series start, then .loc[date_range]"
# approach did whenever the series began after the calendar's first day.
retail = retail.reindex(date_range, method='ffill').reset_index()
# The reindexed axis is unnamed, so reset_index() calls the new column 'index'.
retail = retail.rename(columns={'index': 'date'})
combined_df = combined_df.merge(retail, on='date', how='left')


In [5]:
# Black Friday: boolean flag that is True on the listed dates only.
black_fridays = pd.to_datetime([
    "2010-11-26",
    "2011-11-25",
])
on_black_friday = combined_df['date'].isin(black_fridays)
combined_df['is_black_friday'] = on_black_friday

In [6]:
# Cyber Monday: boolean flag that is True on the listed dates only.
cyber_mondays = pd.to_datetime([
    "2010-11-29",
    "2011-11-28",
])
on_cyber_monday = combined_df['date'].isin(cyber_mondays)
combined_df['is_cyber_monday'] = on_cyber_monday

In [7]:
def load_json_series(filepath, column_name):
    """Load a JSON object of ``{date: value}`` pairs as a two-column DataFrame.

    Parameters
    ----------
    filepath : str or path-like
        Location of the JSON file. Its top-level value must be an object
        (it is read through ``.keys()`` and ``.values()``).
    column_name : str
        Name given to the column that holds the values.

    Returns
    -------
    pandas.DataFrame
        A ``'date'`` column (the keys parsed with ``pd.to_datetime``) and a
        ``column_name`` column (the values), in the file's key order.
    """
    with open(filepath, 'r') as handle:
        mapping = json.load(handle)

    parsed_dates = pd.to_datetime(list(mapping.keys()))
    observations = list(mapping.values())
    return pd.DataFrame({
        'date': parsed_dates,
        column_name: observations,
    })

In [8]:
# Consumer Price Index: date-keyed JSON loaded into columns 'date' and 'cpi'.
cpi_df = load_json_series(
    filepath='../../dataset/explanatory_variables/cpi.json',
    column_name='cpi',
)


In [9]:
# Consumer Confidence Index: date-keyed JSON loaded into columns 'date' and 'cci'.
cci_df = load_json_series(
    filepath='../../dataset/explanatory_variables/cci.json',
    column_name='cci',
)


In [10]:
# Interest rate: date-keyed JSON loaded into columns 'date' and 'interest_rate'.
interest_df = load_json_series(
    filepath='../../dataset/explanatory_variables/interest_rate.json',
    column_name='interest_rate',
)


In [11]:
# Unemployment rate (aged 16 and over, seasonally adjusted), in percent.
# Loaded from date-keyed JSON into columns 'date' and 'unemployment_rate'.
unemployment_df = load_json_series(
    filepath='../../dataset/explanatory_variables/unemployment.json',
    column_name='unemployment_rate',
)

In [12]:
# UK public holidays.
# Only the keys of the JSON object are used (parsed as dates); its values
# are ignored. Every listed date gets is_holiday = 1.
with open('../../dataset/explanatory_variables/public_holidays.json', 'r') as f:
    holidays_raw = json.load(f)

holiday_dates = pd.to_datetime(list(holidays_raw.keys()))
holidays_df = pd.DataFrame({'date': holiday_dates, 'is_holiday': 1})

In [13]:
# Combine the four indicator series and the holiday calendar into a single
# frame keyed on 'date'. Outer joins keep every date that appears in any input.
#
# The holiday calendar is outer-joined too. With a left join, a holiday whose
# date is missing from all four indicator series would be dropped here and
# could never be flagged once this frame is joined onto the daily calendar.
explanatory_df = (
    cpi_df
    .merge(cci_df, on='date', how='outer')
    .merge(interest_df, on='date', how='outer')
    .merge(unemployment_df, on='date', how='outer')
    .merge(holidays_df, on='date', how='outer')
)

In [14]:
# Rows with no holiday entry carry NaN in 'is_holiday' after the join; set them to 0.
holiday_flag = explanatory_df['is_holiday']
explanatory_df['is_holiday'] = holiday_flag.fillna(value=0)

In [15]:
# Day-of-week indicators: one 'is_<DayName>' dummy column for each weekday
# name that occurs among the dates in explanatory_df.
explanatory_df['date'] = pd.to_datetime(explanatory_df['date'])

weekday_names = explanatory_df['date'].dt.day_name()
day_dummies = pd.get_dummies(weekday_names, prefix='is')

# Append the dummies column-wise; rows align on the shared index.
explanatory_df = pd.concat([explanatory_df, day_dummies], axis=1)

In [16]:
# Attach the explanatory variables to the daily calendar. This is a left join,
# so calendar dates that are absent from explanatory_df get NaN in every
# joined column.
combined_df = pd.merge(combined_df, explanatory_df, how='left', on='date')

In [17]:
# Write the assembled daily frame to disk; index=False leaves the positional
# row index out of the file.
combined_df.to_csv(
    path_or_buf="combined_explanatory_variables.csv",
    index=False,
)