# Data Processing for Explanatory Variables

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Daily calendar for the study window (both endpoints are included).
# `combined_df` starts as a single `date` column; later cells merge onto it
# or add columns to it.
date_range = pd.date_range("2009-12-09", "2011-12-09", freq="D")
combined_df = date_range.to_frame(index=False, name="date")

In [10]:
# Google Trends: weekly UK search interest for "gift ideas", expanded to daily rows.
# The first line of the export is skipped; the header is read from the second line.
google = pd.read_csv("data/google_trend_gift_ideas.csv", skiprows=1)
google['Week'] = pd.to_datetime(google['Week'])
google = google.rename(columns={'gift ideas: (United Kingdom)': 'gift_ideas'})

# Copy each weekly reading onto the seven consecutive days that begin at its
# `Week` timestamp (day offsets 0..6).
google_expanded = pd.DataFrame({
    'date': google['Week'].repeat(7) + pd.to_timedelta(np.tile(range(7), len(google)), unit='D'),
    'gift_ideas': google['gift_ideas'].repeat(7).values
})

# Keep only days that fall on the study calendar.
google_expanded = google_expanded[google_expanded['date'].isin(date_range)]

# Drop any `gift_ideas` column left behind by an earlier run of this cell so
# that re-executing it does not create `gift_ideas_x` / `gift_ideas_y`.
combined_df = (
    combined_df
    .drop(columns=['gift_ideas'], errors='ignore')
    .merge(google_expanded, on='date', how='left')
)

In [12]:
# Retail Sales Index: monthly figures carried forward to daily frequency.
retail = pd.read_csv("data/retail-sales-index.csv")
retail['mmm-yy'] = pd.to_datetime(retail['mmm-yy'], format='%b-%y')

# One output column per `type-of-prices` value. When several rows share the
# same month and price type, `aggfunc='first'` keeps the first non-null value.
# NOTE(review): if the file carries further dimensions, "first" may pick an
# arbitrary series — confirm against the data and filter before pivoting.
retail = retail.pivot_table(index='mmm-yy',
                            columns='type-of-prices',
                            values='v4_1',
                            aggfunc='first')

# Forward-fill directly onto the study calendar: each day takes the row of the
# latest month stamp on or before it. Days before the first observation become
# NaN instead of raising a KeyError. Naming the index `date` explicitly means
# `reset_index()` always yields the merge key, whatever the index was called.
retail = (
    retail
    .reindex(date_range, method='ffill')
    .rename_axis(index='date')
    .reset_index()
)

# Remove retail columns left by an earlier run of this cell so re-executing it
# does not create `_x` / `_y` suffixed duplicates.
combined_df = (
    combined_df
    .drop(columns=retail.columns.difference(['date']), errors='ignore')
    .merge(retail, on='date', how='left')
)


In [13]:
# Black Friday
# Boolean column: True on the listed Black Friday dates, False on every other day.
black_fridays = pd.DatetimeIndex(["2010-11-26", "2011-11-25"])
combined_df = combined_df.assign(
    is_black_friday=combined_df['date'].isin(black_fridays)
)

In [14]:
# Cyber Monday
# Boolean column: True on the listed Cyber Monday dates, False on every other day.
cyber_mondays = pd.DatetimeIndex(["2010-11-29", "2011-11-28"])
combined_df = combined_df.assign(
    is_cyber_monday=combined_df['date'].isin(cyber_mondays)
)

In [15]:
# Show the assembled frame as the cell output for a visual check of the
# merged columns and the two boolean flags.
combined_df

Unnamed: 0,date,gift_ideas,chained-volume-percentage-change-3-months-on-same-period-a-year-earlier,chained-volume-percentage-change-on-previous-month,chained-volume-percentage-change-on-same-month-a-year-earlier,current-prices-percentage-change-on-previous-month,current-prices-percentage-change-on-same-month-a-year-earlier,is_black_friday,is_cyber_monday
0,2009-12-09,99,11.4,-1.0,9.4,-0.1,10.9,False,False
1,2009-12-10,99,11.4,-1.0,9.4,-0.1,10.9,False,False
2,2009-12-11,99,11.4,-1.0,9.4,-0.1,10.9,False,False
3,2009-12-12,99,11.4,-1.0,9.4,-0.1,10.9,False,False
4,2009-12-13,96,11.4,-1.0,9.4,-0.1,10.9,False,False
...,...,...,...,...,...,...,...,...,...
726,2011-12-05,100,8.8,-1.5,6.6,-2.0,9.4,False,False
727,2011-12-06,100,8.8,-1.5,6.6,-2.0,9.4,False,False
728,2011-12-07,100,8.8,-1.5,6.6,-2.0,9.4,False,False
729,2011-12-08,100,8.8,-1.5,6.6,-2.0,9.4,False,False


In [16]:
# Persist the assembled explanatory variables; the positional row index is
# not written to the file.
output_path = "combined_explanatory_variables.csv"
combined_df.to_csv(output_path, index=False)