# Data Processing for Explanatory Variables

In [44]:
import pandas as pd
import numpy as np

In [45]:
# Daily calendar for the study window, both endpoints included. This frame is
# the left-hand spine that every explanatory series below is attached to.
date_range = pd.date_range("2009-01-12", "2011-09-12", freq="D")
combined_df = pd.DataFrame(date_range, columns=["date"])

In [46]:
# Google Trends: weekly UK search interest for "gift ideas"
google = (
    pd.read_csv("data/google_trend_gift_ideas.csv")
    .rename(columns={'gift ideas: (United Kingdom)': 'gift_ideas'})
)
google['Week'] = pd.to_datetime(google['Week'])

# Stretch each weekly reading over the seven days starting at its 'Week'
# date: every row is repeated 7 times and shifted by 0..6 days.
google_expanded = pd.DataFrame({
    'date': (
        google['Week'].repeat(7)
        + pd.to_timedelta(np.tile(range(7), len(google)), unit='D')
    ),
    'gift_ideas': google['gift_ideas'].repeat(7).values,
})

# Keep only the days that belong to the study calendar, then attach the
# series to the daily frame.
google_expanded = google_expanded.loc[google_expanded['date'].isin(date_range)]
combined_df = combined_df.merge(google_expanded, how='left', on='date')

In [47]:
# UK Consumer Confidence Index
confidence = pd.read_csv("data/uk_consumer_confidence.csv")
confidence['observation_date'] = pd.to_datetime(confidence['observation_date'])
confidence = confidence.rename(columns={'CSCICP02GBM460S': 'consumer_confidence'})
confidence = confidence.set_index('observation_date')
# Forward-fill straight onto the daily calendar: each day takes the latest
# observation dated on or before it. This gives the same values as building a
# full daily index first and slicing it with .loc, but days that precede the
# first observation become NaN instead of raising KeyError.
confidence = confidence.reindex(date_range, method='ffill')
# Name the index before resetting it so the join key is called 'date'
# explicitly rather than by overwriting the column labels positionally.
confidence = confidence.rename_axis('date').reset_index()
combined_df = combined_df.merge(confidence, on='date', how='left')

In [48]:
# Retail Sales Index
retail = pd.read_csv("data/retail_sales_index.csv")
# 'mmm-yy' holds month-year labels parsed with the '%b-%y' format.
retail['mmm-yy'] = pd.to_datetime(retail['mmm-yy'], format='%b-%y')
# Long -> wide: one column per 'type-of-prices' value, one row per parsed
# month; aggfunc='first' collapses any repeated (month, type) pair.
retail = retail.pivot_table(index='mmm-yy',
                            columns='type-of-prices',
                            values='v4_1',
                            aggfunc='first')
# Forward-fill straight onto the daily calendar: each day takes the latest
# row dated on or before it. This gives the same values as reindexing to a
# full daily range and slicing with .loc, but days that precede the first row
# become NaN instead of raising KeyError.
retail = retail.reindex(date_range, method='ffill')
# Name the index explicitly so the join key does not depend on reset_index
# falling back to the default 'index' label.
retail = retail.rename_axis(index='date').reset_index()
combined_df = combined_df.merge(retail, on='date', how='left')


In [49]:
# UK Holidays
holidays = pd.read_csv("data/uk_holiday.csv")
holidays['date'] = pd.to_datetime(holidays['date'])
holidays['is_holiday'] = True
holidays = holidays[['date', 'is_holiday']]
# Flag holiday dates with a membership test, the same way the Black Friday
# and Cyber Monday cells build their flags. Unlike merge + fillna(False), this
# always yields a plain bool column (no object-dtype downcasting
# FutureWarning), cannot add rows when the CSV lists a date more than once,
# and gives the same result when the cell is re-run.
combined_df['is_holiday'] = combined_df['date'].isin(holidays['date'])


In [50]:
# Black Friday: mark the two occurrences that fall inside the study window
black_fridays = pd.to_datetime([
    "2009-11-27",
    "2010-11-26",
])
combined_df["is_black_friday"] = combined_df["date"].isin(black_fridays)

In [51]:
# Cyber Monday: mark the two occurrences that fall inside the study window
cyber_mondays = pd.to_datetime([
    "2009-11-30",
    "2010-11-29",
])
combined_df["is_cyber_monday"] = combined_df["date"].isin(cyber_mondays)

In [52]:
# Display the assembled daily frame (date spine plus every explanatory column
# added above) as the cell output for a visual sanity check.
combined_df

Unnamed: 0,date,gift_ideas,consumer_confidence,chained-volume-percentage-change-3-months-on-same-period-a-year-earlier,chained-volume-percentage-change-on-previous-month,chained-volume-percentage-change-on-same-month-a-year-earlier,current-prices-percentage-change-on-previous-month,current-prices-percentage-change-on-same-month-a-year-earlier,is_holiday,is_black_friday,is_cyber_monday
0,2009-01-12,14,-30.1,3.6,-2.8,1.1,-4.5,0.9,False,False,False
1,2009-01-13,14,-30.1,3.6,-2.8,1.1,-4.5,0.9,False,False,False
2,2009-01-14,14,-30.1,3.6,-2.8,1.1,-4.5,0.9,False,False,False
3,2009-01-15,14,-30.1,3.6,-2.8,1.1,-4.5,0.9,False,False,False
4,2009-01-16,14,-30.1,3.6,-2.8,1.1,-4.5,0.9,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
969,2011-09-08,21,-24.6,7.2,-0.8,8.3,-0.3,13.7,False,False,False
970,2011-09-09,21,-24.6,7.2,-0.8,8.3,-0.3,13.7,False,False,False
971,2011-09-10,21,-24.6,7.2,-0.8,8.3,-0.3,13.7,False,False,False
972,2011-09-11,22,-24.6,7.2,-0.8,8.3,-0.3,13.7,False,False,False


In [53]:
# Persist the assembled daily table; index=False keeps the integer row index
# out of the written file.
combined_df.to_csv(
    path_or_buf="combined_explanatory_variables.csv",
    index=False,
)