In [1]:
import datetime
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import datetime as dt
import calendar

%matplotlib inline

# Load Data

In [2]:
df_air_reserve =       pd.read_csv("../data/raw/air_reserve.csv", parse_dates=['visit_datetime', 'reserve_datetime'])
df_air_store_info =    pd.read_csv("../data/raw/air_store_info.csv")
df_air_visit_data =    pd.read_csv("../data/raw/air_visit_data.csv", parse_dates=['visit_date'])

df_hpg_reserve =       pd.read_csv("../data/raw/hpg_reserve.csv", parse_dates=['visit_datetime', 'reserve_datetime'])
df_hpg_store_info =    pd.read_csv("../data/raw/hpg_store_info.csv")

df_date_info =         pd.read_csv("../data/raw/date_info.csv", parse_dates=['calendar_date'])
df_store_id_relation = pd.read_csv("../data/raw/store_id_relation.csv")

df_sample_submission = pd.read_csv("../data/raw/sample_submission.csv")

# Overview: File structure and content

### Air Visits

In [3]:
df_air_visit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252108 entries, 0 to 252107
Data columns (total 3 columns):
air_store_id    252108 non-null object
visit_date      252108 non-null datetime64[ns]
visitors        252108 non-null int64
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 5.8+ MB


In [4]:
df_air_visit_data.head()

Unnamed: 0,air_store_id,visit_date,visitors
0,air_ba937bf13d40fb24,2016-01-13,25
1,air_ba937bf13d40fb24,2016-01-14,32
2,air_ba937bf13d40fb24,2016-01-15,29
3,air_ba937bf13d40fb24,2016-01-16,22
4,air_ba937bf13d40fb24,2016-01-18,6


### Air Reserve

In [5]:
df_air_reserve.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92378 entries, 0 to 92377
Data columns (total 4 columns):
air_store_id        92378 non-null object
visit_datetime      92378 non-null datetime64[ns]
reserve_datetime    92378 non-null datetime64[ns]
reserve_visitors    92378 non-null int64
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 2.8+ MB


In [6]:
df_air_reserve.head()

Unnamed: 0,air_store_id,visit_datetime,reserve_datetime,reserve_visitors
0,air_877f79706adbfb06,2016-01-01 19:00:00,2016-01-01 16:00:00,1
1,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,3
2,air_db4b38ebe7a7ceff,2016-01-01 19:00:00,2016-01-01 19:00:00,6
3,air_877f79706adbfb06,2016-01-01 20:00:00,2016-01-01 16:00:00,2
4,air_db80363d35f10926,2016-01-01 20:00:00,2016-01-01 01:00:00,5


### Air Store Info

In [None]:
df_air_reserve.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92378 entries, 0 to 92377
Data columns (total 4 columns):
air_store_id        92378 non-null object
visit_datetime      92378 non-null datetime64[ns]
reserve_datetime    92378 non-null datetime64[ns]
reserve_visitors    92378 non-null int64
dtypes: datetime64[ns](2), int64(1), object(1)
memory usage: 2.8+ MB


In [None]:
df_air_store_info.head()

### HPG Reserve

In [None]:
df_hpg_reserve.info()

In [None]:
df_hpg_reserve.head()

### HPG Store Info

In [None]:
df_hpg_store_info.info()

In [None]:
df_hpg_store_info.head()

### Date Info

In [None]:
df_date_info.info()

In [None]:
df_date_info.head()

### Sample Submission

In [None]:
df_sample_submission.info()

In [None]:
df_sample_submission.head()

# Feature Visualization

#### DATA PREPROCESSING

In [None]:
# set DateTime index to perform visualizations
df_date_info.set_index('calendar_date', inplace=True)

df_holidays = df_date_info[df_date_info['holiday_flg'] == True]

In [None]:
df_holidays.head()

## Air Visits

In [None]:
# set DateTime index to perform visualizations
df_air_visit_data.set_index('visit_date', inplace=True)

#### STATISTICS ON NUMBER OF VISITORS PER DAY IN THE WHOLE TIME FRAME

In [None]:
# display statistics per day
visits_daily = df_air_visit_data.resample('D').sum()
ax = visits_daily.plot(figsize=(18,6), title='Total number of visitors per day (full training set) and holidays');

# display holidays
for index, row in df_holidays.iterrows():
    ax.axvline(x=index, color='b', linewidth=0.5, label='holidays');

# plot styling
ax.set_xlabel("Date")
ax.set_ylabel("# of visitors");

ax.xaxis.grid(True, 'major')
ax.xaxis.grid(True, 'minor')
    
ax.legend(['visitors', 'holidays']);

#### STATISTICS ON NUMBER OF VISITORS PER MONTH

In [None]:
# display statistics per month
visits_monthly = df_air_visit_data.resample('M').sum()
ax = visits_monthly.plot(figsize=(18,6), title='Total number of visitors per month (full training set)');

ax.set_xlabel("Date")
ax.set_ylabel("# of visitors");

ax.xaxis.grid(True, 'major')
ax.xaxis.grid(True, 'minor')

#### MOST COMMON NUMBER OF VISITORS PER RESTAURANT PER DAY

In [None]:
# determine number of bins to use
bin_counts = np.bincount(df_air_visit_data['visitors'])

# display bins
ax = df_air_visit_data['visitors'].hist(bins=len(bin_counts), figsize=(18,6), color='c', edgecolor='black');

# display mode (most common value)
ax.axvline(x=np.argmax(bin_counts), color='b', linewidth=0.5, linestyle='dashed');
# display median
ax.axvline(x=df_air_visit_data['visitors'].median(), color='r', linewidth=0.5, linestyle='dashed');

# plot styling
ax.set_xscale('log')

ax.set_title("Number of visitors in restaurants per day")
ax.set_xlabel("Visitors per day")
ax.set_ylabel("Count");

ax.legend(['Most common number', 'Median']);

In [None]:
print("Most common number of visitors in restaurants per day is {0} (occurred {1} times)." \
          .format(np.argmax(bin_counts), np.max(bin_counts)))

#### STATISTICS ON NUMBER OF VISITORS PER RESTAURANT PER WEEKDAY

In [None]:
# create new columns
df_air_visit_data['visit_date'] = df_air_visit_data.index
df_air_visit_data['weekday'] = df_air_visit_data['visit_date'].dt.dayofweek
df_air_visit_data['weekday_name'] = df_air_visit_data['visit_date'].dt.weekday_name

# group data by weekday
visits_per_weekday = \
    df_air_visit_data[['weekday', 'weekday_name', 'visitors']] \
        .groupby(['weekday', 'weekday_name']) \
        .agg(['median']) \
        .sort_index(ascending=True)

# display data
ax = visits_per_weekday.plot(kind='bar')

# plot styling
ax.set_title("Median number of visitors in restaurant per day of week")
ax.set_xlabel("Days of week")
ax.set_ylabel("Median visitors");
ax.legend_.remove()

ax.set_xticklabels(list(calendar.day_name), rotation=45);
ax.axhline(y=df_air_visit_data['visitors'].median(), color='r', linewidth=0.5, linestyle='dashed');