# Data preparation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# City whose events file is loaded, and the store within it being analysed.
CITY_NAME, STORE_NAME = 'LAS VEGAS', 'LAS VEGAS SOUTH'

## Sales/traffic Data

### Merge (if not done before)

In [None]:
# sales_clean = pd.read_csv('data/1208_LasVegas_South_sales_cleaned.csv')

In [None]:
# sales_clean.head(3)

In [None]:
# sales_clean.date = pd.to_datetime(sales_clean.date, format='%m/%d/%y')
# sales_clean = sales_clean[['date', 'sales_original', 'sales_cleaned']]

In [None]:
# traffic_clean = pd.read_csv('data/1208_LasVegas_South_traffic_cleaned.csv')

In [None]:
# traffic_clean.head(3)

In [None]:
# traffic_clean.date = pd.to_datetime(traffic_clean.date, format='%m/%d/%y')
# traffic_clean = traffic_clean[['date', 'traffic_original', 'traffic_cleaned']]

In [None]:
# assert sales_clean.shape[0] == traffic_clean.shape[0]

In [None]:
# sales_traffic_clean = pd.merge(sales_clean, traffic_clean, on='date')

In [None]:
# sales_traffic_clean.info()

### Read data (if merged before)
- Assumes the sales/traffic data file is named `'sales_traffic_' + STORE_NAME + '.csv'`
- and has the columns `date`, `sales_original`, `sales_cleaned`, `traffic_original`, `traffic_cleaned`

In [None]:
# Load the previously merged sales/traffic table for the selected store.
sales_traffic_clean = pd.read_csv('sales_traffic_{}.csv'.format(STORE_NAME))

In [None]:
# Parse the ISO-formatted (YYYY-MM-DD) date strings into datetimes.
sales_traffic_clean['date'] = pd.to_datetime(
    sales_traffic_clean['date'],
    format='%Y-%m-%d',
)

In [None]:
# First and last dates present in the sales/traffic data, truncated to
# midnight; used as the default date window for the events data.
DATE_MIN = sales_traffic_clean['date'].min().normalize()
DATE_MAX = sales_traffic_clean['date'].max().normalize()

In [None]:
# Display the date window covered by the sales/traffic data.
DATE_MIN, DATE_MAX

## Events Data

- Assumes the events data file is named `'events_' + CITY_NAME + '.csv'`
- and contains the raw columns from PredictHQ plus the generated `venue_type` column

In [None]:
# Fixed store coordinates, keyed by store name.
LAT_STORE = {
    'ORLANDO FOA': 28.473595,
    'LAKE BUENA VISTA FOA': 28.387852,
    'LANCASTER FSC': 40.025636,
    'LAS VEGAS NORTH': 36.170727,
    'LAS VEGAS SOUTH': 36.056725,
}

LON_STORE = {
    'ORLANDO FOA': -81.451615,
    'LAKE BUENA VISTA FOA': -81.493674,
    'LANCASTER FSC': -76.217167,
    'LAS VEGAS NORTH': -115.157651,
    'LAS VEGAS SOUTH': -115.170121,
}

In [None]:
# Load the city-level events table.
events = pd.read_csv('events_{}.csv'.format(CITY_NAME))

In [None]:
import mpu

def draw_events_data(events, store_name, date_min=DATE_MIN, date_max=DATE_MAX,
                     lat_store=LAT_STORE, lon_store=LON_STORE):
    """Expand events to one row per event-day and write them to a CSV file.

    Each event is repeated once for every calendar day between its start and
    end date (inclusive) that falls inside ``[date_min, date_max]``.  Per-row
    features relative to the given store are added and the result is written
    to ``'events_' + store_name + '.csv'`` (without the index).

    Parameters
    ----------
    events : pandas.DataFrame
        Events table.  Must provide ``location`` (a ``"a,b"`` string whose
        first number is used as latitude and second as longitude), ``start``,
        ``end``, ``rank`` and ``title``, plus every column listed in the
        output selection at the end of this function.  The frame passed in
        is not modified.
    store_name : str
        Key into ``lat_store`` / ``lon_store``; also used in the output
        file name.
    date_min, date_max : datetime-like
        Inclusive bounds of the date window to keep.
    lat_store, lon_store : dict
        Store latitude / longitude keyed by store name.

    Returns
    -------
    None
        The result is only written to disk.

    Raises
    ------
    KeyError
        If ``store_name`` is missing from ``lat_store`` / ``lon_store`` or a
        required column is missing from ``events``.
    """
    # A title counts as "annual" when it occurs in exactly this many distinct
    # calendar years.
    # NOTE(review): 3 presumably matches the number of years the data spans --
    # confirm before using on a different date range.
    annual_year_count = 3

    # Work on a copy so the caller's DataFrame is left untouched.
    events = events.copy()

    # Split the location string into its two coordinates.
    coords = events['location'].str.strip('"').str.split(',')
    events['latitude'] = coords.str[0].astype(float)
    events['longitude'] = coords.str[1].astype(float)

    starts = pd.to_datetime(events['start'])
    ends = pd.to_datetime(events['end'])

    # Start time of day (am/pm), taken from the hour of the parsed timestamp.
    # NOTE(review): the hour is read before any timezone information is
    # stripped, so it is in whatever zone the raw `start` values carry --
    # confirm this is the intended (local) time.
    events['start_time'] = np.where(starts.dt.hour >= 12, 'pm', 'am')

    # Calendar days (timezone dropped, time-of-day truncated) spanned by
    # each event.
    first_days = starts.dt.tz_localize(None).dt.normalize()
    last_days = ends.dt.tz_localize(None).dt.normalize()

    # Raw columns that are not needed downstream.
    events = events.drop(
        columns=['id', 'start', 'end', 'predicted_end',
                 'timezone', 'country', 'location',
                 'venue_formatted_address', 'state', 'first_seen'],
        errors='ignore',
    )

    # Collect, for every event, the days inside the window.  Row positions
    # are gathered first and the frame is built once at the end; growing a
    # DataFrame row by row is quadratic and `DataFrame.append` no longer
    # exists in pandas >= 2.0.
    window_start, window_end = pd.Timestamp(date_min), pd.Timestamp(date_max)
    row_positions, event_dates = [], []
    for pos, (first_day, last_day) in enumerate(zip(first_days, last_days)):
        days = pd.date_range(first_day, last_day)
        days = days[(days >= window_start) & (days <= window_end)]
        row_positions.extend([pos] * len(days))
        event_dates.extend(days)

    events_by_date = events.iloc[row_positions].reset_index(drop=True)
    # `to_datetime` keeps the column datetime-typed even when no event falls
    # inside the window.
    events_by_date['date'] = pd.to_datetime(event_dates)

    # Distance to the store: haversine distance in km, converted to miles.
    store_coords = (lat_store[store_name], lon_store[store_name])
    km_to_store = [
        mpu.haversine_distance((lat, lon), store_coords)
        for lat, lon in zip(events_by_date['latitude'],
                            events_by_date['longitude'])
    ]
    events_by_date['distance'] = np.array(km_to_store, dtype=float) / 1.609

    # Estimated capacity: exp((rank + 19.769) / 9.6442), by reverse engineering.
    events_by_date['est_capacity'] = np.exp(
        (events_by_date['rank'] + 19.769) / 9.6442)

    # Linear impact: rank per mile.  An alternative "exp" impact,
    # sqrt(exp(rank)) / distance ** 2, was explored but is not emitted.
    events_by_date['impact_linear'] = (
        events_by_date['rank'] / events_by_date['distance'])

    # Flag titles that occur in `annual_year_count` distinct years.
    events_by_date['year'] = events_by_date['date'].dt.year
    is_annual = (
        events_by_date.groupby('title')['year'].nunique()
        .eq(annual_year_count).astype(int)
        .rename('is_annual').reset_index()
    )
    events_by_date = pd.merge(events_by_date, is_annual, on='title')

    # Reorganize the columns
    cols = ['title', 'description', 'labels', 'category',
            'date', 'year', 'is_annual', 'start_time', 'duration',
            'venue_name', 'scope', 'venue_type', 'est_capacity', 'distance',
            'longitude', 'latitude',
            'rank', 'local_rank', 'aviation_rank', 'impact_linear']
    events_by_date = events_by_date[cols]

    # For efficiency, write out the data
    events_by_date.to_csv('events_' + store_name + '.csv', index=False)

In [None]:
# Build and write the per-day events file for the selected store.
draw_events_data(events, store_name=STORE_NAME)