# Data preparation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Sales/traffic Data

### Merge (if not done before)

In [None]:
# sales_clean = pd.read_csv('data/1208_LasVegas_South_sales_cleaned.csv')

In [None]:
# sales_clean.head(3)

In [None]:
# sales_clean.date = pd.to_datetime(sales_clean.date, format='%m/%d/%y')
# sales_clean = sales_clean[['date', 'sales_original', 'sales_cleaned']]

In [None]:
# traffic_clean = pd.read_csv('data/1208_LasVegas_South_traffic_cleaned.csv')

In [None]:
# traffic_clean.head(3)

In [None]:
# traffic_clean.date = pd.to_datetime(traffic_clean.date, format='%m/%d/%y')
# traffic_clean = traffic_clean[['date', 'traffic_original', 'traffic_cleaned']]

In [None]:
# assert sales_clean.shape[0] == traffic_clean.shape[0]

In [None]:
# sales_traffic_clean = pd.merge(sales_clean, traffic_clean, on='date')

In [None]:
# sales_traffic_clean.info()

### Read data (if merged before)

In [2]:
# Store under analysis. CITY_NAME is used to build the events file name and
# STORE_NAME the sales/traffic file name; STORE_NAME is also the key into the
# store-coordinate tables.
CITY_NAME = 'ORLANDO'
STORE_NAME = 'ORLANDO FOA'

In [3]:
# Load the previously merged sales/traffic table for the selected store.
sales_traffic_clean = pd.read_csv('sales_traffic_' + STORE_NAME + '.csv')

In [4]:
# Preview the first three rows of the sales/traffic table.
sales_traffic_clean.head(3)

Unnamed: 0,date,sales_original,sales_cleaned,traffic_original,traffic_cleaned
0,2017-04-02,63087.03,3.69895,21760,-14.127418
1,2017-04-03,60186.03,-9.06278,22440,1.867672
2,2017-04-04,56885.94,12.08802,22408,2.918414


In [5]:
# Parse the ISO-formatted date strings into datetime64 values.
sales_traffic_clean['date'] = pd.to_datetime(
    sales_traffic_clean['date'],
    format='%Y-%m-%d',
)

In [6]:
# Constant date values: first and last day present in the sales/traffic
# table, each truncated to midnight.
DATE_MIN = sales_traffic_clean['date'].min().normalize()
DATE_MAX = sales_traffic_clean['date'].max().normalize()

In [7]:
# Show the date window covered by the sales/traffic table.
DATE_MIN, DATE_MAX

(Timestamp('2017-04-02 00:00:00'), Timestamp('2019-09-28 00:00:00'))

## Events Data

In [8]:
# Constant location for stores
# Two lookup tables keyed by store name: latitude and longitude.
LAT_STORE = {
    'ORLANDO FOA': 28.473595,
    'LAKE BUENA VISTA FOA': 28.387852,
    'LANCASTER FSC': 40.025636,
    'LAS VEGAS NORTH': 36.170727,
    'LAS VEGAS SOUTH': 36.056725,
}

LON_STORE = {
    'ORLANDO FOA': -81.451615,
    'LAKE BUENA VISTA FOA': -81.493674,
    'LANCASTER FSC': -76.217167,
    'LAS VEGAS NORTH': -115.157651,
    'LAS VEGAS SOUTH': -115.170121,
}

In [9]:
# Load the raw events table for the selected city.
events = pd.read_csv('events_' + CITY_NAME + '.csv')

In [10]:
# Preview the first five raw events.
events.head(5)

Unnamed: 0,id,title,description,start,end,predicted_end,duration,labels,category,timezone,...,location,venue_name,venue_formatted_address,scope,rank,local_rank,aviation_rank,state,first_seen,venue_type
0,tNCvM9eaqq7KgfvtaC,Screamin' Green Hauntoween,"""Spooky and colorful family fun awaits during ...",2016-10-08T14:00:00Z,2016-10-09T00:00:00Z,,36000,"""attraction,community,family""",community,America/New_York,...,"""28.444559,-81.391691""",Crayola Experience,8001 S Orange Blossom Trl\nSouth Orange Blosso...,locality,24,45.0,,active,2018-01-31T17:54:29Z,indoor
1,lxgv6VgLXele,Moderate Delays - Orlando International Airpor...,,2016-10-08T21:30:00Z,2016-10-09T00:30:00Z,,10800,"""airport,delay""",airport-delays,America/New_York,...,"""28.43126,-81.307509""",Hyatt Regency Orlando International Airport,"9300 Jeff Fuqua Boulevard\nOrlando, FL 32827\n...",locality,40,,,active,2016-10-08T01:55:49Z,indoor
2,030e6833f057d76d21,Leif Erikson Day,"""Leif Erikson Day honors the first Scandinavia...",2016-10-09T00:00:00Z,2016-10-09T23:59:59Z,,86399,"""holiday,observance""",observances,,...,"""37.09024,-95.712891""",Leif Erikson Day,,country,50,,0.0,active,2015-01-06T12:35:08Z,outdoor
3,8MwekkdMl77j,Henry Rollins (21+),,2016-10-09T00:00:00Z,2016-10-09T00:00:00Z,,0,"""concert,music""",concerts,America/New_York,...,"""28.548701,-81.351256""",The Plaza Live,"425 North Bumby Avenue\nOrlando, FL 32803\nUni...",locality,49,72.0,0.0,active,2016-09-13T04:56:15Z,indoor
4,D5jGBJZzGdpN,The Illusionists,,2016-10-09T00:00:00Z,2016-10-09T00:00:00Z,,0,"""performing-arts""",performing-arts,America/New_York,...,"""28.537531,-81.376895""",Dr. Phillips Center - Walt Disney Theater,"445 South Magnolia Avenue\nOrlando, FL 32801\n...",locality,59,77.0,,active,2016-06-10T21:57:50Z,indoor


In [11]:
# Column dtypes and non-null counts of the raw events table.
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25379 entries, 0 to 25378
Data columns (total 21 columns):
id                         25379 non-null object
title                      25379 non-null object
description                16916 non-null object
start                      25379 non-null object
end                        25379 non-null object
predicted_end              720 non-null object
duration                   25379 non-null int64
labels                     25379 non-null object
category                   25379 non-null object
timezone                   24388 non-null object
country                    25376 non-null object
location                   25379 non-null object
venue_name                 23715 non-null object
venue_formatted_address    22648 non-null object
scope                      25379 non-null object
rank                       25379 non-null int64
local_rank                 23376 non-null float64
aviation_rank              10960 non-null float64
state      

In [12]:
# Distinct event categories present in the raw events.
events['category'].unique()

array(['community', 'airport-delays', 'observances', 'concerts',
       'performing-arts', 'festivals', 'expos', 'sports', 'conferences',
       'severe-weather', 'daylight-savings', 'politics',
       'public-holidays', 'school-holidays', 'disasters', 'academic',
       'terror'], dtype=object)

In [13]:
# Distinct timezones present in the raw events (includes missing values).
events['timezone'].unique()

array(['America/New_York', nan, 'America/Chicago'], dtype=object)

In [14]:
import mpu

def draw_events_data(events, store_name, date_min=DATE_MIN, date_max=DATE_MAX, 
                     lat_store=LAT_STORE, lon_store=LON_STORE):
    """Expand raw events to one row per event-day and write the result to CSV.

    Every event is repeated once for each calendar day between its start and
    end day (inclusive) that falls inside ``[date_min, date_max]``. Each row
    is enriched with the distance to the store, an estimated capacity, two
    impact scores and an ``is_annual`` flag, then written to
    ``'events_' + store_name + '.csv'`` (without the index).

    The input frame is left untouched: all conversions are done on local
    values, so the function can be called repeatedly on the same ``events``
    object and always sees the original start/end timestamps.

    Parameters
    ----------
    events : pandas.DataFrame
        Raw events. Columns read: ``location`` (a ``"lat,lon"`` string,
        optionally wrapped in double quotes), ``start``, ``end``, ``title``,
        ``description``, ``labels``, ``category``, ``duration``,
        ``venue_name``, ``scope``, ``venue_type``, ``rank``, ``local_rank``
        and ``aviation_rank``.
    store_name : str
        Key into ``lat_store`` / ``lon_store``; also used in the output
        file name.
    date_min, date_max : timestamp-like
        Inclusive date window for the generated event-days.
    lat_store, lon_store : mapping
        Store name -> latitude / longitude.

    Returns
    -------
    None
        The result is only written to disk.

    Raises
    ------
    KeyError
        If ``store_name`` is missing from ``lat_store`` / ``lon_store`` or a
        required column is missing from ``events``.
    """
    km_per_mile = 1.609  # factor used to convert the haversine result (km) to miles

    # ---- Per-event values, derived without modifying the caller's frame ----

    # Split the "lat,lon" location string into two float series.
    coords = events['location'].str.strip('"').str.split(',')
    latitude = coords.str[0].astype(float)
    longitude = coords.str[1].astype(float)

    start = pd.to_datetime(events['start'])
    end = pd.to_datetime(events['end'])

    # am/pm flag from the hour of the parsed start timestamp.
    # NOTE(review): the hour is taken as parsed (the sample rows are 'Z'
    # timestamps) and is not converted using the `timezone` column -- confirm
    # this is intended.
    start_time = np.where(start.dt.hour >= 12, 'pm', 'am')

    # Calendar day (tz-naive, midnight) on which each event starts / ends.
    start_day = start.dt.tz_localize(None).dt.normalize()
    end_day = end.dt.tz_localize(None).dt.normalize()

    # ---- Expand every event to the days it covers inside the window ----

    # Generated days sit on midnight, so the usable window is the first
    # midnight >= date_min through the last midnight <= date_max.
    window_start = pd.Timestamp(date_min).ceil('D')
    window_end = pd.Timestamp(date_max).floor('D')

    first_day = start_day.where(start_day >= window_start, window_start)
    last_day = end_day.where(end_day <= window_end, window_end)
    has_dates = start_day.notna() & end_day.notna()

    # Number of in-window days per event; 0 when the event lies outside the
    # window, ends before it starts, or has an unparseable date.
    n_days = (((last_day - first_day).dt.days + 1)
              .where(has_dates, 0)
              .clip(lower=0)
              .astype(int)
              .values)

    # Positional index of the source event for every output row, plus the
    # day offset of that row within its event (0, 1, 2, ...).
    row_idx = np.repeat(np.arange(len(events)), n_days)
    block_start = np.cumsum(n_days) - n_days
    day_offset = np.arange(row_idx.size) - np.repeat(block_start, n_days)
    dates = first_day.values[row_idx] + day_offset.astype('timedelta64[D]')

    passthrough = ['title', 'description', 'labels', 'category', 'duration',
                   'venue_name', 'scope', 'venue_type',
                   'rank', 'local_rank', 'aviation_rank']
    per_event = events[passthrough].assign(start_time=start_time,
                                           longitude=longitude,
                                           latitude=latitude)

    # Build the expanded frame in one step instead of appending row by row.
    events_by_date = per_event.iloc[row_idx].reset_index(drop=True)
    events_by_date['date'] = dates
    events_by_date['year'] = events_by_date['date'].dt.year

    # ---- Aggregate new features ----

    # Distance in miles
    store_point = (lat_store[store_name], lon_store[store_name])
    distance_km = [mpu.haversine_distance((lat, lon), store_point)
                   for lat, lon in zip(events_by_date['latitude'],
                                       events_by_date['longitude'])]
    events_by_date['distance'] = np.asarray(distance_km, dtype=float) / km_per_mile

    rank = events_by_date['rank']
    distance = events_by_date['distance']

    # Estimated capacity: exp((rank + 19.769) / 9.6442) by reverse engineering
    events_by_date['est_capacity'] = np.exp((rank + 19.769) / 9.6442)

    # Impact: rank discounted by distance (linear) or by squared distance (exp).
    # A distance of exactly 0 yields inf.
    events_by_date['impact_linear'] = rank / distance
    events_by_date['impact_exp'] = np.sqrt(np.exp(rank)) / np.square(distance)

    # An event title counts as annual when it occurs in every calendar year
    # touched by the date window (3 years for 2017-04-02 .. 2019-09-28).
    n_window_years = window_end.year - window_start.year + 1
    years_seen = events_by_date.groupby('title')['year'].transform('nunique')
    events_by_date['is_annual'] = (years_seen == n_window_years).astype(int)

    # Reorganize the columns
    cols = ['title', 'description', 'labels', 'category',
            'date', 'year', 'is_annual', 'start_time', 'duration',
            'venue_name', 'scope', 'venue_type', 'est_capacity', 'distance',
            'longitude', 'latitude',
            'rank', 'local_rank', 'aviation_rank', 'impact_linear', 'impact_exp']
    events_by_date = events_by_date[cols]

    # For efficiency, write out the data
    events_by_date.to_csv('events_' + store_name + '.csv', index=False)

In [15]:
# Build the per-day events table for the selected store. The result is
# written to 'events_' + STORE_NAME + '.csv' rather than returned.
draw_events_data(events, STORE_NAME)

In [16]:
# Reload the per-day events table that draw_events_data wrote to disk.
events_by_date = pd.read_csv('events_' + STORE_NAME + '.csv')

In [17]:
# Parse the ISO-formatted date strings into datetime64 values.
events_by_date['date'] = pd.to_datetime(
    events_by_date['date'],
    format='%Y-%m-%d',
)

In [18]:
# Preview the first three rows of the per-day events table.
events_by_date.head(3)

Unnamed: 0,title,description,labels,category,date,year,is_annual,start_time,duration,venue_name,...,venue_type,est_capacity,distance,longitude,latitude,rank,local_rank,aviation_rank,impact_linear,impact_exp
0,2017 Florida Black Student Union Conference,"""On APRIL 1 - APRIL 2, 2017 the Florida BSU Co...","""community,conference""",conferences,2017-04-02,2017,0,pm,115200,,...,,670.800257,17.316583,-81.203304,28.596877,43,60.0,0.0,2.483169,7251152.0
1,Pity The Fools by The Humor Mill Orlando,"""The Humor Mill Orlando, Central Florida's Pre...","""performing-arts""",performing-arts,2017-04-02,2017,0,pm,5400,Orlando Shakespeare Theater,...,indoor,174.253473,8.605827,-81.366891,28.573423,30,55.0,,3.48601,44139.96
2,"Guest Artist: Allen and Laura Vizzutti, trumpe...","""Campus Location: Rehearsal Hall, Auditorium(1...","""campus,concert,music""",concerts,2017-04-02,2017,0,am,0,University of Central Florida,...,indoor,68.532467,17.677997,-81.20006,28.602427,21,38.0,,1.187917,116.2053


In [19]:
# Column dtypes and non-null counts of the per-day events table.
events_by_date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23801 entries, 0 to 23800
Data columns (total 21 columns):
title            23801 non-null object
description      12745 non-null object
labels           23801 non-null object
category         23801 non-null object
date             23801 non-null datetime64[ns]
year             23801 non-null int64
is_annual        23801 non-null int64
start_time       23801 non-null object
duration         23801 non-null int64
venue_name       21405 non-null object
scope            23801 non-null object
venue_type       21405 non-null object
est_capacity     23801 non-null float64
distance         23801 non-null float64
longitude        23801 non-null float64
latitude         23801 non-null float64
rank             23801 non-null int64
local_rank       18684 non-null float64
aviation_rank    11071 non-null float64
impact_linear    23801 non-null float64
impact_exp       23801 non-null float64
dtypes: datetime64[ns](1), float64(8), int64(4), object(8)
m