In [301]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings('ignore')

In [302]:
import seaborn as sns
sns.set_palette('Set2')

import matplotlib.pyplot as plt
%matplotlib inline

In [303]:
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVC, SVC

In [304]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()

In [5]:
import requests
import json
import datetime
import time

In [6]:
import os
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

In [7]:
from config import yelp_api_key
from config import darksky_api_key

In [8]:
# Import Sales Data
file = 'bar_x_sales_export.csv'

In [9]:
# Analysis Dates
start_date = '2017-01-01' # Start Date Inclusive
end_date = '2019-06-01' # End Date Exclusive

In [10]:
search_business = 'Jupiter Disco'
location = 'Brooklyn, NY'

## Import / Clean / Prep File

In [11]:
def import_parse(file):

    data = pd.read_csv(file, index_col = 'date', parse_dates=True)
    df = pd.DataFrame(data)
    
    # Rename Column to 'sales'
    df = df.rename(columns={df.columns[0]: 'sales'})
    
    # Drop NaN
    #df = df.query('sales > 0').copy()
    df.fillna(0, inplace=True)
    
    print(f'"{file}" has been imported + parsed. The file has {len(df)} rows.')
    return df

df = import_parse(file);

"bar_x_sales_export.csv" has been imported + parsed. The file has 891 rows.


In [12]:
def filter_df(df, start_date, end_date):
    
    return df[(df.index > start_date) & (df.index < end_date)]
    
df = filter_df(df, start_date, end_date)

In [13]:
def daily_average_matrix_ann(df):
    
    matrix = df.groupby([df.index.dayofweek, df.index.month, df.index.year]).agg({'sales': 'mean'})
    matrix = matrix.rename_axis(['day', 'month', 'year'])
    return matrix.unstack(level=1)

daily_average_matrix_ann(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
Unnamed: 0_level_1,month,1,2,3,4,5,6,7,8,9,10,11,12
day,year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
0,2017,516.386,647.4825,1183.9425,491.205,543.816,555.365,1212.406,533.02,418.4125,529.06,735.8525,496.3925
0,2018,398.202,407.7975,1252.305,313.994,475.595,1573.685511,376.968794,690.178243,455.24116,902.62059,1154.696929,1207.864237
0,2019,449.1125,381.1075,625.6675,390.316,440.0725,,,,,,,
1,2017,878.554,688.02,819.74,780.165,727.966,729.1025,691.96,779.938,496.82,1074.068,631.415,585.6975
1,2018,808.34,1208.02,610.0475,775.2675,759.864,1676.605227,1065.680822,800.8968,715.204696,737.921757,903.176949,605.239133
1,2019,613.31,580.3975,686.095,934.63,815.6175,,,,,,,
2,2017,829.34,1149.6025,1090.166,858.72,955.394,830.735,637.6225,665.67,645.045,876.3275,1548.482,820.735
2,2018,725.454,1342.4775,717.55,995.05,776.454,1161.861426,1137.068591,971.995906,1286.541929,1667.758492,1003.392882,857.146338
2,2019,735.004,649.0875,1626.985,1120.8225,716.862,,,,,,,
3,2017,1355.0125,1357.96,908.736,1166.2575,1293.2225,1025.39,1074.6475,1182.468,1332.415,762.65,1256.718,1510.2475


## Pull Weather Data

### Latitude + Longitude from Yelp API

In [14]:
host = 'https://api.yelp.com'
path = '/v3/businesses/search'

search_limit = 10

# Yelp Authorization Header with API Key
headers = {
        'Authorization': 'Bearer {}'.format(yelp_api_key) 
    }

# Build Requests Syntax with Yelp Host and Path and URL Paramaters
# Return JSON response
def request(host, path, url_params=None):
    
    url_params = url_params or {}
    url = '{}{}'.format(host, path)

    response = requests.get(url, headers=headers, params=url_params)
    
    return response.json()

# Build URL Params for the Request and provide the host and path
def search(term, location):
    
    url_params = {
        'term': term.replace(' ', '+'),
        'location': location.replace(' ', '+'),
        'limit': search_limit
    }
    
    return request(host, path, url_params=url_params)

# Return Coordinates if Exact Match Found
def yelp_lat_long(business, location):
    
    # Call search function here with business name and location
    response = search(business, location)
    
    # Set state to 'No Match' in case no Yelp match found
    state = 'No Match'
    possible_matches = []
    
    # Check search returns for match wtith business
    for i in range(len(response['businesses'])):

        # If match found:
        if response['businesses'][i]['name'] == business:

            # Local variables to help navigate JSON return
            response_ = response['businesses'][0]
            name_ = response_['name']

            print(f'Coordinates found for {name_}')
            state = 'Match Found'
            #print(response['businesses'][0])
            return response_['coordinates']['latitude'], response_['coordinates']['longitude']

        else:
            
            # If no exact match, append all search returns to list
            possible_matches.append(response['businesses'][i]['name'])
    
    # If no match, show user potential matches
    if state == 'No Match':
        
        print('Exact match not found, did you mean one of the following? \n')
        
        for possible_match in possible_matches:
            print(possible_match)
            
        return None, None

lat, long = yelp_lat_long(search_business, location)
#print(f'Latitude: {lat}\nLongitude: {long}')

Coordinates found for Jupiter Disco


### Darksky API Call

In [15]:
# Create List of Dates of target Weather Data
def find_dates(start_date, end_date):
    
    list_of_days = []
    daterange = pd.date_range(start_date, end_date)
    for single_date in daterange:
        list_of_days.append(single_date.strftime("%Y-%m-%d"))
    
    return list_of_days

In [16]:
# Create URL to make API Call
def build_url(api_key, lat, long, day):
    
    _base_url = 'https://api.darksky.net/forecast/'
    _time = 'T20:00:00'
    _url = f'{_base_url}{api_key}/{lat},{long},{day + _time}?America/New_York&exclude=flags'
    return _url

def make_api_call(url):
    
    r = requests.get(url)
    return r.json()

In [17]:
# Parse Data from API Call into Dictionaries
def parse_data(data):

    time = datetime.datetime.fromtimestamp(data['currently']['time']).strftime('%Y-%m-%d')
    
    try:
        entry = {'date': time,
                 'temperature': data['currently']['temperature'],
                 'apparent_temperature': data['currently']['apparentTemperature'],
                 'precip_prob': data['currently']['precipProbability'],
                 'summary': data['currently']['icon'],
                 'moonphase': data['daily']['data'][0]['moonPhase']}
    
    except KeyError:
        
        entry = {'date': time,
                 'temperature': 'NaN',
                 'apparent_temperature': 'NaN',
                 'precip_prob': 'NaN',
                 'summary': 'NaN',
                 'moonphase': 'NaN'}
    
    return entry

In [18]:
# Create List of Weather Data Dictionaries & Input Target Dates
def weather_call(start_date, end_date, _lat, _long):
    
    weather = []
    list_of_days = find_dates(start_date, end_date)
    
    for day in list_of_days:
        
        data = make_api_call(build_url(darksky_api_key, _lat, _long, day))
        
        weather.append(parse_data(data))
    
    return weather

result = weather_call(start_date, end_date, lat, long);

In [19]:
# Build DataFrame from List of Dictionaries
def build_weather_df(api_call_results):

    df = pd.DataFrame(api_call_results)

    df['date'] = pd.to_datetime(df['date'])
    df['day_of_week'] = df['date'].dt.weekday
    df.set_index('date', inplace=True)
    
    #Cast required columns as floats
    df[['temperature', 'apparent_temperature', 'precip_prob', 'moonphase']] = df[['temperature', 'apparent_temperature', 'precip_prob', 'moonphase']].astype(float)
    
    # Fill Temperature NaN with previous day
    df['temperature'].fillna(method='ffill',inplace=True)
    
    #Fill Apparent Temperature with previous day
    df['apparent_temperature'].fillna(method='ffill', inplace=True)
    
    #Fill moonphase with previous day
    df['moonphase'].fillna(method='ffill', inplace=True)
    
    #Fill Precipitation Probability with previous day
    df['precip_prob'].fillna(method='ffill', inplace=True)
    
    return df

weather_df = build_weather_df(result);

In [20]:
def join_sales_weather(sales_df, weather_df):
    
    df = pd.merge(sales_df, weather_df, how='left', on='date')
    return df

df = join_sales_weather(df, weather_df)

### CLEAN DATAFRAME HERE

In [371]:
dfx = df.copy(deep=True)

### Feature Engineering

In [372]:
# Check for Holiday & $0 in Sales
# Pass Other Closed Dates here
additional_closed_dates = ['2018-12-24', '2017-12-24', '2017-10-22']
closed_dates = [pd.to_datetime(date) for date in additional_closed_dates]

def add_features(df):
    
    try:
        # CLOSED FEATURE
        cal = calendar()
        # Local list of days with zero sales
        potential_closed_dates = df[df['sales'] == 0].index
        # Enocodes closed days with 1
        df['closed'] = np.where((((df.index.isin(potential_closed_dates)) & \
                                  (df.index.isin(cal.holidays(start_date, end_date)))) | df.index.isin(closed_dates)), 1, 0)
    except:
        # If no Sales Data available (Predict)
        df['closed'] = 0
    
    # POOR WEATHER FEATURE
    poor_weather_list = ['rain', 'snow', 'sleet']
    df['summary'] = df['summary'].apply(lambda x: x.split('-'))
    df['poor_weather'] = np.where(df['summary'].apply(lambda x: np.any(np.in1d(x, poor_weather_list))), 1, 0)
    df = df.drop(['summary'], axis=1)
    
    # THREE DAY WEEKEND FEATURE
    sunday_three_days = [date + pd.DateOffset(-1) for date in cal.holidays(start_date, end_date) if date.dayofweek == 0]
    df['sunday_three_day'] = np.where(df.index.isin(sunday_three_days), 1, 0)
    
    monday_three_days = [date for date in cal.holidays(start_date, end_date) if date.dayofweek == 0]
    df['monday_three_day'] = np.where(df.index.isin(monday_three_days), 1, 0)
    
    # MONTH
    df['month'] = df.index.month
    
    return df

dfx = add_features(dfx)

In [373]:
def add_dummies(df):
    
    daily_dummies = pd.get_dummies(df['day_of_week'], prefix='day')
    monthly_dummies = pd.get_dummies(df['month'], prefix='mo')
    
    df = pd.concat([df, daily_dummies, monthly_dummies], axis=1)
    df = df.drop(['day_of_week', 'month'], axis=1)
    
    return df

dfx = add_dummies(dfx)

In [374]:
apply_interactions = True

def add_interactions(df):
    
    if apply_interactions:
        
        for i in [col for col in df.columns if col.startswith('day_')]:
            
            col_name = i + '_X_' + 'poor_weather'
            
            df[col_name] = df[i] * df['poor_weather']
            
            for m in [col for col in df.columns if col.startswith('mo_')]:
                
                col_name = i + '_X_' + m
                
                df[col_name] = df[i] * df[m]
            
        return df

dfx = add_interactions(dfx)

## Test / Train / Split

In [375]:
def prepare_data(df):
    
    # Separate Target & Features
    y = df['sales']
    features = df.drop(['sales'], axis=1)
    
    # Test / Train / Split
    train_date_start = '2017-01-01'
    train_date_end = '2018-12-31'
    
    X_train = features[train_date_start:train_date_end]
    X_test = features[pd.to_datetime(train_date_end)+pd.DateOffset(1):]
    
    y_train = y[train_date_start:train_date_end]
    y_test = y[pd.to_datetime(train_date_end)+pd.DateOffset(1):]
    
    # Scale Data
    #scaler = StandardScaler()
    #X_scaler = scaler.fit(X_train)
    #X_train = pd.DataFrame(data = X_scaler.transform(X_train), columns=features.columns)
    #X_test = pd.DataFrame(data = X_scaler.transform(X_test), columns=features.columns)
    
    print('Train set: ', len(X_train))
    print('Test set: ', len(X_test))
    
    return X_train, X_test, y_train, y_test
    
X_train, X_test, y_train, y_test = prepare_data(dfx)

Train set:  729
Test set:  151


In [376]:
X_train.columns

Index(['apparent_temperature', 'moonphase', 'precip_prob', 'temperature',
       'closed', 'poor_weather', 'sunday_three_day', 'monday_three_day',
       'day_0', 'day_1',
       ...
       'day_6_X_mo_3', 'day_6_X_mo_4', 'day_6_X_mo_5', 'day_6_X_mo_6',
       'day_6_X_mo_7', 'day_6_X_mo_8', 'day_6_X_mo_9', 'day_6_X_mo_10',
       'day_6_X_mo_11', 'day_6_X_mo_12'],
      dtype='object', length=118)

In [377]:
cols_to_drop = ['temperature', 'moonphase', 'poor_weather', 'precip_prob', 'day_0', 'day_0_X_poor_weather', \
                'mo_1', 'mo_2', 'mo_3', 'mo_4', 'mo_5', 'mo_6', 'mo_7', 'mo_8', 'mo_9', 'mo_10', 'mo_11', 'mo_12']

def feature_selection(df):
    
    df = df.drop(cols_to_drop, axis=1)
    
    return df

X_train = feature_selection(X_train)
X_test = feature_selection(X_test)

In [378]:
def linear_regression_model(X_train, y_train):
    
    lr = LinearRegression()
    lr_clf = lr.fit(X_train, y_train)
    
    return lr_clf

In [379]:
lr_clf = linear_regression_model(X_train, y_train)

In [380]:
def lr_score(clf, X_test, y_test):
    
    score = clf.score(X_test, y_test)
    
    print('R-Squared:  ', score, '\n')
    print('Intercept:  ', clf.intercept_, '\n')
    print('Coefficients:  \n')
    
    for index, col_name in enumerate(X_test.columns):
        print(col_name, ' --> ', clf.coef_[index])
        
lr_score(lr_clf, X_test, y_test)

R-Squared:   0.8436690996275116 

Intercept:   17708763312560.555 

Coefficients:  

apparent_temperature  -->  4.879601897614076
closed  -->  -1021.2742732514957
sunday_three_day  -->  1576.0228300451497
monday_three_day  -->  -74.7353438553896
day_1  -->  -29969550086285.234
day_2  -->  -3985240646274.81
day_3  -->  -22746200452145.2
day_4  -->  -24317866504509.387
day_5  -->  -9823504718241.625
day_6  -->  -14328158390113.74
day_0_X_mo_1  -->  -17708763312091.63
day_0_X_mo_2  -->  -17708763312200.01
day_0_X_mo_3  -->  -17708763311545.273
day_0_X_mo_4  -->  -17708763312402.72
day_0_X_mo_5  -->  -17708763312356.617
day_0_X_mo_6  -->  -17708763311868.59
day_0_X_mo_7  -->  -17708763312130.15
day_0_X_mo_8  -->  -17708763312337.246
day_0_X_mo_9  -->  -17708763312459.4
day_0_X_mo_10  -->  -17708763312158.008
day_0_X_mo_11  -->  -17708763311834.164
day_0_X_mo_12  -->  -17708763311617.7
day_1_X_poor_weather  -->  -13.4794921875
day_1_X_mo_1  -->  12260786774421.73
day_1_X_mo_2  -->  12260786

In [381]:
lassoReg = Lasso(alpha=0.01)

lassoReg.fit(X_train,y_train)

pred = lassoReg.predict(X_test)
lassoReg.score(X_test, y_test)

lassoReg.intercept_

for index, col_name in enumerate(X_test.columns):
        print(col_name, ' --> ', lassoReg.coef_[index])

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

0.8430598985191586

1137.3937978320803

apparent_temperature  -->  4.817462691449859
closed  -->  -1018.6840342538887
sunday_three_day  -->  1566.788698505379
monday_three_day  -->  -76.46194538091541
day_1  -->  -606.174467791221
day_2  -->  -507.2404257023619
day_3  -->  -199.79408513745355
day_4  -->  2025.292803086378
day_5  -->  2339.1329633955447
day_6  -->  -672.3578217030828
day_0_X_mo_1  -->  -691.8377044325637
day_0_X_mo_2  -->  -773.4832265490014
day_0_X_mo_3  -->  -110.1084392617222
day_0_X_mo_4  -->  -997.6313236323025
day_0_X_mo_5  -->  -907.2664349055415
day_0_X_mo_6  -->  -431.2227858661394
day_0_X_mo_7  -->  -727.9790354962864
day_0_X_mo_8  -->  -899.671313614318
day_0_X_mo_9  -->  -1030.6712421818154
day_0_X_mo_10  -->  -701.3394340172364
day_0_X_mo_11  -->  -402.84585414782975
day_0_X_mo_12  -->  -188.22841547830964
day_1_X_poor_weather  -->  -11.971208536692805
day_1_X_mo_1  -->  155.5569242873423
day_1_X_mo_2  -->  216.5508218765768
day_1_X_mo_3  -->  -0.0
day_1_X_mo_4  -->  5.337757606444567
day_1_X_mo_

In [370]:
def predict_df(clf, X_train, date_1, date_2):
    
    lat, long = yelp_lat_long(search_business, location)
    
    weather_df = build_weather_df(weather_call(date_1, date_2, lat, long))
    df = add_interactions(add_dummies(add_features(weather_df)))
    
    missing_cols = set(X_train.columns) - set(df.columns)
    
    for c in missing_cols:
        df[c] = 0
        
    df = df[X_train.columns]

    return clf.predict(df)

predict_df(lassoReg, X_train, pd.datetime.now().date(), pd.datetime.now().date() + pd.DateOffset(6))

Coordinates found for Jupiter Disco


array([3469.99475442, 3884.86026832,  638.55936849, 1052.75432309,
       1194.14388638,  988.22148534, 1128.13830539])