# Data 

The data used in this analysis comes from the Production tab of the Daily Report workbooks.

# Goals

- 

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import glob
import re
import string

# Show every row and column when a frame is displayed in the notebook.
# The fully-qualified 'display.*' keys are used on purpose: the bare
# 'max_rows' / 'max_columns' patterns also match the 'styler.render.*'
# options in newer pandas releases and are rejected as ambiguous.
pd.set_option('display.max_rows', 99999999)
pd.set_option('display.max_columns', 99999999)

In [None]:
def extract_date_stl(file, year):
    '''
    Build the report date from a Daily Report file path.

    The month and day come from a ``<digits>-<digits>`` token (for example
    ``1-15``); the year is supplied by the caller.

    Parameters
    ----------
    file : path to the workbook; ``/`` and ``\\`` separators are both handled.
    year : year of the report, as a string or an int.

    Returns
    -------
    datetime.date

    Raises
    ------
    ValueError
        If no month-day token is found, or the token is not a valid
        month/day for the given year.
    '''
    month_day_pattern = re.compile(r'[0-9]+-[0-9]+')
    path = str(file)

    # Prefer the file name itself so that digit-digit directory names
    # (e.g. "2016-2018") cannot be mistaken for, or glued onto, the date.
    file_name = re.split(r'[\\/]', path)[-1]
    matches = month_day_pattern.findall(file_name)
    if matches:
        month_day = matches[0]
    else:
        # Fall back to the whole path, taking the token closest to the file.
        matches = month_day_pattern.findall(path)
        if not matches:
            raise ValueError('No month-day token found in file path: {}'.format(file))
        month_day = matches[-1]

    return dt.strptime('{}-{}'.format(month_day, year), '%m-%d-%Y').date()

def extract_stl_production_tab(file, year):
    '''
    Load the 'Production' sheet of one Daily Report workbook.

    The report date is derived from the file name (via extract_date_stl)
    and stamped on every row together with month, weekday, week number and
    day of month. Rows whose Driver is 'Totals:' are removed, and the
    warehouse/route key ('STL_<RTE>') ends up in a column named 'index'.
    Column names are lower-cased with spaces turned into underscores and
    '#' / '.' removed.

    Parameters
    ----------
    file : path of the workbook to read.
    year : year passed through to extract_date_stl.

    Returns
    -------
    pandas.DataFrame sorted by Stops and TTL Cs/splt, descending.
    '''
    # NOTE(review): 'Date' is converted with datetime.date, which only
    # accepts datetime-like cells -- confirm against the real sheets.
    column_converters = {
        'Date': dt.date, 'Warehouse': str, 'LOC': str, 'RTE': str,
        'Driver': str, 'Truck#': str,
        'Stops': np.float64, 'TTL Cs/splt': np.float64, 'Cs': np.float64,
        'Btls': np.float64,
        'Start Hr': str, 'End Hr': str, 'Ttl Hrs': str, 'Ttl Mi': np.float64,
    }
    try:
        frame = pd.read_excel(file, sheet_name='Production',
                              converters=column_converters)
    except ValueError:
        # A cell could not be converted: let pandas infer the types instead.
        frame = pd.read_excel(file, sheet_name='Production')

    report_date = extract_date_stl(file, year)

    # Stamp the report date and its calendar attributes on every row.
    frame['Date'] = report_date
    calendar_formats = (('Month', '%B'), ('Weekday', '%A'),
                        ('WeekNumber', '%U'), ('DOTM', '%d'))
    for column, fmt in calendar_formats:
        frame[column] = report_date.strftime(fmt)
    frame['Warehouse'] = 'STL'

    output_columns = [
        'Date', 'Warehouse', 'LOC', 'RTE', 'Driver', 'Truck#', 'Stops',
        'TTL Cs/splt', 'Cs', 'Btls', 'Start Hr',
        'End Hr', 'Ttl Hrs', 'Ttl Mi', 'Month', 'Weekday', 'WeekNumber',
        'DOTM',
    ]
    frame = frame[output_columns].drop_duplicates()

    # Index each row by '<warehouse>_<route>'; the key becomes the 'index'
    # column once the index is reset below.
    route_key = frame['Warehouse'].astype(str) + '_' + frame['RTE'].astype(str)
    frame = frame.set_index(route_key)

    frame = frame[frame['Driver'] != 'Totals:']
    frame = frame.sort_values(['Stops', 'TTL Cs/splt'], ascending=False)
    frame = frame.reset_index(drop=False)

    frame['Date'] = frame['Date'].replace(to_replace='NaN', value='')
    frame = frame[frame['Date'].notnull()]

    def normalise(label):
        # 'Truck#' -> 'truck', 'TTL Cs/splt' -> 'ttl_cs/splt', ...
        return str(label).lower().replace(' ', '_').replace('#', '').replace('.', '')

    frame.columns = [normalise(label) for label in frame.columns]

    return frame

In [None]:
# One glob pattern per report year; add an entry for each additional year.
# NOTE(review): the original note said to update the years "like on
# velocity" -- presumably a sibling analysis; confirm they stay in step.
files_2018 = 'N:\\Daily Report\\2018\\*\\*.xls*'
files_2017 = 'N:\\Daily Report\\2017\\*\\*.xls*'
files_2016 = 'N:\\Daily Report\\2016\\*\\*.xls*'

file_list = {'2016': files_2016, '2017': files_2017, '2018': files_2018}

yearly_frames = []
for yr, pattern in file_list.items():
    for file in glob.glob(pattern):
        file_name = str(file)
        # Skip workbook copies and Excel's '~$' temporary lock files.
        if 'copy' in file_name.lower() or '~$' in file_name:
            print('Excluding file:  {}'.format(file))
            continue
        yearly_frames.append(extract_stl_production_tab(file, year=yr))

# Concatenate once at the end: appending inside the loop re-copies the
# accumulated frame on every file, and DataFrame.append no longer exists in
# pandas 2.x. An empty frame is kept when no workbook was found.
stl_production = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

In [None]:
# Preview the first rows of the combined production data.
stl_production.head()

In [None]:
# Order the rows by date, warehouse and route, then index on those same
# columns while keeping them available as ordinary columns (drop=False).
ix_cols = ['date', 'warehouse', 'rte']
stl_production = stl_production.sort_values(ix_cols).set_index(ix_cols, drop=False)
stl_production.head()

In [None]:
# List the distinct values of the 'index' column for rows whose loc is 'COL'.
print(sorted(stl_production.loc[stl_production['loc']=='COL', 'index'].unique()))

In [None]:
# delivery_day is the report date shifted forward by one day.
one_day = pd.to_timedelta(1, unit='d')
stl_production['delivery_day'] = stl_production['date'] + one_day

# Translate weekday numbers (0 = Monday ... 6 = Sunday) into short names.
wday_map = dict(zip(np.arange(0, 7), ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']))
weekday_numbers = stl_production['delivery_day'].apply(lambda day: day.weekday())
stl_production['delivery_weekday'] = weekday_numbers.map(wday_map)
stl_production.head()

In [None]:
# Warehouse/route keys that are flagged as "non-true" routes.
# An earlier, narrower list was ['STL_nan', 'STL_PR', 'STL_90'].
nontrue_routes = [
    'STL_70', 'STL_91', 'STL_PR', 'STL_nan', 'STL_90',
    'STL_93', 'STL_13', 'STL_3', 'STL_25',
]
stl_production['non_true_rtes'] = stl_production['index'].isin(nontrue_routes)

In [None]:
# Collect the distinct rows flagged as non-true routes for a manual check.
check_cols = ['loc', 'rte', 'driver', 'truck', 'stops', 'ttl_cs/splt']
fname = 'N:/Operations Intelligence/Operations Research/Delivery Analysis Post Schlafly/for_bob_to_check.xlsx'

is_flagged = stl_production.non_true_rtes == True
x_check = (
    stl_production.loc[is_flagged, check_cols]
    .reset_index(drop=True)
    .drop_duplicates()
)
# Export is disabled; enable to write the rows to fname:
# x_check.to_excel(fname)
x_check

In [None]:
def stl_daily_route_summary(stl_production):
    '''
    Summarise routes per warehouse, location and date.

    Only rows whose loc is 'STL' or 'COL' and whose non_true_rtes flag is
    False are counted. The input frame is not modified.

    Parameters
    ----------
    stl_production : DataFrame with the columns 'warehouse', 'loc', 'date',
        'rte', 'stops', 'ttl_cs/splt' and 'non_true_rtes'. Its index is
        ignored.

    Returns
    -------
    DataFrame with one row per (warehouse, loc, date) holding the number
    of distinct routes ('rte') and the summed 'stops' and 'ttl_cs/splt'.
    '''
    # Discard the index first: the notebook indexes the frame on
    # date/warehouse/rte with drop=False, which makes those labels both index
    # levels and columns and therefore ambiguous for groupby.
    routes = stl_production.reset_index(drop=True)

    in_market = routes['loc'].isin(['STL', 'COL'])
    is_true_route = routes['non_true_rtes'].eq(False)
    routes = routes.loc[in_market & is_true_route]

    grp_cols = ['warehouse', 'loc', 'date']
    agg_funcs = {'rte': 'nunique', 'stops': 'sum', 'ttl_cs/splt': 'sum'}
    return routes.groupby(grp_cols).agg(agg_funcs).reset_index(drop=False)

# Build the per-day route summary and preview it.
stl_daily = stl_daily_route_summary(stl_production)
stl_daily.head()

In [None]:
# merge in calendar data
def generate_calendar(year, drop_index=False):
    '''
    Generate a daily calendar for one year with US federal holiday flags.

    Parameters
    ----------
    year : the calendar year, as a string or an int.
    drop_index : when False (default) the 'date' column becomes the index;
        when True 'date' stays a regular column and the index is 0..n-1.

    Returns
    -------
    DataFrame with one row per day and the columns 'date', 'month' (full
    name), 'year' (string), 'weekday' (full name), 'is_weekday' (1 for
    Monday-Friday), 'is_holiday' (1 on a US federal holiday) and
    'is_holiday_week' (number of holidays within three days either side).
    '''
    from pandas.tseries.offsets import YearEnd
    from pandas.tseries.holiday import USFederalHolidayCalendar

    start_date = pd.to_datetime('1/1/' + str(year))
    end_date = start_date + YearEnd()

    # Build the calendar with three extra days on each side so the centred
    # 7-day window also sees holidays just across the year boundary (for
    # example New Year's Day when looking at Dec 29-31). The padding is
    # trimmed off again before returning.
    half_window = pd.Timedelta(days=3)
    padded_days = pd.date_range(start_date - half_window,
                                end_date + half_window, freq='D')
    holidays = USFederalHolidayCalendar().holidays(start=padded_days[0],
                                                   end=padded_days[-1])

    cal_df = pd.DataFrame({'date': padded_days})
    cal_df['month'] = cal_df['date'].dt.strftime('%B')
    cal_df['year'] = cal_df['date'].dt.strftime('%Y')
    cal_df['weekday'] = cal_df['date'].dt.strftime('%A')
    cal_df['is_weekday'] = (cal_df['date'].dt.dayofweek < 5).astype(int)
    cal_df['is_holiday'] = cal_df['date'].isin(holidays).astype(int)
    holiday_window = cal_df['is_holiday'].rolling(window=7, center=True, min_periods=1)
    cal_df['is_holiday_week'] = holiday_window.sum().astype(int)

    in_year = cal_df['date'].between(start_date, end_date)
    cal_df = cal_df.loc[in_year].reset_index(drop=True)

    if not drop_index:
        cal_df = cal_df.set_index('date')

    return cal_df

def make_calendars(year_list, drop_index):
    '''
    Stack the calendars of several years into one frame.

    Parameters
    ----------
    year_list : iterable of years, each passed to generate_calendar.
    drop_index : forwarded to generate_calendar for every year.

    Returns
    -------
    DataFrame with the yearly calendars in the order given; each year's own
    index is kept as is. An empty DataFrame when year_list is empty.
    '''
    calendars = [generate_calendar(year, drop_index=drop_index) for year in year_list]
    if not calendars:
        return pd.DataFrame()
    # A single concat replaces repeated DataFrame.append calls, which were
    # removed in pandas 2.0.
    return pd.concat(calendars)

# Build one combined calendar for 2016-2018; drop_index=True keeps 'date'
# as a regular column rather than the index.
year_list = ['2016', '2017', '2018']
cal_df = make_calendars(year_list, drop_index=True)
cal_df.head()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
fig, axes = plt.subplots(1, 2, figsize=(17, 6))
for i, loc in enumerate(['STL', 'COL']):
    _df = stl_daily.loc[stl_daily['loc'] == loc]
    ax = axes[i]
    ax.plot(_df['date'], _df['rte'])
    ax.set_title('Routes per Day for {}'.format(loc))
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Market Routes')
    ax.grid(alpha=.4)
    for tick in ax.get_xticklabels():
        tick.set_rotation(90)
plt.show()

fig, axes = plt.subplots(1, 2, figsize=(17, 6))
for i, loc in enumerate(['STL', 'COL']):
    _df = stl_daily.loc[stl_daily['loc'] == loc]
    ax = axes[i]
    ax.hist(_df['rte'], bins=_df['rte'].max()-_df['rte'].min())
    ax.axvline(_df['rte'].mean(), linestyle='--', color='r')
    ax.axvline(_df['rte'].mean()-_df['rte'].std(), linestyle='-.', color='y')
    ax.axvline(_df['rte'].mean()+_df['rte'].std(), linestyle='-.', color='y')
    ax.set_title('Histogram of Routes per Day for {}'.format(loc))
    ax.set_xlabel('Count of Routes per Day')
    ax.set_ylabel('Number of Observations')
    ax.grid(alpha=.4)

plt.show()

In [None]:
# Inspect the dtype of each column in the production frame.
stl_production.dtypes

# Delivery Equipment Leasing 2016-2018

In [None]:
# IPython help lookup for ExcelFile.parse (used in the next cell).
?pd.ExcelFile.parse

In [None]:
# Load the 'All Data' sheet of the equipment leasing workbook, skipping the
# first four rows of the sheet.
base_dir = 'N:\\Operations Intelligence\\Operations Research\\Delivery Analysis Post Schlafly\\'
equip_leasing_xlsx = pd.ExcelFile(base_dir + 'Delivery Equipment Leasing  2016-2018.xlsx')
rentals = equip_leasing_xlsx.parse('All Data', skiprows=4)

# Replace the numeric location codes with city names and parse the dates.
location_names = {1: 'Kansas City', 2: 'Saint Louis', 3: 'Columbia', 4: 'Springfield'}
rentals['Location'] = rentals['Location'].map(location_names)
rentals['Date'] = rentals['Date'].apply(pd.to_datetime)
rentals.head()

In [None]:
# Distinct values found in the Journal column.
rentals.Journal.unique()

In [None]:
# Per location and date: total Amount and number of distinct Reference values.
rentals.groupby(['Location', 'Date']).agg({'Amount': np.sum, 'Reference': pd.Series.nunique})

In [None]:
rentals_byday = 