In [83]:
import pandas as pd
import numpy as np
import os
import string
import glob

# Use the fully qualified option name: the bare 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', 300)

# Day Hours & Productivity

Semi-structured Excel files are automatically emailed to `paul.washburn@majorbrands.com` on the 15th and last day of each month. Each file contains all hours for every worker in the warehouse. Note that the code below reads these exports as `.csv` files from `base_dir`.

In [86]:
from datetime import datetime as dt

# Folder that is searched for the exported '*.csv' files. The path is
# hard-coded to one user's Desktop, so change it when running elsewhere.
base_dir = 'C:/users/pmwash/Desktop/Re-Engineered Reports/Day Hours/'

def drop_unnecessary_characters(str_list):
    '''
    Normalise a sequence of labels into snake-case-like strings.

    Each item is converted with ``str`` (so numbers and NaN become their
    string form), lower-cased, has spaces turned into underscores, and
    finally has every '-_' pair removed.  A new list is returned; the
    input is left untouched.
    '''
    cleaned = []
    for item in str_list:
        label = str(item).lower()
        label = label.replace(' ', '_')
        # 'a - b' has become 'a_-_b' at this point; dropping '-_' gives 'a_b'
        cleaned.append(label.replace('-_', ''))
    return cleaned

def replace_unnamed_and_nans(col_list):
    '''
    Tidy combined 'group|measure' column names.

    Each entry is converted with ``str`` and handled by the first rule that
    matches:

    * contains '|nan'      -> the '|nan' text is removed
    * contains 'unnamed:_' -> the placeholder and the digits that directly
      follow it are replaced by the most recent named group, e.g.
      'unnamed:_5|hours' after 'regular|days' becomes 'regular|hours'
    * anything else        -> kept as is; the text before the first '|'
      becomes the current group for later placeholders

    A placeholder that appears before any named group is left unchanged
    (there is nothing to substitute yet).

    Returns a new list with the same length and order as ``col_list``.
    '''
    placeholder = 'unnamed:_'
    new_col_list = []
    col_group = None  # no named group has been seen yet
    for col in col_list:
        col = str(col)
        if '|nan' in col:
            new_col_list.append(col.replace('|nan', ''))
        elif placeholder in col:
            if col_group is None:
                # nothing to inherit from; keep the raw name rather than fail
                new_col_list.append(col)
                continue
            head, _, tail = col.partition(placeholder)
            # strip only the position number right after the placeholder,
            # so digits in the group name or the measure are preserved
            new_col_list.append(head + col_group + tail.lstrip('0123456789'))
        else:
            new_col_list.append(col)
            col_group = col.split('|')[0]
    return new_col_list

def preprocess_hr_data(file_path):
    '''
    Accepts path to the export from ADP from HR which is emailed
    twice per month, and returns it as a cleaned, indexed DataFrame.

    Parameters
    ----------
    file_path : str
        Path to one CSV export.  The file name must contain 'Worked '
        followed by a date written as MMDDYYYY and then ' - '; that date
        is stored as ``starting_date``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (starting_date, labor_level_selected, employee_id).
        Every remaining column has been converted to float32.

    Raises
    ------
    IndexError
        If the file name does not contain 'Worked '.
    ValueError
        If the date in the file name cannot be parsed, or a value is still
        non-numeric after the symbols are removed.
    '''
    # the first 8 lines of the file are skipped; the next one is the header
    df = pd.read_csv(file_path, skiprows=8)

    # clean up column names: the header is combined with the first data row
    # (which acts as a second header line) into 'group|measure' names
    col_specifier = drop_unnecessary_characters(df.loc[0])
    df.loc[0] = col_specifier
    df.columns = drop_unnecessary_characters(df.columns)
    col_list = [a + '|' + b for a, b in zip(df.columns, col_specifier)]
    df.columns = replace_unnamed_and_nans(col_list)
    df.drop(index=0, inplace=True)

    # set data types to numeric after removing miscellaneous symbols;
    # '(123)' style negatives become '-123'
    non_numeric_cols = ['labor_level_selected', 'employee_id', 'employee_name']
    numeric_cols = [col for col in df.columns if col not in non_numeric_cols]
    symbol_replacements = [('$', ''), (',', ''), ('(', '-'), (')', '')]
    for col in numeric_cols:
        cleaned = df[col]
        for symbol, replacement in symbol_replacements:
            # regex=False: '$', '(' and ')' are regex metacharacters and must
            # be matched literally whatever the pandas default happens to be
            cleaned = cleaned.str.replace(symbol, replacement, regex=False)
        df[col] = cleaned.astype(np.float32)

    # capture date from the file name (not the full path, so a folder whose
    # name contains 'Worked ' cannot confuse the split)
    dat = os.path.basename(file_path).split('Worked ')[1]
    df['starting_date'] = dt.strptime(dat.split(' - ')[0], '%m%d%Y')

    # set indices
    non_numeric_cols = ['starting_date'] + non_numeric_cols
    df.set_index(non_numeric_cols, inplace=True)
    df.index = df.index.droplevel('employee_name') #drop names for privacy

    return df

# Parse every export found in base_dir and combine them with one concat.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
file_list = glob.glob(base_dir + '*.csv')
hr_frames = [preprocess_hr_data(path) for path in file_list]

# sort=True keeps the columns in alphabetical order, as in the output
# recorded below; an empty folder still yields an empty frame rather than
# a "No objects to concatenate" error.
ops_hours_df = pd.concat(hr_frames, sort=True) if hr_frames else pd.DataFrame()

print(ops_hours_df.columns)
print(ops_hours_df.head())

Index(['absence_no_pay|days', 'absence_no_pay|hours', 'absence_no_pay|money',
       'absence_no_pay|wages', 'birthday|days', 'birthday|hours',
       'birthday|money', 'birthday|wages', 'doubletime|days',
       'doubletime|hours', 'doubletime|money', 'doubletime|wages',
       'kc_personal_day|days', 'kc_personal_day|hours',
       'kc_personal_day|money', 'kc_personal_day|wages', 'kc_sick|days',
       'kc_sick|hours', 'kc_sick|money', 'kc_sick|wages', 'no_pay_hours|days',
       'no_pay_hours|hours', 'no_pay_hours|money', 'no_pay_hours|wages',
       'overtime|days', 'overtime|hours', 'overtime|money', 'overtime|wages',
       'personal_day|days', 'personal_day|hours', 'personal_day|money',
       'personal_day|wages', 'pto|days', 'pto|hours', 'pto|money', 'pto|wages',
       'regular|days', 'regular|hours', 'regular|money', 'regular|wages',
       'total|days', 'total|hours', 'total|money', 'total|wages',
       'unnamed:_10', 'vacation|days', 'vacation|hours', 'vacation|money',
 

In [93]:
# Show the distinct values of the 'labor_level_selected' index level.
ops_hours_df.index.get_level_values('labor_level_selected').unique()

Index(['/50/5220////', '/50/6502////', '/50/6513////', '/70/5220////',
       '/70/7202////', '/70/7214////', '/70/7201////'],
      dtype='object', name='labor_level_selected')