In [154]:
import pandas as pd
import numpy as np
from sklearn import tree 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
import datetime
import warnings
warnings.filterwarnings("ignore")

# Usage Example

1.User read data for training:

`raw_data = pd.read_csv("user_file_path",index_col=[0])`

2.User read data for prediction:

`expetion_data = pd.read_csv("user_file_path",index_col=[0])`

3.User do training data wrangling and prediction data wrangling:

`train = training_wrangling(raw_data)`

`test = prediction_wrangling(expetion_data)`

4.User do predict exception using `train` and `test`, then can see the result dataframe:

`exception_prediction(train,test)`

In [140]:
# read dataframe
raw_data = pd.read_csv("exception_hours.csv",index_col=[0])
training = raw_data[(raw_data["SHIFT_DATE"]>"2012-12-31") & (raw_data["SHIFT_DATE"]<"2018-01-01")]
test = raw_data[(raw_data["SHIFT_DATE"]>"2018-12-31") & (raw_data["SHIFT_DATE"]<"2020-01-01")]

  mask |= (ar1 == a)


# Data Wrangling for Training Set 

In [141]:
def training_wrangling(dataframe):
    '''
    Given a dataframe, wrangling to format using for training the model.
    
    Paramenters
    -----------
    dataframe: dataframe
        Raw data 
        
    Returns
    -------
    dataframe
        One for natural prediction, one for conservative prediction
    '''
    df = dataframe.copy()
    
    # filter target group: Nurse
    df = df[df["LABOR_AGREEMENT"]=="NURS"]

    # filter the 6 site the client suggested
    df = df[(df["SITE"]=="St Paul's Hospital") |
            (df["SITE"]=="Mt St Joseph") |
            (df["SITE"]=="Holy Family") |
            (df["SITE"]=="SVH Langara") |
            (df["SITE"]=="Brock Fahrni") |
            (df["SITE"]=="Youville Residence")]
    
    # filter NaN in MIN_CALL_TO_MAX_CALL_MINUTES and EXCEPTION_CREATION_TO_SHIFTSTART_MINUTES
    df = df[pd.notnull(df["MIN_CALL_TO_MAX_CALL_MINUTES"])]
    df = df[pd.notnull(df["EXCEPTION_CREATION_TO_SHIFTSTART_MINUTES"])]

    # filter MIN_CALL_TO_MAX_CALL_MINUTES > 0 which means the call time is after exception creation time
    df = df[(df["MIN_CALL_TO_MAX_CALL_MINUTES"] >= 0)]

    # filter EXCEPTION_CREATION_TO_SHIFTSTART_MINUTES < 0 which means the creation time is ahead of shift start
    df = df[(df["EXCEPTION_CREATION_TO_SHIFTSTART_MINUTES"] <= 0)]

    # create NOTICE = EXCEPTION_CREATION_TO_MAXCALL_MINUTES - MIN_CALL_TO_MAX_CALL_MINUTES 
    #               =EXCEPTION_CREATION_TO_MINCALL_MINUTES
    df["NOTICE_"] = df["EXCEPTION_CREATION_TO_MAXCALL_MINUTES"] + df["MIN_CALL_TO_MAX_CALL_MINUTES"]

    # create a column called WEEKDAY to indicate if the date is weekday or not.
    df["WEEKDAY_"] = 0
    df["WEEKDAY_"][(pd.to_datetime(df['SHIFT_DATE']).dt.weekday >= 1) & 
                  (pd.to_datetime(df['SHIFT_DATE']).dt.weekday <= 5)] = 1


    # create a column to indicate the month of the SHIFT_DATE
    df["MONTH_"] = pd.to_datetime(df['SHIFT_DATE']).dt.month

    # create a column to indicate the SHIFT of the exception
    df["START_TIME"] = pd.to_datetime(df["START_TIME"])
    df["SHIFT_"] = 3
    df["SHIFT_"][(df["START_TIME"] >= "06:00:00") & (df["START_TIME"] < "13:00:00")] = 1
    df["SHIFT_"][(df["START_TIME"] >= "13:00:00") & (df["START_TIME"] < "19:00:00")] = 2

    # convert "EXCEPTION_GROUP","PROGRAM","SITE","DEPARTMENT" value from str to numeric for randomforest model
    le = preprocessing.LabelEncoder()
    df["EXCEPTION_GROUP_"] = le.fit_transform(df["EXCEPTION_GROUP"])                     
    df["PROGRAM_"] = le.fit_transform(df["PROGRAM"])
    df["SITE_"] = le.fit_transform(df["SITE"])
    df["JOB_FAMILY_"] = le.fit_transform(df["JOB_FAMILY"])
    df["SUB_PROGRAM_"] = le.fit_transform(df["SUB_PROGRAM"])
    df["DEPARTMENT_"] = le.fit_transform(df["DEPARTMENT"])
    return df

In [None]:
#df2 = training_wrangling(test)
#df2.head(20)

# Data Wrangling for Prediction Set

In [156]:
def prediction_wrangling(dataframe):
    '''
    Given a dataframe using for prediction, wrangling to format using for training the model.
    
    Paramenters
    -----------
    dataframe: dataframe
        Raw data of prediction, which should not contains "EARNING_CATEGORY"
        
    Returns
    -------
    dataframe
        Dataframe that contains converted value
        
    '''
    df = dataframe.copy()
    
    # if the dataframe contains EARNING_CATEGORY, drop the column
    if 'EARNING_CATEGORY' in df.columns:
        df = df.drop('EARNING_CATEGORY', 1)
    
    # filter target group: Nurse
    df = df[df["LABOR_AGREEMENT"]=="NURS"]

    # filter the 6 site the client suggested
    df = df[(df["SITE"]=="St Paul's Hospital") |
            (df["SITE"]=="Mt St Joseph") |
            (df["SITE"]=="Holy Family") |
            (df["SITE"]=="SVH Langara") |
            (df["SITE"]=="Brock Fahrni") |
            (df["SITE"]=="Youville Residence")]
    
    # filter NaN in MIN_CALL_TO_MAX_CALL_MINUTES and EXCEPTION_CREATION_TO_SHIFTSTART_MINUTES
    df["NOTICE_"] = pd.to_datetime(df["EXCEPTION_CREATION_DATE"]).map(lambda x:int((x - datetime.datetime.now()).total_seconds()/60))

    # create a column called WEEKDAY to indicate if the date is weekday or not.
    df["WEEKDAY_"] = 0
    df["WEEKDAY_"][(pd.to_datetime(df['SHIFT_DATE']).dt.weekday >= 1) & 
                  (pd.to_datetime(df['SHIFT_DATE']).dt.weekday <= 5)] = 1


    # create a column to indicate the month of the SHIFT_DATE
    df["MONTH_"] = pd.to_datetime(df['SHIFT_DATE']).dt.month

    # create a column to indicate the SHIFT of the exception
    df["START_TIME"] = pd.to_datetime(df["START_TIME"])
    df["SHIFT_"] = 3
    df["SHIFT_"][(df["START_TIME"] >= "06:00:00") & (df["START_TIME"] < "13:00:00")] = 1
    df["SHIFT_"][(df["START_TIME"] >= "13:00:00") & (df["START_TIME"] < "19:00:00")] = 2

    # convert "EXCEPTION_GROUP","PROGRAM","SITE","DEPARTMENT" value from str to numeric for randomforest model
    le = preprocessing.LabelEncoder()
    df["EXCEPTION_GROUP_"] = le.fit_transform(df["EXCEPTION_GROUP"])                     
    df["PROGRAM_"] = le.fit_transform(df["PROGRAM"])
    df["SITE_"] = le.fit_transform(df["SITE"])
    df["JOB_FAMILY_"] = le.fit_transform(df["JOB_FAMILY"])
    df["SUB_PROGRAM_"] = le.fit_transform(df["SUB_PROGRAM"])
    df["DEPARTMENT_"] = le.fit_transform(df["DEPARTMENT"])
    return df

In [157]:
# df3 = prediction_wrangling(test)
# df3.head(20)

# Exception Prediction

In [226]:
CAT_1 = ["Regular Relief Utilized",
         "Casual at Straight-Time",
         "PT Over FTE",
         "Miscellaneous Straight-Time",
         "PT Employee Moved - Straight-Time",
         "FT Employee Moved - Straight-Time"]
CAT_2 = ["Overtime",
         "Relief Not Found",
         "Agency",
         "Insufficient Notice",
         "On-Call"]

# create function for grouping label
def replace_str(string):
    if string in CAT_1:
        return string.replace(string, "Straight Time")
    elif string in CAT_2:
        return string.replace(string, "Overtime and Beyond")
    else:
        return string

def exception_prediction(dataframe1,dataframe2):
    '''
    Given training dataframe and prediction dataframe doing prediction.
    
    Paramenters
    -----------
    dataframe1: 
        dataframe wrangled by training_wrangling
    
    dataframe2: 
        dataframe wrangled by prediction_wrangling
        
    Returns
    -------
    dataframe
        Contains prediction result and Suggestion
    '''
    # create dataframe for Natural Prediction and Conservative Prediction
    natural_df = dataframe1.copy()
    natural_df["EARNING_CATEGORY"] = natural_df["EARNING_CATEGORY"].apply(replace_str)


    conserv_df = natural_df.drop(index=natural_df[(natural_df["EARNING_CATEGORY"].str.contains("Relief Not Needed"))].index)
    
    # Prepare data for model fitting
    feature_cols = ["EXCEPTION_HOURS",
                    "EXCEPTION_CREATION_TO_SHIFTSTART_MINUTES",
                    "SITE_",
                    "EXCEPTION_GROUP_",
                    "PROGRAM_",
                    "MONTH_",
                    "SUB_PROGRAM_",
                    "DEPARTMENT_",
                    "NOTICE_",
                    "SHIFT_"]
    X_nat = natural_df.loc[:, feature_cols]
    y_nat = natural_df.EARNING_CATEGORY
    X_con = conserv_df.loc[:, feature_cols]
    y_con = conserv_df.EARNING_CATEGORY

    # build random forest model and test
    RF_nat = RandomForestClassifier(n_estimators=270, 
                                max_depth=15,
                                min_samples_split=6,
                                min_samples_leaf = 7)
    RF_con = RF_nat
    RF_nat.fit(X_nat,y_nat)
    RF_con.fit(X_con,y_con)
    print("Natural Prediction Training Score:", round(RF_nat.score(X_nat,y_nat),3))
    print("Conservative Prediction Training Score:", round(RF_con.score(X_con,y_con),3))

    # create result dataframe
    pred_dict = dataframe2.copy()
    pred_dict['NATURAL_PREDICTION'] = RF_nat.predict(dataframe2.loc[:,feature_cols])
    pred_dict['CONSERVATIVE_PREDICTION'] = RF_con.predict(dataframe2.loc[:,feature_cols])
    tmp = []
    for i in range(len(pred_dict)):
        if pred_dict.iloc[i]['NATURAL_PREDICTION'] == pred_dict.iloc[i]['CONSERVATIVE_PREDICTION']:
            tmp.append(pred_dict.iloc[i]['CONSERVATIVE_PREDICTION'])
        else:
            tmp.append("Needs HR's Judgement")
    pred_dict["RESPONSE_ALERT"] = tmp
    result = pred_dict.drop(["SITE_", "EXCEPTION_GROUP_", "PROGRAM_", "MONTH_", "SUB_PROGRAM_", "DEPARTMENT_", "NOTICE_", "WEEKDAY_","JOB_FAMILY_"], axis=1)
    return result

In [224]:
df4 =exception_prediction(df2,df3)
df4.head()

Natural Prediction Training Score: 0.886
Conservative Prediction Training Score: 0.915


Unnamed: 0_level_0,PCN,EXCEPTION_REASON,EXCEPTION_GROUP,EXCEPTION_HOURS,EXCEPTION_CREATION_DATE,MASKED_REPLACED_EMPLID,LABOR_AGREEMENT,UNION_CD,JOB_FAMILY,JOB_FAMILY_DESCRIPTION,...,EARL_NOTIFICATION_TO_FILL_MINUTES,EARL_NOTIFICATION_TO_SHIFTSTART_MINUTES,FILL_TO_SHIFTSTART_MINUTES,MASKED_SCHEDULER_EMPLID,SCHEDULER_JOBCODE,SHIFT_,JOB_FAMILY_,NATURAL_PREDICTION,CONSERVATIVE_PREDICTION,RESPONSE_ALERT
EXCEPTIONID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11506356,,RWB- GRTW with benefits - AJ- Schedule Adjustment,Work Related Injury,6.0,2018-04-03 13:00:37.427,BOOK-ON,NURS,BCNU-Nur,DC1000,Registered Nurse-DC1,...,0,0,0,,,1,4,Straight Time,Straight Time,Straight Time
11884013,6626.0,PSK- Sick Lv,Paid Sick,11.0,2018-09-25 14:27:06.163,8709029,NURS,HSA-Nur,DC1000,Registered Nurse-DC1,...,0,0,0,,,3,4,Overtime and Beyond,Overtime and Beyond,Overtime and Beyond
11878477,5460.0,Swap shifts,Swap,7.5,2018-09-24 13:14:55.377,8706217,NURS,BCNU-Nur,DC1000,Registered Nurse-DC1,...,0,0,0,,,1,4,Straight Time,Straight Time,Straight Time
12088498,11823.0,Swap shifts,Swap,11.0,2018-12-07 13:17:08.120,8708963,NURS,BCNU-LPNFc,LPN001,Licensed Practical Nurse,...,0,0,0,,,1,10,Straight Time,Straight Time,Straight Time
11848782,11806.0,PSK- Sick Lv,Paid Sick,11.0,2018-09-13 10:01:37.393,8703044,NURS,BCNU-LPNFc,LPN001,Licensed Practical Nurse,...,0,-11349,0,828348.0,900963.0,1,10,Straight Time,Straight Time,Straight Time
