In [57]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn import tree 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [107]:
df = pd.read_csv("classification.csv",index_col=[0])

In [108]:
df.head()

Unnamed: 0,EXCEPTION_HOURS,SHIFT_DATE,WEEKDAY,NOTICE,EARNING_CATEGORY,JOB_FAMILY,SITE,MONTH
1,6.0,2018-04-16,1,306,Miscellaneous Straight-Time,1,4,4
2,11.0,2018-09-25,2,6,Relief Not Needed,1,1,9
3,11.25,2017-04-14,5,2734,Regular Relief Utilized,1,1,4
4,11.25,2016-04-24,7,58,Overtime,1,1,4
5,7.5,2013-12-23,1,10197,Relief Not Needed,2,1,12


In [113]:
# split data to train and validation
train = df[(df["SHIFT_DATE"]>"2012-12-31") & (df["SHIFT_DATE"]<"2017-01-01")]
val = df[(df["SHIFT_DATE"]>"2016-12-31") & (df["SHIFT_DATE"]<"2018-01-01")]

# Prepare data for model fitting
feature_cols = ["EXCEPTION_HOURS","WEEKDAY","NOTICE","SITE","MONTH","JOB_FAMILY"]
X = train.loc[:, feature_cols]
y = train.EARNING_CATEGORY
X1_train, X1_test, y1_train, y1_test = train_test_split(X, y, test_size=0.2)

In [119]:
# fit decision tree and random forest
DT = tree.DecisionTreeClassifier()
DT.fit(X1_train, y1_train)
RF = RandomForestClassifier(max_depth=10)
RF.fit(X1_train,y1_train)
print("Decision Tree Training Score:", round(DT.score(X1_train,y1_train),3))
print("Decision Tree Test Score:", round(DT.score(X1_test,y1_test),3))
print("Random Forest Training Score:", round(RF.score(X1_train,y1_train),3))
print("Random Forest Test Score:", round(RF.score(X1_test,y1_test),3))



Decision Tree Training Score: 0.891
Decision Tree Test Score: 0.348
Random Forest Training Score: 0.434
Random Forest Test Score: 0.427


In [115]:
# prepare for result dataframe
predictions_DT = DT.predict(X1_train)
predictions_RF = RF.predict(X1_train)

In [116]:
pred_dict = X1_train.copy()
pred_dict['EARN_CATEGORY'] = y1_train
pred_dict['DECISION_TREE'] = predictions_DT
pred_dict['RANDOM_FOREST'] = predictions_RF
result = pd.DataFrame(pred_dict)
result

Unnamed: 0,EXCEPTION_HOURS,WEEKDAY,NOTICE,SITE,MONTH,JOB_FAMILY,EARN_CATEGORY,DECISION_TREE,RANDOM_FOREST
283320,6.50,1,10,1,3,1,FT Employee Moved - Straight-Time,FT Employee Moved - Straight-Time,FT Employee Moved - Straight-Time
424071,7.00,1,12,1,2,1,Overtime,Overtime,Overtime
58006,7.50,5,9,2,6,1,Casual at Straight-Time,Casual at Straight-Time,Casual at Straight-Time
389983,11.25,5,1260,1,9,1,Casual at Straight-Time,Casual at Straight-Time,Casual at Straight-Time
330505,11.25,5,2686,1,2,1,FT Employee Moved - Straight-Time,FT Employee Moved - Straight-Time,FT Employee Moved - Straight-Time
274582,11.25,2,3,1,1,1,Relief Not Found,Overtime,Overtime
151241,11.25,4,1749,1,1,1,Relief Not Needed,Relief Not Needed,Relief Not Needed
299044,7.50,1,1159,1,1,1,Casual at Straight-Time,Casual at Straight-Time,Casual at Straight-Time
504902,11.00,4,5,1,8,1,Casual at Straight-Time,Relief Not Needed,Relief Not Needed
450851,7.50,4,665,1,6,2,Regular Relief Utilized,Regular Relief Utilized,Regular Relief Utilized


In [118]:
DT.feature_importances_

array([0.10236146, 0.14435154, 0.55180706, 0.02612623, 0.13773487,
       0.03761885])

In [117]:
RF.feature_importances_

array([0.13394245, 0.06167647, 0.64139638, 0.02189762, 0.10970802,
       0.03137906])