In [69]:
import pandas as pd
import numpy as np

from sklearn.metrics import fbeta_score

pd.options.display.max_colwidth=100
pd.options.display.max_columns=300

employees = pd.read_csv("data/employees.csv")
history = pd.read_csv("data/history.csv")
submission = pd.read_csv("data/submission.csv")

# history.loc[:,'Date'] = list(pd.to_datetime(history['Date']))

def get_month(text):
    if type(text) == str:
        numbers = text.split("/")
        return int(numbers[0])

def get_year(text):
    if type(text) == str:
        numbers = text.split("/")
        return int(numbers[-1])



# for data labeling
def label_df(data, n_month=3):
    labels = []
    for emp in data.EmployeeID.unique():
        curr_emp = list(data[data.EmployeeID == emp]['DismissalDate'])
        len_emp = len(curr_emp)
        if pd.isnull(curr_emp[0]):
            labels += [0 for _ in range(len_emp - n_month)] + [2 for _ in range(n_month)]
        else:
            labels += [0 for _ in range(len_emp - n_month)] + [1 for _ in range(n_month)]
    return labels


In [70]:
df = history.merge(employees)
lbls = label_df(df)

In [71]:
df["target"] = lbls
df['Date'] = list(pd.to_datetime(df['Date']))

In [72]:
df = df.sort_values("Date")

In [73]:
X_test = df[df.EmployeeID.isin(set(submission.EmployeeID))]
X_test = X_test.drop_duplicates(subset=["EmployeeID"], keep='last')
len(X_test)

4156

In [74]:
X_test_Employee = X_test["EmployeeID"]

In [75]:
test_index = X_test.index
train_index = list(set(list(df.index)).difference(set(list(test_index))))
X_train = df.loc[train_index, :]

X_train = X_train[X_train.target != 2]

In [76]:
X_train.head(2)

Unnamed: 0,EmployeeID,Date,DevCenterID,SBUID,PositionID,PositionLevel,IsTrainee,LanguageLevelID,CustomerID,ProjectID,IsInternalProject,Utilization,HourVacation,HourMobileReserve,HourLockedReserve,OnSite,CompetenceGroupID,FunctionalOfficeID,PaymentTypeId,BonusOneTime,APM,WageGross,MonthOnPosition,MonthOnSalary,HiringDate,DismissalDate,target
0,00116D71-E87D-4B64-A566-1F29B2A798A8,2017-07-01,3,292,70,2,0,11,893EA22F-08BE-4F11-AD93-C50746E4565F,7F97465B-ED3C-45DB-BE96-86C7E493F0CD,0,0.7619,40,0,0,0,15,1,9,0,39,0.887446,1,1,6/1/13,,0
1,00116D71-E87D-4B64-A566-1F29B2A798A8,2017-08-01,3,332,70,2,0,11,893EA22F-08BE-4F11-AD93-C50746E4565F,7F97465B-ED3C-45DB-BE96-86C7E493F0CD,0,1.0,0,0,0,0,15,1,9,200,28,0.887446,2,2,6/1/13,,0


In [77]:
columns_to_drop = ['EmployeeID', 'HiringDate', 'DismissalDate']

cat_columns = ['DevCenterID', 'SBUID', 'PositionID', 'PositionLevel', 
               'IsTrainee', 'LanguageLevelID', 'CustomerID', 'ProjectID', 
               'IsInternalProject', 'OnSite', 'CompetenceGroupID', 'FunctionalOfficeID',
               'PaymentTypeId']

In [78]:
X_train = X_train.drop(columns_to_drop, axis = 1)
X_test = X_test.drop(columns_to_drop, axis = 1)

In [79]:
X_test = X_test.drop("target", axis=1)

In [80]:
y = X_train["target"]

In [81]:
X_train = X_train.drop(columns=["Date", "target"])

In [82]:
from category_encoders.basen import BaseNEncoder
encoder = BaseNEncoder(cols = cat_columns, base = 2)

X = encoder.fit_transform(X_train)

In [83]:
# y = X["target"]
# X = X.drop("target", axis=1)
#X = X.drop("Date", axis=1)

### Try XGBoost

In [84]:
71647/(71647+3051)

0.9591555329459959

In [85]:
y.value_counts()

0    71647
1     3051
Name: target, dtype: int64

In [86]:
import xgboost as xgb

In [87]:
xg_clas = xgb.XGBClassifier(scale_pos_weight=96)

In [88]:
from sklearn.metrics import fbeta_score

In [104]:
import random
from sklearn.model_selection import KFold

RANDOM_SEED = 42

kf = KFold(n_splits=4, random_state=RANDOM_SEED, shuffle=True)

waccs = []
fold = 1
for train_index, test_index in kf.split(X):
    print('FOLD #{}'.format(fold))
    fold += 1
    X_train_, X_val = X.loc[X.index.intersection(train_index)], X.loc[X.index.intersection(test_index)]
    y_train_, y_val = y.loc[y.index.intersection(train_index)], y.loc[y.index.intersection(test_index)]

    xg_clas = xgb.XGBClassifier(
        learning_rate =0.1, 
        n_estimators=1000,
        gamma=0.4,
        subsample=0.55,
        colsample_bytree=0.85,
        max_depth = 3,
        min_child_weight = 5,
        objective= 'binary:logistic', nthread=10,scale_pos_weight=95,seed=27
    )
    
    xg_clas.fit(X_train_,y_train_)
    
    y_pred = xg_clas.predict(X_val)

    score = fbeta_score(y_val, y_pred, beta=1.7)
    print(score)
    waccs.append(score)

FOLD #1
0.31616154062150326
FOLD #2
0.29567554488025144
FOLD #3
0.28340061308062603
FOLD #4
0.29224142421974175


In [90]:
X_train.head(2)

Unnamed: 0,DevCenterID,SBUID,PositionID,PositionLevel,IsTrainee,LanguageLevelID,CustomerID,ProjectID,IsInternalProject,Utilization,HourVacation,HourMobileReserve,HourLockedReserve,OnSite,CompetenceGroupID,FunctionalOfficeID,PaymentTypeId,BonusOneTime,APM,WageGross,MonthOnPosition,MonthOnSalary
0,3,292,70,2,0,11,893EA22F-08BE-4F11-AD93-C50746E4565F,7F97465B-ED3C-45DB-BE96-86C7E493F0CD,0,0.7619,40,0,0,0,15,1,9,0,39,0.887446,1,1
1,3,332,70,2,0,11,893EA22F-08BE-4F11-AD93-C50746E4565F,7F97465B-ED3C-45DB-BE96-86C7E493F0CD,0,1.0,0,0,0,0,15,1,9,200,28,0.887446,2,2


In [94]:
X_test = X_test.drop("Date", axis=1)

In [95]:
X_test_encode = encoder.transform(X_test)

In [105]:
xg_clas = xgb.XGBClassifier(
        learning_rate =0.1, 
        n_estimators=1000,
        gamma=0.4,
        subsample=0.55,
        colsample_bytree=0.85,
        max_depth = 3,
        min_child_weight = 5,
        objective= 'binary:logistic', nthread=10,scale_pos_weight=95,seed=27
    )

In [106]:
xg_clas.fit(X,y)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.85, gamma=0.4, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=5, missing=nan, monotone_constraints=None,
       n_estimators=1000, n_jobs=10, nthread=10, num_parallel_tree=1,
       objective='binary:logistic', random_state=27, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=95, seed=27, subsample=0.55,
       tree_method=None, validate_parameters=False, verbosity=None)

In [107]:
y_pred = xg_clas.predict(X_test_encode)

In [108]:
pd.Series(y_pred).value_counts()

1    2854
0    1302
dtype: int64

In [109]:
subm = pd.DataFrame({"EmployeeID": X_test_Employee, "target":y_pred})

In [110]:
subm.to_csv("subm_xgbost_2.csv", index=None)

In [16]:
#!pip install category_encoders --user