In [3]:
import pandas as pd
import numpy as np

from sklearn.metrics import fbeta_score

pd.options.display.max_colwidth=100
pd.options.display.max_columns=300

employees = pd.read_csv("../data/employees.csv")
history = pd.read_csv("../data/history.csv")
submission = pd.read_csv("../data/submission.csv")

history.loc[:,'Date'] = list(pd.to_datetime(history['Date']))

def get_month(text):
    if type(text) == str:
        numbers = text.split("/")
        return int(numbers[0])

def get_year(text):
    if type(text) == str:
        numbers = text.split("/")
        return int(numbers[-1])

df = history.merge(employees)

# for data labeling
def label_df(data, n_month=3):
    labels = []
    for emp in data.EmployeeID.unique():
        curr_emp = list(data[data.EmployeeID == emp]['DismissalDate'])
        len_emp = len(curr_emp)
        if pd.isnull(curr_emp[0]):
            labels += [0 for _ in range(len_emp - n_month)] + [2 for _ in range(n_month)]
        else:
            labels += [0 for _ in range(len_emp - n_month)] + [1 for _ in range(n_month)]
    return labels

lbls = label_df(df)

In [4]:
df['target'] = lbls
test = df[df.target==2]
df = df[df.target!=2]

In [5]:
columns_to_drop = ['HiringDate', 'DismissalDate']

cat_columns = ['DevCenterID', 'SBUID', 'PositionID', 'PositionLevel', 
               'IsTrainee', 'LanguageLevelID', 'CustomerID', 'ProjectID', 
               'IsInternalProject', 'OnSite', 'CompetenceGroupID', 'FunctionalOfficeID',
               'PaymentTypeId']

X = df.drop(columns_to_drop, axis = 1)

from category_encoders.basen import BaseNEncoder
encoder = BaseNEncoder(cols = cat_columns, base = 2)
X = encoder.fit_transform(X)

In [54]:
empls = np.array(X.EmployeeID.unique())
# X_test = pd.DataFrame(columns = X.columns)

In [47]:
### Developing X_test
from tqdm import tqdm

for em in tqdm(empls):
    max_date = X[X.EmployeeID == em].Date.max()
    X_test_em = X[X.EmployeeID == em][X.Date == max_date]
    X_test = pd.concat([X_test, X_test_em], axis = 0)
    X = X.drop(X_test_em.index.values)

  
100%|██████████| 5373/5373 [02:52<00:00, 31.17it/s]


In [64]:
# !pip install imblearn
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
import random

RANDOM_SEED = 42

def set_seed(random_state=RANDOM_SEED):
    random.seed(random_state)
    np.random.seed(random_state)

set_seed()
def model_fit(X_train, y_train):
    clf2 = BalancedRandomForestClassifier(n_estimators=200, n_jobs = 8)
    clf2 = clf2.fit(X_train, y_train)
    return clf2

def model_predict(X, models):
    preds = models.predict(X)
    return preds

In [81]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

USE_SCALER = True
RANDOM_SEED = 42

fold = 0
scores = []

kf = KFold(n_splits=5, random_state=RANDOM_SEED, shuffle=True)
kf.get_n_splits(empls)

for train_index, test_index in kf.split(empls):
    train_emps = empls[train_index]
    test_emps = empls[test_index]
    
    X_train = X[X.EmployeeID.isin(train_emps)]
    X_val = X_test[X_test.EmployeeID.isin(test_emps)]

    y_train = X_train.target.values.astype(int)
    y_val = X_val.target.values.astype(int)
    
    X_train = X_train.drop(columns=["EmployeeID", "Date",'target'])
    X_val = X_val.drop(columns=["EmployeeID", "Date",'target'])
    
    if USE_SCALER:
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
    
    print("fold: ", fold)
    print("train:", len(X_train))
    print("val:",len(X_val))
    print(" ")
    
    print('FOLD #{}'.format(fold))
    models = model_fit(X_train, y_train)
    print('END OF MODEL FIT')
    
    y_pred = model_predict(X_val, models)
    
    tmp = y_pred
    tmp2 = y_val

    score = fbeta_score(y_val, y_pred, beta=1.7)
    
    print('Validation Score: {}'.format(score))
    scores.append(score)
    
mean_score = np.mean(scores)
print('MEAN OF SCOREs: {}'.format(mean_score))   

# model = model_fit(X, y)   

fold:  0
train: 55545
val: 1075
 
FOLD #0
END OF MODEL FIT
Validation Score: 0.4504117841658976
fold:  0
train: 55456
val: 1075
 
FOLD #0
END OF MODEL FIT
Validation Score: 0.4510675218734542
fold:  0
train: 55670
val: 1075
 
FOLD #0
END OF MODEL FIT
Validation Score: 0.5058359053094988
fold:  0
train: 55118
val: 1074
 
FOLD #0
END OF MODEL FIT
Validation Score: 0.4076861780449356
fold:  0
train: 55511
val: 1074
 
FOLD #0
END OF MODEL FIT
Validation Score: 0.4388267869962055
MEAN OF SCOREs: 0.4507656352779984


In [138]:
from datetime import datetime

history.loc[:,'Date'] = list(pd.to_datetime(history['Date']))

df = history.merge(employees)
df['target'] = lbls

X_test = df[df.target==2]
X_test = X_test[X_test.Date == datetime(2019,2,1)]

X_test = X_test[X_test.EmployeeID.isin(set(submission.EmployeeID))]

In [139]:
emp_ids = X_test.EmployeeID
X_test = X_test.drop(columns_to_drop, axis = 1)

X_test = encoder.transform(X_test)

X_test = X_test.drop(columns=["Date",'target'])

In [140]:
preds = model.predict(X_test)

In [143]:
result = pd.DataFrame({'EmployeeID':emp_ids, 'target':preds})

In [144]:
result.to_csv('Mykola_initial_submit.csv', index = False)

In [145]:
submission = pd.read_csv("../data/submission.csv")
len(submission.EmployeeID.unique())

4156