In [236]:
import pandas as pd
import numpy as np

from sklearn.metrics import fbeta_score

pd.options.display.max_colwidth=100
pd.options.display.max_columns=300

employees = pd.read_csv("../data/employees.csv")
history = pd.read_csv("../data/history.csv")
submission = pd.read_csv("../data/submission.csv")

# history.loc[:,'Date'] = list(pd.to_datetime(history['Date']))

def get_month(text):
    if type(text) == str:
        numbers = text.split("/")
        return int(numbers[0])

def get_year(text):
    if type(text) == str:
        numbers = text.split("/")
        return int(numbers[-1])

df = history.merge(employees)

# for data labeling
def label_df(data, n_month=3):
    labels = []
    for emp in data.EmployeeID.unique():
        curr_emp = list(data[data.EmployeeID == emp]['DismissalDate'])
        len_emp = len(curr_emp)
        if pd.isnull(curr_emp[0]):
            labels += [0 for _ in range(len_emp - n_month)] + [2 for _ in range(n_month)]
        else:
            labels += [0 for _ in range(len_emp - n_month)] + [1 for _ in range(n_month)]
    return labels

lbls = label_df(df)
df['target'] = lbls
df = df[df.target!=2]

In [237]:
# !pip install imblearn
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
import random

RANDOM_SEED = 42

def set_seed(random_state=RANDOM_SEED):
    random.seed(random_state)
    np.random.seed(random_state)

def model_fit(X_train, y_train):
    clf2 = BalancedRandomForestClassifier(n_estimators=200, n_jobs = 8)
    clf2 = clf2.fit(X_train, y_train)
    return clf2

def model_predict(X, models):
    preds = models.predict(X)
    return preds

In [239]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

columns_to_drop = ['HiringDate', 'DismissalDate']

cat_columns = ['DevCenterID', 'SBUID', 'PositionID', 'PositionLevel', 
               'IsTrainee', 'LanguageLevelID', 'CustomerID', 'ProjectID', 
               'IsInternalProject', 'OnSite', 'CompetenceGroupID', 'FunctionalOfficeID',
               'PaymentTypeId']

X = df.drop(columns_to_drop, axis = 1)

from category_encoders.basen import BaseNEncoder
encoder = BaseNEncoder(cols = cat_columns, base = 2)
X = encoder.fit_transform(X)

USE_SCALER = True
RANDOM_SEED = 1
set_seed()

X["month"] = X["Date"].apply(get_month)
X["year"] = X["Date"].apply(get_year)
X = X.sort_values(["year","month"])
X["year_month"] = X.apply(lambda row: str(row["year"])+"_"+str(row["month"]), axis=1)
year_month = list(X.year_month.unique())
mapping_year_month = dict(zip(year_month, range(len(year_month))))
X["order_in_time"] = X["year_month"].map(mapping_year_month)
splits = [[[0,9],[10,11]],
          [[0,11],[12,13]],
          [[0,13],[14,15]],
          [[0,15],[16,17]],
          [[0,17],[18,19]]]

fold = 0
scores = []

kf = KFold(n_splits=5, random_state=RANDOM_SEED, shuffle=True)
kf.get_n_splits(X)

for split in splits:
    fold += 1
    if fold ==5:
        continue;
    
    X_train = X.query(f"order_in_time >= "+str(split[0][0])+" & order_in_time <"+str(split[0][1]))
    X_val = X.query(f"order_in_time >= "+str(split[1][0])+" & order_in_time <"+str(split[1][1]))
    
    X_val = X_val.sort_values(by="Date").drop_duplicates(subset=["EmployeeID"], keep="last")
    
    
    y_train = X_train['target']
    y_val = list(X_val['target'])
    
    X_train =X_train.drop(columns=['EmployeeID',"month", "year", "year_month", "order_in_time", "Date",'target'])
    X_val =X_val.drop(columns=['EmployeeID',"month", "year", "year_month", "order_in_time", "Date",'target'])
    
    if USE_SCALER:
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
    
    print("fold: ", fold)
    print("train:", len(X_train))
    print("val:",len(X_val))
    print(" ")
    
    print('FOLD #{}'.format(fold))
    models = model_fit(X_train, y_train)
    print('END OF MODEL FIT')
    
    y_pred = model_predict(X_val, models)

    score = fbeta_score(y_val, y_pred, beta=1.7)
    
    print('Validation Score: {}'.format(score))
    scores.append(score)
    
mean_score = np.mean(scores)
print('MEAN OF SCOREs: {}'.format(mean_score))   


y = X.target
emps_X = X.EmployeeID
X = X.drop(columns=['EmployeeID',"month", "year", "year_month", "order_in_time", "Date",'target'])

model = model_fit(X, y)   

fold:  1
train: 37624
val: 4527
 
FOLD #1
END OF MODEL FIT
Validation Score: 0.2724328109953602
fold:  2
train: 46644
val: 4572
 
FOLD #2
END OF MODEL FIT
Validation Score: 0.2361719843188047
fold:  3
train: 55756
val: 4719
 
FOLD #3
END OF MODEL FIT
Validation Score: 0.29528479609929076
fold:  4
train: 65100
val: 4576
 
FOLD #4
END OF MODEL FIT
Validation Score: 0.30095223691328654
MEAN OF SCOREs: 0.27621045708168557


# Prediction logic:

In [137]:
from datetime import datetime

history.loc[:,'Date'] = list(pd.to_datetime(history['Date']))

df = history.merge(employees)
df['target'] = lbls

X_test = df[df.target==2]

X_test = X_test[X_test.EmployeeID.isin(set(submission.EmployeeID))]


In [138]:
emp_ids = X_test.EmployeeID
X_test = X_test.drop(columns_to_drop, axis = 1)
X_test = encoder.transform(X_test)
X_test = X_test.drop(columns=['EmployeeID',"Date",'target'])

In [139]:
preds = model.predict_proba(X_test)

In [140]:
result = pd.DataFrame({'EmployeeID':emp_ids, 'target':preds[:,1]})
result = pd.DataFrame({'EmployeeID':(result.groupby('EmployeeID').max().target>0.5).index, 'target':(result.groupby('EmployeeID').max().target>0.5).astype(int).values})

In [141]:
result.to_csv('Mykola_initial_submit_max_score.csv', index = False)

In [142]:
submission = pd.read_csv("../data/submission.csv")
len(submission.EmployeeID.unique())

4156

In [143]:
result

Unnamed: 0,EmployeeID,target
0,00116D71-E87D-4B64-A566-1F29B2A798A8,0
1,0034ECA4-0562-4AC7-A826-4AE81C64D69F,0
2,00384806-F711-41BA-A924-8F27E996F891,1
3,005B5FD6-FD19-4924-98E4-4C06F7F6BF2C,1
4,0061CAE7-B123-46B0-9BF7-E1E94E9AD80B,0
...,...,...
4151,FFCFA379-9529-49EF-87BA-1522FE94B415,0
4152,FFE9E1F0-1DB1-4BA8-A8FB-026E7DBCF49F,1
4153,FFEBB9DA-B0CF-49AE-91D3-14A0BF22219E,1
4154,FFED12A3-5B28-4101-908A-2851CBADE045,0


# Developing LSTM model

# building train and test datasets:

In [593]:
# building train and test datasets:
import pandas as pd
import numpy as np

from sklearn.metrics import fbeta_score

pd.options.display.max_colwidth=100
pd.options.display.max_columns=300

employees = pd.read_csv("../data/employees.csv")
history = pd.read_csv("../data/history.csv")
submission = pd.read_csv("../data/submission.csv")

history.loc[:,'Date'] = list(pd.to_datetime(history['Date']))

def get_month(text):
    if type(text) == str:
        numbers = text.split("/")
        return int(numbers[0])

def get_year(text):
    if type(text) == str:
        numbers = text.split("/")
        return int(numbers[-1])

df = history.merge(employees)

# for data labeling
def label_df(data, n_month=3):
    labels = []
    for emp in data.EmployeeID.unique():
        curr_emp = list(data[data.EmployeeID == emp]['DismissalDate'])
        len_emp = len(curr_emp)
        if pd.isnull(curr_emp[0]):
            labels += [0 for _ in range(len_emp - n_month+2)] + [2 for _ in range(n_month-2)]
        else:
            labels += [0 for _ in range(len_emp - n_month)] + [1 for _ in range(n_month)]
    return labels

lbls = label_df(df)
df['target'] = lbls
df = df[df.target!=2]

In [631]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

columns_to_drop = ['Date','HiringDate', 'DismissalDate']

cat_columns = ['DevCenterID', 'SBUID', 'PositionID', 'PositionLevel', 
               'IsTrainee', 'LanguageLevelID', 'CustomerID', 'ProjectID', 
               'IsInternalProject', 'OnSite', 'CompetenceGroupID', 'FunctionalOfficeID',
               'PaymentTypeId']

X = df.drop(columns_to_drop, axis = 1)

from category_encoders.basen import BaseNEncoder
encoder = BaseNEncoder(cols = cat_columns, base = 2)

X = encoder.fit_transform(X)
X = X.reset_index()

tmp = X.EmployeeID
X.drop('EmployeeID', axis = 1, inplace = True)
cols = X.columns

X = pd.DataFrame(scaler.fit_transform(X), columns = cols)
X['EmployeeID'] = tmp

In [632]:
test_set = X[X.EmployeeID.isin(set(submission.EmployeeID))]
test_set = test_set.groupby('EmployeeID').tail(5)
train_set = X.drop(test_set.index)

In [635]:
### test dataset:
import tensorflow as tf

cols = list(X.columns)
cols.pop(-1)
cols.pop(-1)

X_test = [person[1][cols].values for person in test_set.groupby('EmployeeID')]

In [637]:
### train dataset:
import tensorflow as tf
from tqdm import tqdm

cols = list(X.columns)
cols.pop(-1)

data = [person[1][cols].values for person in train_set.groupby('EmployeeID')]

# getting subsequences:
def get_subsequence(sequence):
    if sequence.shape[0]<6:
        return []
#     if (sequence.shape[0]>=2) and (sequence.shape[0]<6):
#         return [sequence]
    else:
        tmp_data = []
        for i in range(0, sequence.shape[0]-5):
            tmp_data.append(sequence[i:i+6])
        return tmp_data

sub_data = []
for x in tqdm(data):
    sub_data = sub_data+ get_subsequence(x)

x_train = [x[:-1,:-1] for x in sub_data]

# y_train = tf.keras.utils.to_categorical([x[-1, -1] for x in data])   # Expects class labels from 0 to n (-> subtract 1).
y_train = [x[-1, -1] for x in sub_data]
print(len(x_train))
print(len(y_train))

100%|██████████| 5216/5216 [00:00<00:00, 8106.50it/s] 

37498
37498





In [640]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.15, random_state=42)

# Training the model:

In [649]:
import pandas as pd
import tensorflow as tf

from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(5, input_shape=(None, 91), dropout=0.5, recurrent_dropout=0.5))  # LSTM for arbitrary length series.
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

optimizer = tf.keras.optimizers.Adam(lr=0.0001)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc',f1_m,precision_m, recall_m])

class TrainGenerator(tf.keras.utils.Sequence):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        # Need to expand arrays to have batch size 1.
        return self.x[index][None, :, :], self.y[index][None]

model.fit_generator(TrainGenerator(X_train, y_train), epochs=150)

Epoch 1/150

KeyboardInterrupt: 

In [None]:
model.fit_generator(TrainGenerator(X_train, y_train), epochs=150)

In [457]:
pred = [model.predict(x[None, :, :]).ravel() for x in X_test]

In [461]:
pred = np.array([x[0] for x in pred])
pred = np.array(pred)
# pred>0.5

In [474]:
from datetime import datetime

# history.loc[:,'Date'] = list(pd.to_datetime(history['Date']))

# df = history.merge(employees)
# df['target'] = lbls

# X_test = df[df.target==2]
# X_test = X_test[X_test.Date == datetime(2019,2,1)]

# X_test = X_test[X_test.EmployeeID.isin(set(submission.EmployeeID))]
# emp_ids = X_test.EmployeeID

result = pd.DataFrame({'EmployeeID':emp_ids, 'target':(pred>0.03).astype(int)})

result.to_csv('Mykola_initial_LSTM_submit_super_dummie_3.csv', index = False)