In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from category_encoders import WOEEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Loading data**

In [None]:
# Competition files: train/test rows are flagged with is_generated=1.
train_df = pd.read_csv('/kaggle/input/playground-series-s3e3/train.csv').assign(is_generated=1)
test_df = pd.read_csv('/kaggle/input/playground-series-s3e3/test.csv').assign(is_generated=1)
submission = pd.read_csv('/kaggle/input/playground-series-s3e3/sample_submission.csv')

# Additional IBM HR attrition dataset: rows are flagged with is_generated=0.
addition_data = pd.read_csv(
    '/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
).assign(is_generated=0)

In [None]:
# Remove the rows labelled 527 and 1398, then renumber the index from 0.
# NOTE(review): the reason for excluding these two rows is not stated — presumably outliers; confirm.
train_df = train_df.drop(index=[527, 1398]).reset_index(drop=True)

In [None]:
# Drop the `id` column from the training frame and display the result.
train_df = train_df.drop(columns=['id'])
train_df

In [None]:
# Distribution of the target in the training frame.
train_df['Attrition'].hist()

In [None]:
# Per-column flag: does the training frame contain any missing value?
train_df.isna().any(axis=0)

In [None]:
# Encode the target as 1 for "Yes" and 0 for anything else.
# Matching against both "Yes" and 1 keeps this cell idempotent: re-running it on
# already-encoded labels leaves them unchanged instead of collapsing every label to 0.
addition_data['Attrition'] = addition_data['Attrition'].isin(["Yes", 1]).astype(int)
# addition_data = addition_data[addition_data.Attrition == 1]

In [None]:
# addition_data.Attrition.hist()

In [None]:
# Per-column flag: does the additional dataset contain any missing value?
addition_data.isna().any(axis=0)

In [None]:
# Append the additional dataset below the competition rows (fresh 0..n-1 index),
# then remove the EmployeeNumber column.
train_df = (
    pd.concat([train_df, addition_data], axis=0, ignore_index=True)
    .drop(columns=['EmployeeNumber'])
)
train_df

In [None]:
# Re-check for missing values after merging in the additional dataset.
train_df.isna().any(axis=0)

In [None]:
def is_young(x):
    """Return 1 when the age `x` is 25 or below, otherwise 0."""
    return 1 if x <= 25 else 0
    
def young_and_low_daily_rate(x):
    """Flag rows that are both young and on a low daily rate.

    Args:
        x: A mapping-like row (e.g. a DataFrame row) exposing 'Age' and
            'DailyRate' entries.

    Returns:
        1 if Age <= 25 and DailyRate < 500, otherwise 0.
    """
    # Use `and`, not `&`: `&` binds tighter than the comparison operators, so the
    # previous expression was evaluated as `Age <= (25 & DailyRate) < 500`.
    if x['Age'] <= 25 and x['DailyRate'] < 500:
        return 1
    return 0
    
def overtime_satisfaction(x):
    """Income/stock/satisfaction score per year of age, adjusted for overtime.

    The stock option level is offset by 0.05 when the row's 'OverTime' value is
    'Yes' and by 1.05 otherwise; the result is
    MonthlyIncome * (StockOptionLevel + offset) * JobSatisfaction / Age.
    """
    stock_offset = 0.05 if x['OverTime'] == 'Yes' else 1.05
    weighted_income = x['MonthlyIncome'] * (x['StockOptionLevel'] + stock_offset) * x['JobSatisfaction']
    return weighted_income / x['Age']

In [None]:
# Stack train and test rows so every engineered feature is computed the same way for both.
df = pd.concat([train_df, test_df], axis=0)
# NOTE(review): these three columns are presumably constant and carry no signal — confirm.
df = df.drop(columns=["EmployeeCount", "Over18", "StandardHours"])

# Threshold-based flags reuse the helper functions (passed directly, no lambda wrapper needed).
df['is_young'] = df['Age'].apply(is_young)
df['young_and_underpaid'] = df.apply(young_and_low_daily_rate, axis=1)

# Plain column arithmetic is vectorised instead of being evaluated row by row;
# the operand order matches the original row-wise expressions.
df['worklife_stock'] = df['WorkLifeBalance'] + df['StockOptionLevel']
df['income_satisfaction'] = df['JobSatisfaction'] * df['MonthlyIncome']
df['income_level_environ_job_sat'] = (
    df['EnvironmentSatisfaction'] * df['JobSatisfaction'] * (df['MonthlyIncome'] / df['JobLevel'])
)

# Branches on the row's OverTime value, so it stays a row-wise apply.
df['overtime_stock'] = df.apply(overtime_satisfaction, axis=1)

# df = pd.get_dummies(df)
df

In [None]:
# Columns passed to the encoder, in order: original columns, bookkeeping columns,
# then the engineered features.
features = [
    # original columns
    'Age', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome',
    'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender',
    'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole',
    'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate',
    'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
    'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
    'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
    'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
    'YearsWithCurrManager',
    # bookkeeping columns
    'is_generated', 'id',
    # engineered features
    'is_young', 'young_and_underpaid', 'worklife_stock', 'income_satisfaction',
    'income_level_environ_job_sat', 'overtime_stock',
]

# Subset of `features` that is treated as categorical and WOE-encoded.
cat_features = [
    'BusinessTravel', 'Department', 'Education', 'EducationField',
    'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobLevel',
    'JobRole', 'JobSatisfaction', 'MaritalStatus', 'NumCompaniesWorked',
    'OverTime', 'PerformanceRating', 'RelationshipSatisfaction',
    'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
    'WorkLifeBalance', 'YearsAtCompany', 'is_young', 'young_and_underpaid',
]

WOE (Weight of Evidence) encoder, adapted from @faelk8's notebook: https://www.kaggle.com/code/faelk8/catboost/notebook

In [None]:
# `cols` is a constructor parameter of the encoder; passing it to fit() is not part of the
# documented API, so the columns to encode are declared here instead.
woe = WOEEncoder(cols=cat_features, drop_invariant=True, randomized=True)

# Cast the categorical columns to strings so each distinct value is treated as a category label.
for col in cat_features:
    df[col] = df[col].astype(str)

# Fit on the training rows only: the last len(test_df) rows of df are the test rows,
# which have no Attrition label.
woe.fit(df[features][:-len(test_df)], df['Attrition'][:-len(test_df)])

# Encode train and test rows together, then re-attach the target column.
X = woe.transform(df[features])
X['Attrition'] = df['Attrition']
df = X

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Keep the target aside, then drop it (and the row id) so only model inputs are scaled.
y = df['Attrition']
df = df.drop(columns=['id', 'Attrition'])

# Standardise every remaining column in place; df still holds train and test rows together here.
df[df.columns] = scaler.fit_transform(df)

In [None]:
# The last len(test_df) rows of df are the test rows; everything before them is training data.
# .copy() gives train_df its own data, so adding the target column below does not write
# into a slice of df (which triggers pandas' chained-assignment warning).
train_df = df.iloc[:-len(test_df), :].copy()
# Positional slice of the target, made explicit with .iloc.
train_df['Attrition'] = y.iloc[:-len(test_df)]
test_df = df.iloc[-len(test_df):, :].reset_index(drop=True)

# Final model inputs: training features/target and the test features.
X = train_df.drop(columns='Attrition')
y = train_df.Attrition
X_test = test_df

In [None]:
# X1 = X[:500]
# y1 = y[:500]
# X = X[500:]
# y = y[500:]

In [None]:
X

**Keras NN**

In [None]:
from tensorflow import keras
import tensorflow_addons as tfa
from tensorflow.keras import layers

from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, Lambda, Concatenate, Add, BatchNormalization, LeakyReLU

from sklearn.model_selection import KFold, StratifiedKFold, RepeatedStratifiedKFold

from sklearn.metrics import classification_report

# import keras_tuner

In [None]:
# One fitted model and one test-set prediction array is collected per CV split.
models, preds = [], []

# Weight given to class 1 in model.fit (class 0 gets weight 1.0).
class_weight = 10

# Dropout rate shared by every Dropout layer built in get_model().
dr = 0.1

# Cross-validation layout: n_folds stratified folds, repeated `repeats` times.
n_folds = 11  # 10
repeats = 10  # 10

# k_fold = StratifiedKFold(n_splits=n_folds, random_state=42, shuffle=True)
k_fold = RepeatedStratifiedKFold(n_splits=n_folds, n_repeats=repeats, random_state=42)

def get_model():
    """Build and compile a fresh, untrained binary classifier.

    The network is a stack of Dense -> LeakyReLU(alpha=0.3) -> Dropout blocks
    with widths 512, 256, 128, 64, 32, 16 and 8 (a BatchNormalization layer
    follows the 64-unit block), then Dense(4) and Dense(2) with LeakyReLU but
    no dropout, and a single sigmoid output unit. The dropout rate is read
    from the module-level `dr` at call time.

    Returns:
        A compiled `keras.Sequential` model using Adam (learning rate 1e-4),
        sigmoid focal cross-entropy loss (alpha=0.8, gamma=2.0) and an AUC
        metric named 'auc'.
    """
    model = keras.Sequential([
        layers.Dense(512),
        layers.LeakyReLU(alpha=0.3),
        layers.Dropout(rate=dr),
        layers.Dense(256),
        layers.LeakyReLU(alpha=0.3),
        layers.Dropout(rate=dr),
        layers.Dense(128),
        layers.LeakyReLU(alpha=0.3),
        layers.Dropout(rate=dr),
        layers.Dense(64),
        layers.LeakyReLU(alpha=0.3),
        layers.Dropout(rate=dr),
        layers.BatchNormalization(),
        layers.Dense(32),
        layers.LeakyReLU(alpha=0.3),
        layers.Dropout(rate=dr),
        layers.Dense(16),
        layers.LeakyReLU(alpha=0.3),
        layers.Dropout(rate=dr),
        layers.Dense(8),
        layers.LeakyReLU(alpha=0.3),
        layers.Dropout(rate=dr),
        layers.Dense(4),
        layers.LeakyReLU(alpha=0.3),
        layers.Dense(2),
        layers.LeakyReLU(alpha=0.3),
        layers.Dense(1, activation='sigmoid'),
    ])

    opt = keras.optimizers.Adam(learning_rate=0.0001)

    model.compile(
        optimizer=opt,
        loss=tfa.losses.SigmoidFocalCrossEntropy(alpha=0.8, gamma=2.0),
        # Name the metric explicitly. This function is called once per CV split, and an
        # unnamed AUC metric can be given an auto-generated unique name (auc_1, auc_2, ...)
        # on later models, in which case callbacks monitoring "val_auc" no longer find it.
        # A fixed name keeps the logged keys "auc"/"val_auc" for every model.
        metrics=[keras.metrics.AUC(name='auc')],
    )

    return model



# Stop a fit once validation AUC has failed to improve by at least 1e-5 for 30 epochs,
# and roll the model back to its best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_auc", mode='max',
    patience=30, min_delta=0.00001,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.1 after 3 epochs without a validation-AUC gain of
# at least 1e-6, never going below 1e-8.
plat = keras.callbacks.ReduceLROnPlateau(
    monitor="val_auc", mode='max',
    patience=3, min_delta=0.000001,
    factor=0.1, min_lr=1e-8,
)


# Train one freshly built model per CV split; each split contributes a fitted model
# and a test-set prediction to the ensemble.
for train_index, valid_index in k_fold.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index]

    model = get_model()

    # `history` is overwritten on every iteration, so only the last split's curves survive.
    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_valid, y_valid),
        batch_size=64,
        epochs=500,
        class_weight={0: 1.0, 1: class_weight},
        callbacks=[early_stopping, plat],
        verbose=0,
    )

    models.append(model)
    preds.append(model.predict(X_test))


In [None]:
# Learning curves of the most recently fitted model; the first logged epoch (row 0) is skipped.
history_df = pd.DataFrame(history.history)
history_df[['loss', 'val_loss']].loc[1:].plot()
history_df[['auc', 'val_auc']].loc[1:].plot()

In [None]:
# preds_evalX1 = []
# for model in models:
#     preds_evalX1.append(model.predict(X1))
    
# predsX1 = np.average(np.array(preds_evalX1),axis=0)
# dfX1 = pd.DataFrame(predsX1)
# dfX1.to_csv('KerasNNX1.csv')
# dfX1

In [None]:
# Score the ensemble on the training features X: average the per-model outputs,
# round them to 0/1 labels and compare against y.
preds_eval = [model.predict(X) for model in models]

print(classification_report(y, np.round(np.average(np.array(preds_eval), axis=0))))

In [None]:
# Ensemble prediction for the test set: element-wise mean over all per-split predictions.
pred = np.stack(preds).mean(axis=0)
pred

In [None]:
# Write the averaged predictions into the submission frame, bounded to [0, 1].
submission['Attrition'] = pred
submission['Attrition'] = submission['Attrition'].clip(lower=0, upper=1)
submission

In [None]:
# Save the submission to the working directory without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Distribution of the predicted attrition values in the submission.
submission['Attrition'].hist()