In [63]:
#importing the required libraries and packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy.random as nr
import sklearn.model_selection as ms
import seaborn as sns
from sklearn.model_selection import cross_validate
import sklearn.metrics as sklm
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import linear_model
from sklearn import preprocessing
import math
import scipy.stats as ss
from sklearn import feature_selection as fs

import tensorflow as tf
import keras
from keras import backend as k
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam, SGD
from keras.metrics import categorical_crossentropy

%matplotlib inline

In [3]:
#reading the train and test files in

features = pd.read_csv("train.csv")
test = pd.read_csv('test.csv')
print(test.shape)
print(features.shape)

(16496, 18)
(38312, 19)


In [4]:
#recreating new features/columns

def new_name(data, cols):
    for col in cols:
        data[col + '_Str'] = data[col].astype(str)
    
cols = ['Year_of_birth', 'Year_of_recruitment']
new_name(features, cols)
new_name(test, cols)

In [5]:
#dropping unwanted features/columns

features = features.drop(['Year_of_recruitment', 'Year_of_birth'], axis = 1)
test = test.drop(['Year_of_recruitment', 'Year_of_birth'], axis = 1)

print(features.shape)
print(test.shape)

(38312, 19)
(16496, 18)


In [6]:
#aggregating categorical variables
#grouping the years into similar categories using a dictionary and list comprehension

year_categories = {'1991': '1990-2001', '1990': '1990-2001', '1989': '1980-1989', '1992': '1990-2001',
                   '1988': '1980-1989', '1987': '1980-1989', '1993': '1990-2001', '1994': '1990-2001',
                  '1986': '1980-1989', '1985': '1980-1989', '1984': '1980-1989', '1995': '1990-2001', 
                  '1983': '1980-1989', '1982': '1980-1989', '1981': '1980-1989', '1980': '1980-1989',
                  '1996': '1990-2001', '1979': '1970-1979', '1978': '1970-1979', '1977': '1970-1979',
                   '1976': '1970-1979', '1975': '1970-1979', '1997': '1990-2001', '1973': '1970-1979',
                   '1974': '1970-1979', '1971': '1970-1979', '1972': '1970-1979', '1970': '1970-1979',
                   '1969': '1950-1969', '1968': '1950-1969', '1998': '1990-2001', '1966': '1950-1969',
                   '1967': '1950-1969', '1965': '1950-1969', '1964': '1950-1969', '1963': '1950-1969',
                   '1961': '1950-1969', '1962': '1950-1969', '1999': '1990-2001', '2001': '1990-2001',
                   '2000': '1990-2001', '1957': '1950-1969', '1956': '1950-1969', '1955': '1950-1969',
                   '1950': '1950-1969', '1952':'1950-1969', '1958': '1950-1969', '1959': '1950-1969',
                   '1960': '1950-1969'}

features['Year_of_birth_Str'] = [year_categories[x] for x in features['Year_of_birth_Str']]
print(features['Year_of_birth_Str'].value_counts())

test['Year_of_birth_Str'] = [year_categories[x] for x in test['Year_of_birth_Str']]
print(test['Year_of_birth_Str'].value_counts())

1980-1989    16611
1990-2001    15163
1970-1979     4853
1950-1969     1685
Name: Year_of_birth_Str, dtype: int64
1980-1989    7172
1990-2001    6489
1970-1979    2057
1950-1969     778
Name: Year_of_birth_Str, dtype: int64


In [7]:
year_categories1 = {'1982': '1982-2000', '1985': '1982-2000', '1986': '1982-2000', '1987': '1982-2000', '1988': '1982-2000', '1989': '1982-2000',
                    '1990': '1982-2000', '1991': '1982-2000', '1992': '1982-2000', '1993': '1982-2000', 
                   '1994': '1982-2000', '1995': '1982-2000', '1996': '1982-2000', '1997': '1982-2000', 
                   '1998': '1982-2000', '1999': '1982-2000', '2000': '1982-2000', '2001': '2001-2005', 
                   '2002': '2001-2005', '2003': '2001-2005', '2004': '2001-2005', '2005': '2001-2005', 
                   '2006': '2006-2010', '2007': '2006-2010', '2008': '2006-2010', '2009': '2006-2010', 
                   '2010': '2006-2010', '2011': '2011-2015', '2012': '2011-2015', '2013': '2011-2015', 
                   '2014': '2011-2015', '2015': '2011-2015', '2016': '2016-2018', '2017': '2016-2018', 
                   '2018': '2016-2018'}

features['Year_of_recruitment_Str'] = [year_categories1[x] for x in features['Year_of_recruitment_Str']]
print(features['Year_of_recruitment_Str'].value_counts())

test['Year_of_recruitment_Str'] = [year_categories1[x] for x in test['Year_of_recruitment_Str']]
print(test['Year_of_recruitment_Str'].value_counts())

2011-2015    18092
2016-2018    12766
2006-2010     5023
2001-2005     1754
1982-2000      677
Name: Year_of_recruitment_Str, dtype: int64
2011-2015    7744
2016-2018    5498
2006-2010    2196
2001-2005     760
1982-2000     298
Name: Year_of_recruitment_Str, dtype: int64


In [8]:
#aggregating the states into similar states

states = {'ABIA': 'EAST', 'ANAMBRA': 'EAST', 'DELTA': 'EAST', 'BAYELSA': 'EAST',
         'ENUGU': 'EAST', 'EBONYI': 'EAST', 'RIVERS': 'EAST', 'CROSS RIVER': 'EAST',
         'AKWA IBOM': 'EAST', 'IMO': 'EAST', 'KATSINA': 'NORTH', 'KANO': 'NORTH', 'NIGER': 'NORTH',
         'SOKOTO': 'NORTH', 'KANO': 'NORTH', 'KADUNA': 'NORTH', 'BORNO': 'NORTH', 'TARABA':'NORTH',
         'YOBE': 'NORTH', 'ADAMAWA': 'NORTH', 'KEBBI': 'NORTH', 'JIGAWA': 'NORTH', 'ZAMFARA': 'NORTH',
         'KEBBI': 'NORTH', 'PLATEAU': 'NORTH', 'NASSARAWA': 'NORTH', 'FCT': 'NORTH', 'PLATEAU': 'NORTH',
         'BENUE': 'NORTH', 'KOGI':'NORTH', 'BAUCHI': 'NORTH', 'GOMBE': 'NORTH', 'OYO': 'WEST', 'LAGOS': 'WEST', 'OGUN': 'WEST',
         'OSUN': 'WEST', 'EKITI': 'WEST', 'ONDO': 'WEST', 'KWARA': 'WEST', 'EDO': 'WEST'}

features['State_Of_Origin'] = [states[x] for x in features['State_Of_Origin']]
print(features['State_Of_Origin'].value_counts())

test['State_Of_Origin'] = [states[x] for x in test['State_Of_Origin']]
print(test['State_Of_Origin'].value_counts())

WEST     14297
NORTH    13402
EAST     10613
Name: State_Of_Origin, dtype: int64
WEST     6259
NORTH    5684
EAST     4553
Name: State_Of_Origin, dtype: int64


In [9]:
#checking the label counts to check for class inbalance

label_count = features['Promoted_or_Not'].value_counts()
label_count

0    35071
1     3241
Name: Promoted_or_Not, dtype: int64

In [10]:
#creating the label into an array matrix

Labels = np.array(features['Promoted_or_Not'])

In [11]:
#Encoding categorical variables

def encode_string(data):
    enc = preprocessing.LabelEncoder()
    enc.fit(data)
    enc_features = enc.transform(data)
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_features.reshape(-1,1))
    return encoded.transform(enc_features.reshape(-1,1)).toarray()
    
categorical_columns = ['Gender', 'Channel_of_Recruitment', 'State_Of_Origin',
                       'Foreign_schooled', 'Marital_Status', 'Past_Disciplinary_Action', 'Previous_IntraDepartmental_Movement',
                       'Year_of_birth_Str', 'Year_of_recruitment_Str']
Features_enc = encode_string(features['Division'])
for col in categorical_columns:
    temp = encode_string(features[col])
    Features_enc = np.concatenate([Features_enc, temp], axis = 1)
    
print(Features_enc.shape)

test_enc = encode_string(test['Division'])
for col in categorical_columns:
    temps = encode_string(test[col])
    test_enc = np.concatenate([test_enc, temps], axis = 1)
    
print(test_enc.shape)

(38312, 35)
(16496, 35)


In [12]:
#concatenating the numerical variables with the categorical variables

Features_enc = np.concatenate([Features_enc, np.array(features[['Trainings_Attended', 'Last_performance_score',
                                                              'Targets_met', 'Previous_Award', 'Training_score_average']])], axis = 1)

print(Features_enc.shape)

test_enc = np.concatenate([test_enc, np.array(test[['Trainings_Attended', 'Last_performance_score',
                                                              'Targets_met', 'Previous_Award', 'Training_score_average']])], axis = 1)

print(test_enc.shape)

(38312, 40)
(16496, 40)


In [13]:
#using StandardScaler function to scale the numeric features 

scaler = preprocessing.StandardScaler().fit(Features_enc[:, 36:])
Features_enc[:, 36:] = scaler.transform(Features_enc[:, 36:])
Features_enc[:, 31:]

array([[ 0.        ,  0.        ,  1.        , ...,  1.35384256,
        -0.15395043, -1.07512768],
       [ 0.        ,  0.        ,  1.        , ..., -0.73863832,
        -0.15395043, -0.25193251],
       [ 0.        ,  0.        ,  1.        , ..., -0.73863832,
        -0.15395043, -1.00029176],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.35384256,
        -0.15395043,  1.16995006],
       [ 0.        ,  0.        ,  0.        , ..., -0.73863832,
        -0.15395043, -1.37447138],
       [ 0.        ,  0.        ,  1.        , ..., -0.73863832,
        -0.15395043, -1.07512768]])

In [14]:
test_enc[:, 36:] = scaler.transform(test_enc[:, 36:])
test_enc[:, 36]

array([-0.05313941, -2.05629819, -0.05313941, ...,  1.28229978,
       -1.3885786 , -0.05313941])

In [15]:
#encoding the year of birth category

birth_encoded = encode_string(features['Year_of_birth_Str'])
birth_encoded_test = encode_string(test['Year_of_birth_Str'])

In [16]:
#concatenating the year of birth(encoded) to the rest of the features

Features_enc = np.concatenate([Features_enc, birth_encoded], axis = 1)
print(Features_enc.shape)

test_enc = np.concatenate([test_enc, birth_encoded_test], axis = 1)
print(test_enc.shape)

(38312, 44)
(16496, 44)


In [17]:
#aggregating the number of previous employers

employers = {'0': '0-1', '1': '0-1', '2': '2-3', '3': '2-3',
             '4': 'Greater than 3', '5': 'Greater than 3', 'More than 5': 'Greater than 5'}

features['No_of_previous_employers'] = [employers[x] for x in features['No_of_previous_employers']]
test['No_of_previous_employers'] = [employers[x] for x in test['No_of_previous_employers']]

print(features['No_of_previous_employers'].value_counts())
print(test['No_of_previous_employers'].value_counts())

0-1               32139
2-3                3505
Greater than 3     2267
Greater than 5      401
Name: No_of_previous_employers, dtype: int64
0-1               13931
2-3                1469
Greater than 3      940
Greater than 5      156
Name: No_of_previous_employers, dtype: int64


In [18]:
#encoding the number of previous employers(encoded) category

employers_encodedd = encode_string(features['No_of_previous_employers'])
employers_encoded_testt = encode_string(test['No_of_previous_employers'])

In [19]:
Features_enc = np.concatenate([Features_enc, employers_encodedd], axis = 1)
print(Features_enc.shape)

test_enc = np.concatenate([test_enc, employers_encoded_testt], axis = 1)
print(test_enc.shape)

(38312, 48)
(16496, 48)


In [20]:
## Randomly sample cases to create independent training and test data
nr.seed(9988)
indx = range(Features_enc.shape[0])
indx = ms.train_test_split(indx, test_size = 0.3)
x_train = Features_enc[indx[0],:]
y_train = np.ravel(Labels[indx[0]])
x_test = Features_enc[indx[1],:]
y_test = np.ravel(Labels[indx[1]])

In [21]:
## Randomly sample cases to create independent training and test data
np.random.seed(9988)
indx1 = range(x_test.shape[0])
indx1 = ms.train_test_split(indx1, test_size = 0.5)
x_test1 = x_test[indx1[0],:]
y_test1 = np.ravel(y_test[indx1[0]])
x_test2 = x_test[indx1[1],:]
y_test2 = np.ravel(y_test[indx1[1]])

In [22]:
valid_set = (x_test1, y_test1)

In [None]:
model = Sequential([Dense(24, input_shape=(48,), activation='relu'),
                    Dense(12, activation='relu'),
                    Dense(6, activation='relu'),
                    Dense(2, activation='sigmoid'),])

callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

model.compile(Adam(lr=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model1 = Sequential([Dense(5, input_shape=(48,), activation='relu'),
                    Dense(2, activation='sigmoid'),])

model1.compile(Adam(lr=0.0001), loss='BinaryCrossentropy', metrics=['accuracy'])

In [89]:
model.fit(x_train, y_train, batch_size=10, validation_data = valid_set, epochs=30, callbacks=[callback], shuffle=True, verbose=2)

Epoch 1/30
2682/2682 - 20s - loss: 0.3330 - accuracy: 0.8889 - val_loss: 0.2427 - val_accuracy: 0.9193
Epoch 2/30
2682/2682 - 17s - loss: 0.2442 - accuracy: 0.9158 - val_loss: 0.2336 - val_accuracy: 0.9210
Epoch 3/30
2682/2682 - 17s - loss: 0.2378 - accuracy: 0.9173 - val_loss: 0.2284 - val_accuracy: 0.9234
Epoch 4/30
2682/2682 - 22s - loss: 0.2333 - accuracy: 0.9192 - val_loss: 0.2239 - val_accuracy: 0.9250
Epoch 5/30
2682/2682 - 28s - loss: 0.2292 - accuracy: 0.9210 - val_loss: 0.2192 - val_accuracy: 0.9278
Epoch 6/30
2682/2682 - 17s - loss: 0.2249 - accuracy: 0.9231 - val_loss: 0.2147 - val_accuracy: 0.9287
Epoch 7/30
2682/2682 - 18s - loss: 0.2205 - accuracy: 0.9246 - val_loss: 0.2106 - val_accuracy: 0.9302
Epoch 8/30
2682/2682 - 21s - loss: 0.2162 - accuracy: 0.9262 - val_loss: 0.2061 - val_accuracy: 0.9311
Epoch 9/30
2682/2682 - 30s - loss: 0.2120 - accuracy: 0.9271 - val_loss: 0.2022 - val_accuracy: 0.9327
Epoch 10/30
2682/2682 - 40s - loss: 0.2083 - accuracy: 0.9278 - val_loss:

KeyboardInterrupt: 

In [78]:
model.loss

'sparse_categorical_crossentropy'

In [79]:
pred = model.predict_classes(x_test2)

In [80]:
#Features_enc.reshape(38312, 48, 1).shape

In [81]:
predictions = model.predict(test_enc, batch_size=10, verbose=0)

In [82]:
predictions[:200]

array([[5.5445683e-01, 8.7758899e-04],
       [3.6999291e-01, 2.6103258e-03],
       [5.5075854e-01, 1.1201799e-03],
       [5.1290214e-01, 1.0842681e-03],
       [1.6052684e-01, 2.2913039e-02],
       [1.6448212e-01, 2.7233869e-02],
       [1.6959473e-01, 2.9939324e-02],
       [3.9273870e-01, 2.8153360e-03],
       [1.8527496e-01, 5.4462880e-02],
       [4.1220760e-01, 1.9690990e-03],
       [5.3845000e-01, 1.0144413e-03],
       [5.5897880e-01, 8.6322427e-04],
       [1.5857083e-01, 3.7951171e-02],
       [3.4828404e-01, 2.6604235e-03],
       [2.1094906e-01, 3.4309506e-02],
       [1.8961650e-01, 1.3693392e-02],
       [4.2969659e-01, 2.4564862e-03],
       [2.1924081e-01, 3.4502119e-02],
       [4.5811903e-01, 1.7595589e-03],
       [2.4947292e-01, 4.1102886e-02],
       [1.9057283e-01, 1.9019961e-02],
       [4.1576865e-01, 1.9321740e-03],
       [2.2000879e-01, 2.4123728e-02],
       [5.1817006e-01, 1.0443121e-01],
       [1.7629999e-01, 5.7588309e-02],
       [1.4461514e-01, 2.

In [83]:
preds = model.predict_classes(test_enc, batch_size=10, verbose=0)

In [84]:
preds[:200]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [85]:
np.unique(preds)

array([0, 1])

In [86]:
confusion_matrix(pred, y_test2)

array([[5224,  342],
       [  15,  166]])

In [87]:
accuracy_score(pred, y_test2)

0.9378806333739342

In [None]:
from keras.wrappers.scikit_learn import KerasRegressor