In [27]:
#library imports
import pandas as pd
import numpy as np
import matplotlib as plt

In [28]:
#function and model imports
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score

In [29]:
import tensorflow as tf

In [30]:
def df_shuffle(inp):
    inp=shuffle(inp)
    inp=inp.reset_index(drop=True)
    return inp

In [31]:
train_data=pd.read_csv('../Data/train_LZdllcl.csv')
test_data=pd.read_csv('../Data/test_2umaH9m.csv')
print(train_data.shape)
print(test_data.shape)
#tr_data

(54808, 14)
(23490, 13)


In [32]:
train_data=df_shuffle(train_data)

In [33]:
t=train_data.isnull().sum()
t[t>0]

education               2409
previous_year_rating    4124
dtype: int64

In [34]:
def process_categorical(train,test):    #use this after removing target variable
    fulldata=pd.concat([train,test],axis=0)
    fulldata=fulldata.fillna(-1)
    #fulldata=fulldata.drop('region',axis=1)
    fulldata['region']=fulldata['region'].apply(lambda x : int(x[7:]))
    trainend=len(train)
    onecoded=pd.get_dummies(fulldata)
    return (onecoded[:trainend],onecoded[trainend:])

In [35]:
#drop target variable for preprocessing
ytrainfull=train_data['is_promoted']
X_train=train_data.drop('is_promoted',axis=1)

In [36]:
xtrainfull,xtestproc=process_categorical(X_train,test_data)

In [37]:
#sample train data to test on itself
end = 40000 
xtrain = xtrainfull[:end]
ytrain = ytrainfull[:end]

xvalid = xtrainfull[end:54801] 
yvalid = ytrainfull[end:54801]
xvalid=xvalid.reset_index(drop=True)
yvalid=yvalid.reset_index(drop=True)

In [38]:
xtrain.shape

(40000, 27)

In [39]:
X_train_std=scale(xtrain)
X_test_std=scale(xvalid)
X_sub_std=scale(xtestproc)

In [40]:
Y_bin_train = pd.get_dummies(ytrain)
Y_bin_test = pd.get_dummies(yvalid)

In [62]:
# Parameters
learning_rate =0.005 #0.02
#num_steps = 500
batch_size = 128#128
#display_step = 100

# Network Parameters
n_hidden_1 = 20#36 # 1st layer number of neurons
n_hidden_2 = 10#24 # 2nd layer number of neurons
n_hidden_3 = 5#12
num_input = 26 # MNIST data input (img shape: 28*28)
num_classes = 2 # MNIST total classes (0-9 digits)


# tf Graph input
X = tf.placeholder("float", [None, num_input])
Y = tf.placeholder("float", [None, num_classes])

In [63]:
# Store layers weight & bias
inz = tf.contrib.layers.xavier_initializer()
weights = {
    'h1': tf.Variable(tf.random_normal([num_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_hidden_3, num_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'b3': tf.Variable(tf.random_normal([n_hidden_3])),
    'out': tf.Variable(tf.random_normal([num_classes]))
}

In [64]:
# Create model
def neural_net(x):
    # Hidden fully connected layer with 256 neurons
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    # Hidden fully connected layer with 256 neurons
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    layer_2 = tf.nn.relu(layer_2)
    #layer_2 = tf.nn.dropout(layer_2, 0.9)
    # Output fully connected layer with a neuron for each class
    layer_3 = tf.add(tf.matmul(layer_2, weights['h3']), biases['b3'])
    layer_3 = tf.nn.relu(layer_3) 
    
    out_layer = tf.matmul(layer_3, weights['out']) + biases['out']
    return out_layer

In [65]:
# Construct model
logits = neural_net(X)


class_weights = tf.constant([[1.0, 12.0]])
# Define loss and optimizer
#loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
#    logits=logits, labels=Y))
loss_op = tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(
    logits=logits, targets=Y,pos_weight=class_weights))


# your class weights
#class_weights = tf.constant([[1.0, 3.0]])
# deduce weights for batch samples based on their true label
#weights_imb = tf.reduce_sum(class_weights * Y, axis=1)
# compute your (unweighted) softmax cross entropy loss
#unweighted_losses = tf.nn.softmax_cross_entropy_with_logits(logits= logits,labels=Y)
# apply the weights, relying on broadcasting of the multiplication
#weighted_losses = unweighted_losses * weights_imb
# reduce the result to get your final loss
#loss_op = tf.reduce_mean(weighted_losses)





optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
pred_tf=tf.argmax(logits, 1)

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

In [78]:
# Start training

no_iterations = int(end/batch_size)-2

all_metrics=[]
for sel_col in range(0,len(xtrain.columns)):


    col_metrics=[]
    #col_mask=[sel_col,sel_col]
    col_mask =list(range(0,len(xtrain.columns)))
    col_mask.remove(sel_col)
    
    
    X_train_sel=X_train_std[:,col_mask]
    X_test_sel=X_test_std[:,col_mask]

    with tf.Session() as sess:

        # Run the initializer
        sess.run(init)

        for step in range(1, 25):
            for no_batch in range(0,no_iterations):

                batch_x = X_train_sel[(no_batch*batch_size) :((no_batch+1)*batch_size) ]
                batch_y = Y_bin_train[(no_batch*batch_size) :((no_batch+1)*batch_size) ]
                # Run optimization op (backprop)
                sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
                #if step % display_step == 0 or step == 1:
                    # Calculate batch loss and accuracy
            loss, acc = sess.run([loss_op, accuracy], feed_dict={X: X_train_sel,Y: Y_bin_train})
            print("Step " + str(step) + ", Minibatch Loss= " + \
                          "{:.4f}".format(loss) + ", Training Accuracy= " + \
                          "{:.3f}".format(acc))
            print("Testing Accuracy:", \
            sess.run(accuracy, feed_dict={X: X_test_sel,
                                          Y: Y_bin_test}))
        print("Optimization Finished!")

        # Calculate accuracy for MNIST test images
        print("Testing Accuracy:", \
            sess.run(accuracy, feed_dict={X: X_test_sel,
                                          Y: Y_bin_test}))
        test_pred= sess.run(pred_tf, feed_dict={X: X_test_sel,Y: Y_bin_test})
        tr_pred= sess.run(pred_tf, feed_dict={X: X_train_sel,Y: Y_bin_train})
        precision, recall, fscore, support = score(yvalid, test_pred)
        tac=accuracy_score(yvalid, test_pred)
        #print("iter "+str(sel_col))
        #print('precision: {}'.format(precision))
        #print('recall: {}'.format(recall))
        #print('fscore: {}'.format(fscore))
        #print('support: {}'.format(support))
        col_name = xtrain.columns[sel_col]
        col_metrics = [col_name] +[tac] + list(recall) + list(precision)+list(fscore)
        all_metrics.append(col_metrics)
        #all_metrics.append([0,0])

Step 1, Minibatch Loss= 0.8644, Training Accuracy= 0.818
Testing Accuracy: 0.8105533
Step 2, Minibatch Loss= 0.6366, Training Accuracy= 0.888
Testing Accuracy: 0.8801432
Step 3, Minibatch Loss= 0.5771, Training Accuracy= 0.902
Testing Accuracy: 0.892237
Step 4, Minibatch Loss= 0.5371, Training Accuracy= 0.909
Testing Accuracy: 0.8989257
Step 5, Minibatch Loss= 0.5062, Training Accuracy= 0.911
Testing Accuracy: 0.9012905
Step 6, Minibatch Loss= 0.4820, Training Accuracy= 0.913
Testing Accuracy: 0.9028444
Step 7, Minibatch Loss= 0.4658, Training Accuracy= 0.915
Testing Accuracy: 0.90473616
Step 8, Minibatch Loss= 0.4539, Training Accuracy= 0.916
Testing Accuracy: 0.9056145
Step 9, Minibatch Loss= 0.4448, Training Accuracy= 0.920
Testing Accuracy: 0.9091953
Step 10, Minibatch Loss= 0.4381, Training Accuracy= 0.924
Testing Accuracy: 0.914803
Step 11, Minibatch Loss= 0.4337, Training Accuracy= 0.929
Testing Accuracy: 0.91885686
Step 12, Minibatch Loss= 0.4310, Training Accuracy= 0.930
Testi

Step 23, Minibatch Loss= 0.4208, Training Accuracy= 0.934
Testing Accuracy: 0.92649144
Step 24, Minibatch Loss= 0.4110, Training Accuracy= 0.931
Testing Accuracy: 0.92284304
Optimization Finished!
Testing Accuracy: 0.92284304
Step 1, Minibatch Loss= 0.8014, Training Accuracy= 0.913
Testing Accuracy: 0.9085873
Step 2, Minibatch Loss= 0.7666, Training Accuracy= 0.914
Testing Accuracy: 0.9085873
Step 3, Minibatch Loss= 0.7402, Training Accuracy= 0.914
Testing Accuracy: 0.90892506
Step 4, Minibatch Loss= 0.6844, Training Accuracy= 0.914
Testing Accuracy: 0.9091953
Step 5, Minibatch Loss= 0.6192, Training Accuracy= 0.915
Testing Accuracy: 0.9098034
Step 6, Minibatch Loss= 0.5726, Training Accuracy= 0.915
Testing Accuracy: 0.90953314
Step 7, Minibatch Loss= 0.5471, Training Accuracy= 0.914
Testing Accuracy: 0.9081143
Step 8, Minibatch Loss= 0.5272, Training Accuracy= 0.913
Testing Accuracy: 0.90845215
Step 9, Minibatch Loss= 0.5095, Training Accuracy= 0.918
Testing Accuracy: 0.9132491
Step 1

  'precision', 'predicted', average, warn_for)


Step 1, Minibatch Loss= 0.8472, Training Accuracy= 0.878
Testing Accuracy: 0.87190056
Step 2, Minibatch Loss= 0.7939, Training Accuracy= 0.909
Testing Accuracy: 0.90399295
Step 3, Minibatch Loss= 0.7637, Training Accuracy= 0.914
Testing Accuracy: 0.9087224
Step 4, Minibatch Loss= 0.7392, Training Accuracy= 0.915
Testing Accuracy: 0.9099385
Step 5, Minibatch Loss= 0.7137, Training Accuracy= 0.916
Testing Accuracy: 0.9118303
Step 6, Minibatch Loss= 0.6969, Training Accuracy= 0.917
Testing Accuracy: 0.9123708
Step 7, Minibatch Loss= 0.6770, Training Accuracy= 0.917
Testing Accuracy: 0.9096007
Step 8, Minibatch Loss= 0.6638, Training Accuracy= 0.921
Testing Accuracy: 0.9141274
Step 9, Minibatch Loss= 0.6513, Training Accuracy= 0.924
Testing Accuracy: 0.9158165
Step 10, Minibatch Loss= 0.6436, Training Accuracy= 0.925
Testing Accuracy: 0.9178434
Step 11, Minibatch Loss= 0.6317, Training Accuracy= 0.924
Testing Accuracy: 0.9147355
Step 12, Minibatch Loss= 0.6233, Training Accuracy= 0.927
Tes

Step 23, Minibatch Loss= 0.4312, Training Accuracy= 0.931
Testing Accuracy: 0.9231809
Step 24, Minibatch Loss= 0.4252, Training Accuracy= 0.930
Testing Accuracy: 0.92304575
Optimization Finished!
Testing Accuracy: 0.92304575
Step 1, Minibatch Loss= 0.9249, Training Accuracy= 0.907
Testing Accuracy: 0.90297955
Step 2, Minibatch Loss= 0.7621, Training Accuracy= 0.915
Testing Accuracy: 0.910952
Step 3, Minibatch Loss= 0.6886, Training Accuracy= 0.916
Testing Accuracy: 0.9110195
Step 4, Minibatch Loss= 0.6486, Training Accuracy= 0.916
Testing Accuracy: 0.9112222
Step 5, Minibatch Loss= 0.6180, Training Accuracy= 0.916
Testing Accuracy: 0.91156006
Step 6, Minibatch Loss= 0.5921, Training Accuracy= 0.916
Testing Accuracy: 0.91156006
Step 7, Minibatch Loss= 0.5766, Training Accuracy= 0.916
Testing Accuracy: 0.9116276
Step 8, Minibatch Loss= 0.5650, Training Accuracy= 0.916
Testing Accuracy: 0.9116276
Step 9, Minibatch Loss= 0.5568, Training Accuracy= 0.916
Testing Accuracy: 0.9116276
Step 10,

Step 20, Minibatch Loss= 0.4131, Training Accuracy= 0.941
Testing Accuracy: 0.9331126
Step 21, Minibatch Loss= 0.4143, Training Accuracy= 0.941
Testing Accuracy: 0.933518
Step 22, Minibatch Loss= 0.4158, Training Accuracy= 0.941
Testing Accuracy: 0.93263966
Step 23, Minibatch Loss= 0.4118, Training Accuracy= 0.942
Testing Accuracy: 0.93378824
Step 24, Minibatch Loss= 0.4108, Training Accuracy= 0.941
Testing Accuracy: 0.93399096
Optimization Finished!
Testing Accuracy: 0.93399096
Step 1, Minibatch Loss= 0.8667, Training Accuracy= 0.906
Testing Accuracy: 0.9023039
Step 2, Minibatch Loss= 0.8039, Training Accuracy= 0.915
Testing Accuracy: 0.91074926
Step 3, Minibatch Loss= 0.7947, Training Accuracy= 0.916
Testing Accuracy: 0.9108844
Step 4, Minibatch Loss= 0.7736, Training Accuracy= 0.916
Testing Accuracy: 0.9116276
Step 5, Minibatch Loss= 0.7183, Training Accuracy= 0.916
Testing Accuracy: 0.9119654
Step 6, Minibatch Loss= 0.6313, Training Accuracy= 0.917
Testing Accuracy: 0.91149247
Step

Step 17, Minibatch Loss= 0.4331, Training Accuracy= 0.932
Testing Accuracy: 0.9237889
Step 18, Minibatch Loss= 0.4346, Training Accuracy= 0.934
Testing Accuracy: 0.9258834
Step 19, Minibatch Loss= 0.4271, Training Accuracy= 0.936
Testing Accuracy: 0.9267617
Step 20, Minibatch Loss= 0.4244, Training Accuracy= 0.937
Testing Accuracy: 0.9277076
Step 21, Minibatch Loss= 0.4198, Training Accuracy= 0.937
Testing Accuracy: 0.92743737
Step 22, Minibatch Loss= 0.4196, Training Accuracy= 0.938
Testing Accuracy: 0.92797786
Step 23, Minibatch Loss= 0.4216, Training Accuracy= 0.938
Testing Accuracy: 0.92845076
Step 24, Minibatch Loss= 0.4211, Training Accuracy= 0.938
Testing Accuracy: 0.9282481
Optimization Finished!
Testing Accuracy: 0.9282481
Step 1, Minibatch Loss= 0.8187, Training Accuracy= 0.916
Testing Accuracy: 0.9116276
Step 2, Minibatch Loss= 0.7802, Training Accuracy= 0.916
Testing Accuracy: 0.9116276
Step 3, Minibatch Loss= 0.7141, Training Accuracy= 0.916
Testing Accuracy: 0.9116276
Ste

Step 14, Minibatch Loss= 0.4550, Training Accuracy= 0.941
Testing Accuracy: 0.93345046
Step 15, Minibatch Loss= 0.4489, Training Accuracy= 0.942
Testing Accuracy: 0.93459904
Step 16, Minibatch Loss= 0.4458, Training Accuracy= 0.941
Testing Accuracy: 0.9354098
Step 17, Minibatch Loss= 0.4417, Training Accuracy= 0.942
Testing Accuracy: 0.9355449
Step 18, Minibatch Loss= 0.4382, Training Accuracy= 0.942
Testing Accuracy: 0.93568003
Step 19, Minibatch Loss= 0.4381, Training Accuracy= 0.942
Testing Accuracy: 0.93615294
Step 20, Minibatch Loss= 0.4331, Training Accuracy= 0.942
Testing Accuracy: 0.9354098
Step 21, Minibatch Loss= 0.4355, Training Accuracy= 0.943
Testing Accuracy: 0.9362881
Step 22, Minibatch Loss= 0.4297, Training Accuracy= 0.943
Testing Accuracy: 0.93703127
Step 23, Minibatch Loss= 0.4262, Training Accuracy= 0.943
Testing Accuracy: 0.93750423
Step 24, Minibatch Loss= 0.4258, Training Accuracy= 0.943
Testing Accuracy: 0.9373691
Optimization Finished!
Testing Accuracy: 0.93736

Step 11, Minibatch Loss= 0.4476, Training Accuracy= 0.934
Testing Accuracy: 0.92959934
Step 12, Minibatch Loss= 0.4424, Training Accuracy= 0.935
Testing Accuracy: 0.9293291
Step 13, Minibatch Loss= 0.4354, Training Accuracy= 0.936
Testing Accuracy: 0.9304777
Step 14, Minibatch Loss= 0.4321, Training Accuracy= 0.936
Testing Accuracy: 0.9298696
Step 15, Minibatch Loss= 0.4294, Training Accuracy= 0.934
Testing Accuracy: 0.92838323
Step 16, Minibatch Loss= 0.4261, Training Accuracy= 0.935
Testing Accuracy: 0.92831564
Step 17, Minibatch Loss= 0.4259, Training Accuracy= 0.933
Testing Accuracy: 0.92655903
Step 18, Minibatch Loss= 0.4219, Training Accuracy= 0.934
Testing Accuracy: 0.92716706
Step 19, Minibatch Loss= 0.4193, Training Accuracy= 0.934
Testing Accuracy: 0.92777514
Step 20, Minibatch Loss= 0.4181, Training Accuracy= 0.934
Testing Accuracy: 0.9275725
Step 21, Minibatch Loss= 0.4173, Training Accuracy= 0.934
Testing Accuracy: 0.9273698
Step 22, Minibatch Loss= 0.4165, Training Accura

In [81]:
df_met=pd.DataFrame(all_metrics,columns=['C_name','Accuracy','Notpromoted_rec','Promoted_rec','notpromoted_prec','promoted_prec','fscore_nonprom','fscore_prom'])

In [82]:
t=df_met.sort_values('Promoted_rec',ascending=True)
t[['C_name','Accuracy','Promoted_rec','promoted_prec','fscore_prom','notpromoted_prec','Notpromoted_rec','fscore_nonprom']]

Unnamed: 0,C_name,Accuracy,Promoted_rec,promoted_prec,fscore_prom,notpromoted_prec,Notpromoted_rec,fscore_nonprom
5,length_of_service,0.911628,0.0,0.0,0.0,0.911628,1.0,0.953771
8,avg_training_score,0.891832,0.196483,0.318463,0.243026,0.924896,0.959238,0.941754
21,education_Master's & above,0.937369,0.31422,0.931973,0.469983,0.937535,0.997777,0.966718
20,education_Below Secondary,0.936153,0.324159,0.874227,0.47295,0.938251,0.995479,0.966018
22,gender_f,0.93595,0.326453,0.864372,0.473918,0.938422,0.995034,0.965899
26,recruitment_channel_sourcing,0.935545,0.329511,0.848425,0.47467,0.938641,0.994293,0.965666
14,department_Procurement,0.936558,0.330275,0.872727,0.479201,0.938767,0.995331,0.966222
1,region,0.934194,0.33104,0.81391,0.470652,0.938678,0.992663,0.964916
7,awards_won?,0.932437,0.332569,0.774021,0.465241,0.93869,0.990588,0.963941
13,department_Operations,0.933991,0.334862,0.80367,0.472747,0.938973,0.99207,0.964792


In [None]:
#df_met.to_csv('../col_wise_res/col_wise_1_4high_v1.csv',index=False)

In [None]:
import old

In [None]:
precision, recall, fscore, support = score(ytrain, tr_pred)
accuracy = accuracy_score(ytrain,tr_pred)
print('Accuracy: {}'.format(accuracy))

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
#print('support: {}'.format(support))

In [None]:
precision, recall, fscore, support = score(yvalid, test_pred)
accuracy = accuracy_score(yvalid,test_pred)
print('Accuracy: {}'.format(accuracy))

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
#print('support: {}'.format(support))

In [None]:
sub_data= {'employee_id':xtestproc['employee_id'],'is_promoted':sub_pred}

In [None]:
res = pd.DataFrame(sub_data)

In [None]:
res.to_csv("../results/pred3normal_nn.csv",index=False)

In [None]:
import stop

In [None]:
xtrain.isnull().sum().sum()

In [None]:
#rf = RandomForestClassifier(n_estimators = 20, random_state = 0,max_depth=None,min_samples_split=4)

In [None]:
#rf = DecisionTreeClassifier(criterion = "gini", max_depth=None,min_samples_split=4,random_state = 0)

In [None]:
rf = svm.SVC(kernel='linear', C=500, gamma=0.001)

In [None]:
rf.fit(xtrain,ytrain)

In [None]:
tr_pred = rf.predict(xtrain)

In [None]:
valid_pred = rf.predict(xvalid)

In [None]:
precision, recall, fscore, support = score(ytrain, tr_pred)
accuracy = accuracy_score(ytrain,tr_pred)
print('Accuracy: {}'.format(accuracy))

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
#print('support: {}'.format(support))

In [None]:
precision, recall, fscore, support = score(yvalid, valid_pred)
accuracy = accuracy_score(yvalid,valid_pred)
print('Accuracy: {}'.format(accuracy))

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
#print('support: {}'.format(support))

In [None]:
tr_data['is_promoted'].value_counts()

In [None]:
tr_data['department'].unique()

In [None]:
ts_data.shape

In [None]:
l1=ts_data['employee_id'].values
l2=tr_data['employee_id'].values
common=np.intersect1d(l1,l2)

In [None]:
common

In [None]:
np.sort(l1)

In [None]:
np.sort(l2)