In [1]:
#library imports
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
#function and model imports
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score

In [3]:
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [4]:
def df_shuffle(inp):
    inp=shuffle(inp)
    inp=inp.reset_index(drop=True)
    return inp

In [5]:
train_data=pd.read_csv('../Data/train_LZdllcl.csv')
test_data=pd.read_csv('../Data/test_2umaH9m.csv')
print(train_data.shape)
print(test_data.shape)
#tr_data

(54808, 14)
(23490, 13)


In [6]:
train_data=df_shuffle(train_data)

In [7]:
t=train_data.isnull().sum()
t[t>0]

education               2409
previous_year_rating    4124
dtype: int64

In [8]:
def process_categorical(train,test):    #use this after removing target variable
    fulldata=pd.concat([train,test],axis=0)
    fulldata=fulldata.fillna(-1)
    trainend=len(train)
    onecoded=pd.get_dummies(fulldata)
    return (onecoded[:trainend],onecoded[trainend:])

In [9]:
#drop target variable for preprocessing
ytrainfull=train_data['is_promoted']
X_train=train_data.drop('is_promoted',axis=1)

In [10]:
xtrainfull,xtestproc=process_categorical(X_train,test_data)

In [11]:
#sample train data to test on itself
end = 54000 
xtrain = xtrainfull[:end]
ytrain = ytrainfull[:end]

xvalid = xtrainfull[end:54801] 
yvalid = ytrainfull[end:54801]
xvalid=xvalid.reset_index(drop=True)
yvalid=yvalid.reset_index(drop=True)

In [12]:
xtrain.shape

(54000, 60)

In [13]:
X_train_std=scale(xtrain)
X_test_std=scale(xvalid)
X_sub_std=scale(xtestproc)

In [14]:
Y_bin_train = pd.get_dummies(ytrain)
Y_bin_test = pd.get_dummies(yvalid)

In [52]:
# Parameters
learning_rate =0.02 #0.02
#num_steps = 500
batch_size = 128#128
#display_step = 100

# Network Parameters
n_hidden_1 = 40#36 # 1st layer number of neurons
n_hidden_2 = 30#24 # 2nd layer number of neurons
n_hidden_3 = 15#12
num_input = 60 # MNIST data input (img shape: 28*28)
num_classes = 2 # MNIST total classes (0-9 digits)

# tf Graph input
X = tf.placeholder("float", [None, num_input])
Y = tf.placeholder("float", [None, num_classes])

In [53]:
# Store layers weight & bias
inz = tf.contrib.layers.xavier_initializer()
weights = {
    'h1': tf.Variable(tf.random_normal([num_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'h3': tf.Variable(tf.random_normal([n_hidden_2, n_hidden_3])),
    'out': tf.Variable(tf.random_normal([n_hidden_3, num_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'b3': tf.Variable(tf.random_normal([n_hidden_3])),
    'out': tf.Variable(tf.random_normal([num_classes]))
}

In [54]:
# Create model
def neural_net(x):
    # Hidden fully connected layer with 256 neurons
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    # Hidden fully connected layer with 256 neurons
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    layer_2 = tf.nn.relu(layer_2)
    #layer_2 = tf.nn.dropout(layer_2, 0.9)
    # Output fully connected layer with a neuron for each class
    layer_3 = tf.add(tf.matmul(layer_2, weights['h3']), biases['b3'])
    layer_3 = tf.nn.relu(layer_3) 
    
    out_layer = tf.matmul(layer_3, weights['out']) + biases['out']
    return out_layer

In [55]:
# Construct model
logits = neural_net(X)


class_weights = tf.constant([[1.0, 12.0]])
# Define loss and optimizer
#loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
#    logits=logits, labels=Y))
loss_op = tf.reduce_mean(tf.nn.weighted_cross_entropy_with_logits(
    logits=logits, targets=Y,pos_weight=class_weights))


# your class weights
#class_weights = tf.constant([[1.0, 3.0]])
# deduce weights for batch samples based on their true label
#weights_imb = tf.reduce_sum(class_weights * Y, axis=1)
# compute your (unweighted) softmax cross entropy loss
#unweighted_losses = tf.nn.softmax_cross_entropy_with_logits(logits= logits,labels=Y)
# apply the weights, relying on broadcasting of the multiplication
#weighted_losses = unweighted_losses * weights_imb
# reduce the result to get your final loss
#loss_op = tf.reduce_mean(weighted_losses)





optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

# Evaluate model (with test logits, for dropout to be disabled)
correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
pred_tf=tf.argmax(logits, 1)

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

In [56]:
# Start training

no_iterations = int(end/batch_size)-2
print(no_iterations)

with tf.Session() as sess:

    # Run the initializer
    sess.run(init)

    for step in range(1, 30):
        for no_batch in range(0,no_iterations):
            
            batch_x = X_train_std[(no_batch*batch_size) :((no_batch+1)*batch_size) ]
            batch_y = Y_bin_train[(no_batch*batch_size) :((no_batch+1)*batch_size) ]
            # Run optimization op (backprop)
            sess.run(train_op, feed_dict={X: batch_x, Y: batch_y})
            #if step % display_step == 0 or step == 1:
                # Calculate batch loss and accuracy
        loss, acc = sess.run([loss_op, accuracy], feed_dict={X: X_train_std,Y: Y_bin_train})
        print("Step " + str(step) + ", Minibatch Loss= " + \
                      "{:.4f}".format(loss) + ", Training Accuracy= " + \
                      "{:.3f}".format(acc))
        print("Testing Accuracy:", \
        sess.run(accuracy, feed_dict={X: X_test_std,
                                      Y: Y_bin_test}))
    print("Optimization Finished!")

    # Calculate accuracy for MNIST test images
    print("Testing Accuracy:", \
        sess.run(accuracy, feed_dict={X: X_test_std,
                                      Y: Y_bin_test}))
    test_pred= sess.run(pred_tf, feed_dict={X: X_test_std,Y: Y_bin_test})
    tr_pred= sess.run(pred_tf, feed_dict={X: X_train_std,Y: Y_bin_test})
    sub_pred= sess.run(pred_tf, feed_dict={X: X_sub_std})

419
Step 1, Minibatch Loss= 0.7946, Training Accuracy= 0.913
Testing Accuracy: 0.9101124
Step 2, Minibatch Loss= 0.7357, Training Accuracy= 0.914
Testing Accuracy: 0.9088639
Step 3, Minibatch Loss= 0.6725, Training Accuracy= 0.914
Testing Accuracy: 0.9113608
Step 4, Minibatch Loss= 0.6200, Training Accuracy= 0.914
Testing Accuracy: 0.9101124
Step 5, Minibatch Loss= 0.5776, Training Accuracy= 0.914
Testing Accuracy: 0.9101124
Step 6, Minibatch Loss= 0.5571, Training Accuracy= 0.914
Testing Accuracy: 0.9113608
Step 7, Minibatch Loss= 0.5413, Training Accuracy= 0.915
Testing Accuracy: 0.9126092
Step 8, Minibatch Loss= 0.5325, Training Accuracy= 0.911
Testing Accuracy: 0.91635454
Step 9, Minibatch Loss= 0.4870, Training Accuracy= 0.930
Testing Accuracy: 0.9238452
Step 10, Minibatch Loss= 0.4630, Training Accuracy= 0.936
Testing Accuracy: 0.9238452
Step 11, Minibatch Loss= 0.4499, Training Accuracy= 0.937
Testing Accuracy: 0.92509365
Step 12, Minibatch Loss= 0.4518, Training Accuracy= 0.938

In [57]:
precision, recall, fscore, support = score(ytrain, tr_pred)
accuracy = accuracy_score(ytrain,tr_pred)
print('Accuracy: {}'.format(accuracy))

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
#print('support: {}'.format(support))

Accuracy: 0.9430740740740741
precision: [0.94356893 0.92853123]
recall: [0.99742931 0.35892974]
fscore: [0.96975184 0.51772827]


In [58]:
precision, recall, fscore, support = score(yvalid, test_pred)
accuracy = accuracy_score(yvalid,test_pred)
print('Accuracy: {}'.format(accuracy))

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
#print('support: {}'.format(support))

Accuracy: 0.9388264669163545
precision: [0.94056848 0.88888889]
recall: [0.99589603 0.34285714]
fscore: [0.96744186 0.49484536]


In [44]:
sub_data= {'employee_id':xtestproc['employee_id'],'is_promoted':sub_pred}

In [45]:
res = pd.DataFrame(sub_data)

In [46]:
res.to_csv("../results/pred3normal_nn.csv",index=False)

In [25]:
import stop

ModuleNotFoundError: No module named 'stop'

In [None]:
xtrain.isnull().sum().sum()

In [None]:
#rf = RandomForestClassifier(n_estimators = 20, random_state = 0,max_depth=None,min_samples_split=4)

In [None]:
#rf = DecisionTreeClassifier(criterion = "gini", max_depth=None,min_samples_split=4,random_state = 0)

In [None]:
rf = svm.SVC(kernel='linear', C=500, gamma=0.001)

In [None]:
rf.fit(xtrain,ytrain)

In [None]:
tr_pred = rf.predict(xtrain)

In [None]:
valid_pred = rf.predict(xvalid)

In [None]:
precision, recall, fscore, support = score(ytrain, tr_pred)
accuracy = accuracy_score(ytrain,tr_pred)
print('Accuracy: {}'.format(accuracy))

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
#print('support: {}'.format(support))

In [None]:
precision, recall, fscore, support = score(yvalid, valid_pred)
accuracy = accuracy_score(yvalid,valid_pred)
print('Accuracy: {}'.format(accuracy))

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
#print('support: {}'.format(support))

In [None]:
tr_data['is_promoted'].value_counts()

In [None]:
tr_data['department'].unique()

In [None]:
ts_data.shape

In [None]:
l1=ts_data['employee_id'].values
l2=tr_data['employee_id'].values
common=np.intersect1d(l1,l2)

In [None]:
common

In [None]:
np.sort(l1)

In [None]:
np.sort(l2)