In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
# Load the HR dataset; the path is relative to the notebook's working directory.
employees = pd.read_csv('HR_comma_sep.csv')
# Preview the first rows of the frame.
employees.head()

In [None]:
# Replace the 'salary' and 'sales' columns with integer codes.
# pd.factorize numbers the distinct values in order of first appearance,
# so the codes carry no guaranteed ordering.
employees = employees.assign(
    salary=pd.factorize(employees['salary'])[0],
    sales=pd.factorize(employees['sales'])[0],
)

In [None]:
# Column-wise means. For 'salary' and 'sales' these are means of the integer
# codes assigned by pd.factorize above, not of the original categories.
employees.mean()

In [None]:
# Pairwise Pearson correlation between all columns of `employees`.
correlation_matrix = employees.corr(method='pearson')
# Display the matrix as the cell output.
correlation_matrix 

In [None]:
# Draw the correlation matrix as a heatmap.
# seaborn is imported under the alias `maps`; later plotting cells rely on that name.
import seaborn as maps
# vmax caps the colour scale at 0.8; square=True draws square cells.
maps.heatmap(correlation_matrix, vmax = .8, square = True)
plt.show()

In [None]:
# Correlation of every other column with 'left', largest value first,
# kept as a one-column DataFrame for display.
corr_left = (
    correlation_matrix['left']
    .drop('left')                  # remove the self-correlation entry
    .sort_values(ascending=False)
    .to_frame()
)
corr_left

In [None]:
# Bar plot of 'left' grouped by the integer-coded 'salary' column.
maps.barplot(x = 'salary', y = 'left', data = employees)

In [None]:
# Density plots of three features, one figure per feature,
# with a separate curve for each value of 'left'.

plot_features = ['last_evaluation', 'satisfaction_level', 'time_spend_company']

for feature in plot_features:
    upper_limit = employees[feature].max()
    grid = maps.FacetGrid(employees, hue="left")
    grid.map(maps.kdeplot, feature, shade=True)
    # Clip the x axis to the observed range, starting at zero.
    grid.set(xlim=(0, upper_limit))
    grid.add_legend()

In [None]:
# Separate the target ('left') from the predictor columns.
labels = np.where(employees['left'] == 1, 1, 0)
feature_frame = employees.drop('left', axis=1)
# `.values` and the builtin `float` replace DataFrame.as_matrix() and np.float,
# which recent pandas / NumPy releases no longer provide.
features = feature_frame.values.astype(float)
label_names = np.unique(labels)
# Names aligned one-to-one with the columns of `features`; the target column
# is excluded so positions match what the classifiers are trained on.
feature_names = list(feature_frame.columns)

# Hold-out split used by the modelling cells below.
# NOTE(review): later cells use X_train / X_test / Y_train / Y_test but no cell
# in this notebook creates them; the 80/20 split and the seed are a choice made
# here so the notebook runs from a fresh kernel. Adjust if the original analysis
# used different values.
X_train, X_test, Y_train, Y_test = train_test_split(
    features, labels, test_size=0.2, random_state=0)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn import linear_model
from sklearn.metrics import accuracy_score
# cross_val_score comes from model_selection, like cross_val_predict, instead of
# the separate sklearn.cross_validation module the cell previously imported.
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn import metrics

# [name, estimator] pairs to compare on the same train/test split.
classifiers = [["KNN", KNeighborsClassifier(n_neighbors=3, weights='distance')],
               ["DT", DecisionTreeClassifier(random_state=0)],
               ["NB", MultinomialNB()]]

# Fitted (name, estimator) pairs, collected in evaluation order.
classifier_types = []
for name, clf in classifiers:
    # Single-argument print(...) calls behave the same under Python 2 and 3.
    print('\nMetric for ' + name)
    cv_predicted = cross_val_predict(clf, X_train, Y_train, cv=5)
    print(metrics.classification_report(Y_train, cv_predicted))
    # cv=5 matches cross_val_predict above instead of relying on the library
    # default, which differs between scikit-learn releases.
    scores = cross_val_score(clf, X_train, Y_train, cv=5)
    print('\nCross validation scores:  %s' % scores.mean())

    # Refit on the full training split and score once on the held-out split.
    clf.fit(X_train, Y_train)
    predictions = clf.predict(X_test)
    print('Accuracy score for %s %s' % (name, accuracy_score(Y_test, predictions)))
    classifier_types.append((name, clf))

In [None]:
from sklearn.tree import export_graphviz
from sklearn import tree
# Shallow decision tree used for the visualisation and for the predictions in
# later cells. random_state=0 makes the fitted tree reproducible between runs,
# consistent with the DecisionTreeClassifier(random_state=0) in the comparison cell.
# NOTE(review): min_impurity_split is deprecated in newer scikit-learn releases;
# confirm the installed version still accepts it.
clf = DecisionTreeClassifier(max_depth=5, min_impurity_split=1e-02, random_state=0)
clf = clf.fit(X_train, Y_train)
# Write the fitted tree in Graphviz DOT format.
tree.export_graphviz(clf, out_file='tree.dot')

In [None]:
import pydotplus 
# With out_file=None the DOT description is returned and kept in dot_data
# instead of being written to a file.
dot_data = tree.export_graphviz(clf, out_file=None) 
# Build a graph object from the DOT text and save it as tree.pdf.
graph = pydotplus.graph_from_dot_data(dot_data) 
graph.write_pdf("tree.pdf") 

In [None]:
from IPython.display import Image
# class_names needs one string per class. str(label_names) would pass the single
# string "[0 1]", so the two classes would be labelled "[" and "0".
tree_class_names = [str(label) for label in label_names]
# The classifier is trained on the columns without the target, so 'left' must
# not appear in the names or every later feature would be shifted by one.
tree_feature_names = [name for name in feature_names if name != 'left']
dot_data = tree.export_graphviz(clf, out_file=None,
                         feature_names=tree_feature_names,
                         class_names=tree_class_names,
                         filled=True, rounded=True,
                         special_characters=True
                         )
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("coloured_tree.pdf")
# Render the same graph inline as a PNG.
Image(graph.create_png())

In [None]:
# Feature vector of one hypothetical employee (nine values).
# NOTE(review): presumably in the column order of `features`, i.e. the
# `employees` columns without 'left'; confirm against the CSV header.
test = [
    0.44,
    0.57,
    2,
    141,
    3,
    0,
    0,
    7,
    1,
]

In [None]:
# predict_proba expects a 2-D (n_samples, n_features) array, so the single
# employee becomes one row. -1 is NumPy's documented placeholder for the
# inferred dimension (previously written as -9).
predict = clf.predict_proba(np.asarray(test).reshape(1, -1))
predict

In [None]:
# Split the workforce by outcome: rows with 'left' == 1 are the employees who
# left, rows with 'left' == 0 are the ones who stayed.
left_df = employees.loc[employees['left'] == 1]
stay_df = employees.loc[employees['left'] == 0]

In [None]:
# Profile of the employees who stayed: one-row frames holding a typical value
# (retention_profile_mean) and the spread (retention_profile_std) per feature.
# Columns follow the order of `employees`, without the target 'left'.
profile_columns = [column for column in employees.columns if column != 'left']
# For these two columns the most frequent value is used instead of the mean.
mode_columns = ['Work_accident', 'promotion_last_5years']

mean_profile = stay_df[profile_columns].mean()
for column in mode_columns:
    # Series.mode() returns a Series (ties are possible); keep its first entry
    # so the profile stores a scalar rather than a whole Series.
    mean_profile[column] = stay_df[column].mode().iloc[0]

# Built in one step each, without DataFrame.append or positional drop(),
# which recent pandas releases no longer provide.
retention_profile_mean = mean_profile.to_frame().T
retention_profile_std = stay_df[profile_columns].std().to_frame().T

retention_profile_mean

In [None]:
# Mean satisfaction_level of each group, rounded to two decimals.
# The two means are stored separately; no difference is computed in this cell.
attrition_satisfaction = round(left_df['satisfaction_level'].mean(), 2)
retention_satisfaction = round(stay_df['satisfaction_level'].mean(), 2)

In [None]:
# Pearson correlation of the other columns with satisfaction_level among staff
# who left, largest value first.
# [1:] skips the first entry of the series (presumably satisfaction_level's
# correlation with itself; confirm the column order).
# print(...) with a single argument behaves the same under Python 2 and 3.
print(left_df.corr(method='pearson')['satisfaction_level'][1:].sort_values(ascending=False))

In [None]:
# Pearson correlation of the other columns with satisfaction_level among staff
# who stayed, largest value first.
# [1:] skips the first entry of the series (presumably satisfaction_level's
# correlation with itself; confirm the column order).
# print(...) with a single argument behaves the same under Python 2 and 3.
print(stay_df.corr(method='pearson')['satisfaction_level'][1:].sort_values(ascending=False))

In [None]:
# Rank the columns by the absolute strength of their Pearson correlation with
# satisfaction_level across all employees; 'left' is excluded from the ranking.
correlations2 = employees.corr(method='pearson')
satisfaction_df = (
    correlations2['satisfaction_level'][1:]   # skip the first entry of the series
    .abs()
    .sort_values(ascending=False)
    .drop('left')
)
print(satisfaction_df)

In [None]:
# Regression setup: column 0 of the classification features becomes the target
# (presumably satisfaction_level; confirm the column order) and the remaining
# columns become the predictors.
y_train_, X_train_ = X_train[:, 0], X_train[:, 1:]
y_test_, X_test_ = X_test[:, 0], X_test[:, 1:]

# Train and test parts stacked back together, used for cross-validation.
X_ = np.concatenate((X_train_, X_test_), axis=0)
y_ = np.concatenate((y_train_, y_test_))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold,cross_val_score,train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# [name, estimator] pairs compared by 10-fold cross-validated negative MSE
# (values closer to zero are better).
models = [['LR', LinearRegression()], ['DT', DecisionTreeRegressor()], ['RF', RandomForestRegressor()]]
# random_state only has an effect when shuffle=True; recent scikit-learn
# releases reject a seed without it. One splitter is shared so every model is
# scored on the same folds.
folds = KFold(n_splits=10, shuffle=True, random_state=7)
for names, model in models:
    results = cross_val_score(model, X_, y_, cv=folds,
                              scoring='neg_mean_squared_error')
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print('%s %s' % (names, results.mean()))

In [None]:
# Fit a random-forest regressor on the training part and predict the target
# for the held-out rows in X_test_.
regres = RandomForestRegressor()
regres.fit(X_train_, y_train_)
predicted_sat = regres.predict(X_test_)
# Single-argument print(...) behaves the same under Python 2 and 3.
print(predicted_sat)

In [None]:
import copy
import warnings
# Install a filter that ignores every DeprecationWarning from here on.
# NOTE(review): presumably added to hide library deprecation messages triggered
# by the cells below; confirm it is still needed.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

def genNewSamples(x, idx, xmin, xmax, num):
    """
    Build variants of one sample by sweeping a single feature over a range.

    x - 1-D sample (list or array); it is copied, never modified
    idx - position of the feature to vary
    (xmin, xmax) - half-open range [xmin, xmax) of values given to that feature
    num - number of evenly spaced values to generate (must be positive)

    Returns an array with one row per generated value; for a 1-D sample its
    shape is (num, len(x)). Every row equals x except at position idx.
    Raises ValueError if num is not positive.
    """
    if num <= 0:
        raise ValueError("num must be positive, got %r" % (num,))

    step = (xmax - xmin) * 1.0 / num
    # Multiplying a 0..num-1 counter by the step always yields exactly `num`
    # values; np.arange(xmin, xmax, step) can emit one extra value when the
    # fractional step is rounded down (e.g. 50 values for 0..1 with num=49).
    values = xmin + step * np.arange(num)

    variants = []
    for value in values:
        variant = copy.deepcopy(x)
        variant[idx] = value
        variants.append(variant)

    return np.vstack([[variant] for variant in variants])
# Check how to improve the satisfaction level of employee for those who are leaving
def improveSatisfaction(X_test_):
    """
    Suggest changes that would keep an at-risk employee from leaving.

    X_test_ - feature values of ONE employee (flat list or 1-D array), in the
              column order the classifier `clf` was trained on. Index 0 is
              replaced by a predicted satisfaction level, index 2 is treated as
              the number of projects and index 8 as the salary code.

    Relies on notebook globals: the fitted classifier `clf`, the regression
    training data `X_train_` / `y_train_`, `RandomForestRegressor` and
    `genNewSamples`. A new regressor is fitted on every call.

    Prints progress messages and returns a list of suggestion strings: a
    single "no action required" message when the predicted probability of
    leaving is at most 0.9, otherwise one entry per candidate change whose
    predicted probability of staying exceeds 0.9 (possibly none).
    """
    leavingThreshold = 0.9
    # scikit-learn estimators expect a 2-D (n_samples, n_features) array, so
    # the single employee is passed as one row rather than as a flat list.
    employee = np.asarray(X_test_, dtype=float).ravel()
    # probability of the "left" class for this employee
    leaving = clf.predict_proba(employee.reshape(1, -1))[0][1]
    suggestions = []
    # if person is leaving, try to improve the satisfaction level
    if leaving > leavingThreshold:
        # Single-argument print(...) behaves the same under Python 2 and 3.
        print('\nThe employee will leave ! -  %s' % (X_test_,))
        # Candidate profiles: sweep one feature at a time, keep the rest.
        # average_montly_hours (index 3) is not swept; an earlier draft based on
        # retention_profile_mean / retention_profile_std was left disabled.
        # np.vstack needs a real sequence, not a generator.
        x_test_rec_ = np.vstack([
            genNewSamples(employee, 2, 2, 8, 6),  # number of projects: 2..7
            genNewSamples(employee, 8, 0, 3, 3),  # salary code: 0..2
        ])

        # predict the new satisfaction level for updated sample set
        regres = RandomForestRegressor()
        regres.fit(X_train_, y_train_)
        y_test_rec_ = regres.predict(x_test_rec_[:, 1:])

        # generate new x_test with this predicted satisfaction level and
        # check if the person is leaving
        x_test_rec = np.hstack((np.asarray([y_test_rec_]).T, x_test_rec_[:, 1:]))
        # column 0 of predict_proba is the probability of staying
        y_test_rec = clf.predict_proba(x_test_rec)[:, 0]
        print('The employee will not leave for following conditions:')
        for idx, y_test in enumerate(y_test_rec):
            if y_test > leavingThreshold:
                candidate = x_test_rec[idx]
                print(candidate)
                # only an increase in the number of projects is suggested
                if candidate[2] > employee[2]:
                    text = "Increase the employee's number of projects by : "
                    text += str(candidate[2] - employee[2])
                    text += " to get satisfaction level of " + str(candidate[0]) + "\n"
                    suggestions.append(text)
                if candidate[8] != employee[8]:
                    text = "Make employee's salary : " + str(candidate[8])
                    text += " to get satisfaction level of " + str(candidate[0])
                    suggestions.append(text)
    else:
        response = "The employee will not leave! No action required. "
        suggestions.append(response)

    return suggestions

In [None]:
# Same hypothetical employee as in the earlier prediction cell.
test = [0.44, 0.57, 2, 141, 3, 0, 0, 7, 1]

# Single-argument print(...) behaves the same under Python 2 and 3.
print(improveSatisfaction(test))