### Exercise
Download employee retention dataset from here: https://www.kaggle.com/giripujar/hr-analytics.

1- Now do some exploratory data analysis to figure out which variables have direct and clear impact on employee retention (i.e. whether they leave the company or continue to work) <br>
2 - Plot bar charts showing impact of employee salaries on retention<br>
3 - Plot bar charts showing corelation between department and employee retention<br>
4 - Now build logistic regression model using variables that were narrowed down in step 1<br>
5 -Measure the accuracy of the model

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
%matplotlib notebook
import joblib
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

#### Exploaring the data

In [2]:
df=pd.read_csv("HR_comma_sep.csv")
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,1,0,support,low
14995,0.37,0.48,2,160,3,0,1,0,support,low
14996,0.37,0.53,2,143,3,0,1,0,support,low
14997,0.11,0.96,6,280,4,0,1,0,support,low


In [3]:
df.shape

(14999, 10)

In [4]:
df.columns[df.isna().any()]

Index([], dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [6]:
df.time_spend_company.unique()

array([ 3,  6,  4,  5,  2,  8, 10,  7], dtype=int64)

In [7]:
df.Department.unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [8]:
df.groupby("Department").sum()

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
IT,758.46,879.55,4683,248119,4256,164,273,3
RandD,487.8,560.44,3033,158030,2650,134,121,27
accounting,446.51,550.49,2934,154292,2702,96,204,14
hr,442.52,523.84,2701,146828,2480,89,215,15
management,391.45,456.12,2432,126787,2711,103,91,69
marketing,530.76,614.23,3164,171073,3063,138,203,43
product_mng,558.91,644.71,3434,180369,3135,132,198,0
sales,2543.81,2938.23,15634,831773,14631,587,1014,100
support,1378.19,1611.81,8479,447490,7563,345,555,20
technical,1653.48,1961.39,10548,550793,9279,381,697,28


In [9]:
df.groupby('left').mean()

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
left,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.66681,0.715473,3.786664,199.060203,3.380032,0.175009,0.026251
1,0.440098,0.718113,3.855503,207.41921,3.876505,0.047326,0.005321


In [10]:
df.groupby(["left","salary","Department"]).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
left,salary,Department,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,high,IT,0.99,0.97,6,275,6,1,0
0,high,RandD,0.97,0.95,6,287,8,1,1
0,high,accounting,0.97,1.0,6,277,8,1,1
0,high,hr,0.99,0.99,6,280,6,1,1
0,high,management,0.98,1.0,6,286,10,1,1
0,high,marketing,1.0,1.0,6,286,10,1,1
0,high,product_mng,0.99,0.98,6,278,10,1,0
0,high,sales,1.0,0.99,6,286,10,1,1
0,high,support,0.99,1.0,6,286,10,1,0
0,high,technical,1.0,1.0,6,284,10,1,1


In [11]:
df.groupby(["left","salary","Department"]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
left,salary,Department,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,high,IT,50.64,56.62,307,15352,239,4,0
0,high,RandD,28.53,33.32,181,9503,167,9,1
0,high,accounting,43.83,50.71,275,14387,222,15,6
0,high,hr,28.7,29.52,151,8077,110,4,2
0,high,management,146.58,160.09,844,44797,1158,36,45
0,high,marketing,44.61,48.38,256,13568,254,13,5
0,high,product_mng,39.8,41.48,232,12095,226,13,0
0,high,sales,168.31,177.43,971,50692,895,37,12
0,high,support,88.1,95.17,506,27350,423,31,0
0,high,technical,113.82,125.72,654,35581,572,30,1


In [12]:
df.groupby(["left","salary","Department"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
left,salary,Department,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,high,IT,0.641013,0.716709,3.886076,194.329114,3.025316,0.050633,0.0
0,high,RandD,0.607021,0.708936,3.851064,202.191489,3.553191,0.191489,0.021277
0,high,accounting,0.635217,0.734928,3.985507,208.507246,3.217391,0.217391,0.086957
0,high,hr,0.735897,0.756923,3.871795,207.102564,2.820513,0.102564,0.051282
0,high,management,0.654375,0.714688,3.767857,199.986607,5.169643,0.160714,0.200893
0,high,marketing,0.62831,0.681408,3.605634,191.098592,3.577465,0.183099,0.070423
0,high,product_mng,0.641935,0.669032,3.741935,195.080645,3.645161,0.209677,0.0
0,high,sales,0.660039,0.695804,3.807843,198.792157,3.509804,0.145098,0.047059
0,high,support,0.662406,0.715564,3.804511,205.639098,3.180451,0.233083,0.0
0,high,technical,0.646705,0.714318,3.715909,202.164773,3.25,0.170455,0.005682


In [13]:
df.groupby("left").mean()

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
left,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.66681,0.715473,3.786664,199.060203,3.380032,0.175009,0.026251
1,0.440098,0.718113,3.855503,207.41921,3.876505,0.047326,0.005321


from the analysis can draw the conclusion:<br><br>
**Satisfaction Level**: Satisfaction level seems to be relatively low (0.44) in employees leaving the firm vs the retained ones (0.66) and got about 40% correlation with left employers<br>
**Average Monthly Hours**: Average monthly hours are higher in employees leaving the firm (199 vs 207)<br>
**Promotion Last 5 Years**: Employees who are given promotion are likely to be retained at firm<br>
**time_spend_company**: 15% correlated with left column, time spend (3.3) average hours employess working leaving the firm vs the retained ones (3.8) hours<br>
**work_accident**: with mean 18 who didn't leave the company and with only 0.04 mean who left the company so work accident defenitly not have direct or clear impact with left column because many pepole have work accident and didn't leave the firm<br>
**last_evaluation**: last_evaluation (0.71) in employees leaving the firm vs the retained ones (0.71) approximatly equal and approximatly 0% correlated with left column so there is no direct impact with left column<br>
**number of projects**: number of projects (3.7) in employees leaving the firm vs the retained ones (3.8) very close so there is no direct impact with left column


#### Visualization

In [14]:
pd.crosstab(df.salary , df.left).plot(kind="bar")

<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='salary'>

In [15]:
pd.crosstab(df.Department ,df.left).plot(kind="bar")

<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='Department'>

From above chart there seem to be some impact of department on employee retention but it is not major hence we will ignore department in our analysis<br>

From the data analysis so far we can conclude that we will use following variables as independant variables in our model:<br>
**Satisfaction_Level**<br>
**Average_Monthly_Hours**<br>
**Promotion_Last_5_Years**<br>
**Salary**<br>
**time_spend_company**

#### Assigning important Features

In [16]:
new= df.drop(["left","Work_accident"], axis=1)
new

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,sales,low
1,0.80,0.86,5,262,6,0,sales,medium
2,0.11,0.88,7,272,4,0,sales,medium
3,0.72,0.87,5,223,5,0,sales,low
4,0.37,0.52,2,159,3,0,sales,low
...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,support,low
14995,0.37,0.48,2,160,3,0,support,low
14996,0.37,0.53,2,143,3,0,support,low
14997,0.11,0.96,6,280,4,0,support,low


#### apply one hot encoder to salary column

In [17]:
dummy=pd.get_dummies(df.salary)
dummy

Unnamed: 0,high,low,medium
0,0,1,0
1,0,0,1
2,0,0,1
3,0,1,0
4,0,1,0
...,...,...,...
14994,0,1,0
14995,0,1,0
14996,0,1,0
14997,0,1,0


#### drop salary and one dummy column to avoid trapping in dummy variables trap

In [18]:
semi_final=pd.concat([new,dummy],axis=1)
semi_final

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,promotion_last_5years,Department,salary,high,low,medium
0,0.38,0.53,2,157,3,0,sales,low,0,1,0
1,0.80,0.86,5,262,6,0,sales,medium,0,0,1
2,0.11,0.88,7,272,4,0,sales,medium,0,0,1
3,0.72,0.87,5,223,5,0,sales,low,0,1,0
4,0.37,0.52,2,159,3,0,sales,low,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,support,low,0,1,0
14995,0.37,0.48,2,160,3,0,support,low,0,1,0
14996,0.37,0.53,2,143,3,0,support,low,0,1,0
14997,0.11,0.96,6,280,4,0,support,low,0,1,0


In [19]:
final=semi_final.drop(["salary","medium"], axis=1)
final

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,promotion_last_5years,Department,high,low
0,0.38,0.53,2,157,3,0,sales,0,1
1,0.80,0.86,5,262,6,0,sales,0,0
2,0.11,0.88,7,272,4,0,sales,0,0
3,0.72,0.87,5,223,5,0,sales,0,1
4,0.37,0.52,2,159,3,0,sales,0,1
...,...,...,...,...,...,...,...,...,...
14994,0.40,0.57,2,151,3,0,support,0,1
14995,0.37,0.48,2,160,3,0,support,0,1
14996,0.37,0.53,2,143,3,0,support,0,1
14997,0.11,0.96,6,280,4,0,support,0,1


In [20]:
final.Department.unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [21]:
final.replace(to_replace=["sales","accounting","hr","technical","support","management","IT","product_mng","marketing","RandD"], value=[0,1,2,3,4,5,6,7,8,9],inplace=True)

#### Assigning Features and target columns

In [22]:
x=final
y=df.left

#### Split the model into training data and testing data

In [23]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.15 ,random_state=7)

#### Bulding and Training Logistic Regression model

In [24]:
model=LogisticRegression()
model.fit(x,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [25]:
model.score(x_test,y_test)

0.772

#### Assign the parameters

In [26]:
theta_0 = model.intercept_
theta_1 = model.coef_[0][0]
theta_2 = model.coef_[0][1]
theta_3 = model.coef_[0][2]
theta_4 = model.coef_[0][3]
theta_5 = model.coef_[0][4]
theta_6 = model.coef_[0][5]

print(f"theta_0 = {theta_0}\ntheta_1 = {theta_1}\ntheta_2 = {theta_2}\ntheta_3 = {theta_3}\ntheta_4 = {theta_4}\ntheta_5 = {theta_5}\ntheta_6 = {theta_6}")

theta_0 = [0.02221887]
theta_1 = -4.1183034058115355
theta_2 = 0.6155000682691014
theta_3 = -0.308957195363807
theta_4 = 0.004416920779873931
theta_5 = 0.23958451059970445
theta_6 = -0.8079336140461716


#### define sigmoid function

In [27]:
from math import e,pow
def sigmoid_fun(x1,x2,x3,x4,x5,x6):
    return 1 / (1 + pow(e, -(theta_0 + theta_1 * x1 + theta_2 * x2 + theta_3 * x3 + theta_4 * x4 + theta_5 * x5 + theta_6 * x6)))

#### save the model

In [29]:
joblib.dump(model, "predict  of employees to leave the company")

['predict  of employees to leave the company']

#### load the model

In [30]:
joblib.load("predict  of employees to leave the company")

LogisticRegression()