In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

In [3]:
df = pd.read_csv('HR_comma_sep.csv')

In [4]:
df.shape

(14999, 10)

In [5]:
df.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident', 'left',
       'promotion_last_5years', 'Department', 'salary'],
      dtype='object')

In [6]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [8]:
df.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
Department               0
salary                   0
dtype: int64

In [9]:
df['Department'].value_counts()

Department
sales          4140
technical      2720
support        2229
IT             1227
product_mng     902
marketing       858
RandD           787
accounting      767
hr              739
management      630
Name: count, dtype: int64

In [10]:
df['salary'].value_counts()

salary
low       7316
medium    6446
high      1237
Name: count, dtype: int64

## Transforming Department Column using Ordinal Encoder:

In [11]:
from sklearn.preprocessing import OrdinalEncoder

ordinal = OrdinalEncoder()

In [12]:
ordinal_encode = ordinal.fit_transform(df[['Department', 'salary']])

In [13]:
ordinal_encode = pd.DataFrame(ordinal_encode,columns=ordinal.get_feature_names_out())

In [14]:
ordinal_encode

Unnamed: 0,Department,salary
0,7.0,1.0
1,7.0,2.0
2,7.0,2.0
3,7.0,1.0
4,7.0,1.0
...,...,...
14994,8.0,1.0
14995,8.0,1.0
14996,8.0,1.0
14997,8.0,1.0


In [15]:
df = df.drop(df[['Department','salary']],axis=1)

In [16]:
df = pd.concat([df,ordinal_encode],axis=1)

In [17]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,7.0,1.0
1,0.8,0.86,5,262,6,0,1,0,7.0,2.0
2,0.11,0.88,7,272,4,0,1,0,7.0,2.0
3,0.72,0.87,5,223,5,0,1,0,7.0,1.0
4,0.37,0.52,2,159,3,0,1,0,7.0,1.0


In [33]:
df['salary'].value_counts()

salary
1.0    7316
2.0    6446
0.0    1237
Name: count, dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  float64
 9   salary                 14999 non-null  float64
dtypes: float64(4), int64(6)
memory usage: 1.1 MB


In [19]:
df['Department'].value_counts()

Department
7.0    4140
9.0    2720
8.0    2229
0.0    1227
6.0     902
5.0     858
1.0     787
2.0     767
3.0     739
4.0     630
Name: count, dtype: int64

In [20]:
X = df.drop(['salary'],axis=1)
y = df['salary']

In [21]:
y.value_counts()

salary
1.0    7316
2.0    6446
0.0    1237
Name: count, dtype: int64

In [22]:
X.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department
0,0.38,0.53,2,157,3,0,1,0,7.0
1,0.8,0.86,5,262,6,0,1,0,7.0
2,0.11,0.88,7,272,4,0,1,0,7.0
3,0.72,0.87,5,223,5,0,1,0,7.0
4,0.37,0.52,2,159,3,0,1,0,7.0


In [23]:
y.head()

0    1.0
1    2.0
2    2.0
3    1.0
4    1.0
Name: salary, dtype: float64

In [24]:
from sklearn.model_selection import train_test_split

X_train,X_test, ytrain,ytest = train_test_split(X,y, test_size=0.20, random_state=42)

In [25]:
X_train.shape, X_test.shape

((11999, 9), (3000, 9))

In [26]:
ytrain.shape, ytest.shape

((11999,), (3000,))

In [27]:
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression(multi_class='multinomial',solver='lbfgs')

In [28]:
model1.fit(X_train,ytrain)

In [29]:
# Probability

y_pred = model1.predict(X_test)

In [30]:
y_pred

array([1., 1., 1., ..., 2., 1., 1.])

In [31]:
ytest

6723     2.0
6473     1.0
4679     1.0
862      1.0
7286     1.0
        ... 
3297     2.0
14113    2.0
5514     2.0
9939     2.0
14346    1.0
Name: salary, Length: 3000, dtype: float64

In [32]:
## Confuaion Metrics Accuracy , Classificaton report

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print(confusion_matrix(y_pred,ytest))
print(accuracy_score(y_pred,ytest))
print(classification_report(y_pred,ytest))

[[  0   0   0]
 [120 980 742]
 [133 494 531]]
0.5036666666666667
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       0.66      0.53      0.59      1842
         2.0       0.42      0.46      0.44      1158

    accuracy                           0.50      3000
   macro avg       0.36      0.33      0.34      3000
weighted avg       0.57      0.50      0.53      3000

