In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [5]:
data = pd.read_csv('HR_comma_sep.csv.txt') 
data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   sales                  14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [7]:
data['left'].value_counts()

0    11428
1     3571
Name: left, dtype: int64

In [8]:
data = data.dropna()

In [9]:
le = LabelEncoder()

In [13]:
data['sales'] = le.fit_transform(data['sales'])
print(data['sales'])

0        7
1        7
2        7
3        7
4        7
        ..
14994    8
14995    8
14996    8
14997    8
14998    8
Name: sales, Length: 14999, dtype: int64


In [14]:
data['salary'] = le.fit_transform(data['salary'])
print(data['salary'])

0        1
1        2
2        2
3        1
4        1
        ..
14994    1
14995    1
14996    1
14997    1
14998    1
Name: salary, Length: 14999, dtype: int32


In [15]:
scaler = StandardScaler()

In [16]:
numerical_features = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company']
print(numerical_features)

['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company']


In [18]:
data[numerical_features] = scaler.fit_transform(data[numerical_features])
data[numerical_features]

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company
0,-0.936495,-1.087275,-1.462863,-0.882040,-0.341235
1,0.752814,0.840707,0.971113,1.220423,1.713436
2,-2.022479,0.957554,2.593763,1.420657,0.343655
3,0.431041,0.899131,0.971113,0.439508,1.028546
4,-0.976716,-1.145699,-1.462863,-0.841993,-0.341235
...,...,...,...,...,...
14994,-0.856051,-0.853580,-1.462863,-1.002181,-0.341235
14995,-0.976716,-1.379394,-1.462863,-0.821970,-0.341235
14996,-0.976716,-1.087275,-1.462863,-1.162368,-0.341235
14997,-2.022479,1.424944,1.782438,1.580845,0.343655


In [19]:
X = data.drop('left', axis=1)
y = data['left']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [21]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


Accuracy: 0.9883333333333333


In [22]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2294
           1       0.99      0.96      0.97       706

    accuracy                           0.99      3000
   macro avg       0.99      0.98      0.98      3000
weighted avg       0.99      0.99      0.99      3000



In [23]:
print(confusion_matrix(y_test, y_pred))

[[2285    9]
 [  26  680]]
