T190434

In [1]:
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler

warnings.filterwarnings('ignore')
%matplotlib inline

Data Exploration

In [2]:
# Load the HR dataset; the path is relative to the notebook's working directory.
hr_csv_path = 'Hr.csv'
data = pd.read_csv(hr_csv_path)


In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

In [None]:
# Column labels of the loaded frame.
data.columns

In [None]:
# Preview the first rows (head() shows five by default).
data.head()

In [None]:
# dtype of every column.
data.dtypes

In [None]:
# Number of rows for each distinct MaritalStatus value.
data['MaritalStatus'].value_counts()

In [None]:
# Distinct labels present in the EmpDepartment column.
data['EmpDepartment'].unique()

In [None]:
# Per-column non-null counts and dtypes, plus overall memory usage.
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# True for every column that holds at least one missing value.
data.isna().any()

In [None]:
# Row index of the loaded frame.
data.index

Data Visualisation

In [None]:
# Keep only the two columns needed for the department-vs-rating analysis.
# Selecting by label instead of by position (5 and 27) keeps this cell correct
# if the column order of the CSV ever changes.
dept = data[['EmpDepartment', 'PerformanceRating']].copy()
# Show a preview rather than echoing the whole frame.
dept.head()

In [None]:
# Distinct values that occur in the PerformanceRating column.
data['PerformanceRating'].unique()

In [14]:
# Independent copy of dept, so later changes to dept_per cannot affect dept.
dept_per = dept.copy()

In [None]:
# Average performance rating within each department.
(
    dept_per
    .groupby('EmpDepartment')['PerformanceRating']
    .mean()
)

In [None]:
# Bar chart of the mean performance rating per department.
plt.figure(figsize=(10, 4.5))
sns.barplot(data=dept_per, x='EmpDepartment', y='PerformanceRating')

In [None]:
# How many employees received each rating, broken down by department.
(
    dept_per
    .groupby('EmpDepartment')['PerformanceRating']
    .value_counts()
)

In [18]:
# One indicator column per department, placed next to the rating column so
# each department can be plotted against the rating on its own.
department = pd.get_dummies(dept_per['EmpDepartment'])
performance = dept_per['PerformanceRating'].to_frame()
dept_rating = pd.concat([department, performance], axis=1)

In [None]:
# One panel per department on a 2x3 grid. Every department column of
# dept_rating is an indicator produced by get_dummies, so the bar height
# (barplot's default mean estimator) is the fraction of employees with a given
# rating who belong to that department.
department_columns = [
    'Sales',
    'Development',
    'Research & Development',
    'Human Resources',
    'Finance',
    'Data Science',
]
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# axes.flat walks the grid row by row, matching subplot positions 1..6.
for ax, dept_name in zip(axes.flat, department_columns):
    sns.barplot(x=dept_rating['PerformanceRating'], y=dept_rating[dept_name], ax=ax)
# Keep the long y-axis labels from overlapping the neighbouring panels.
fig.tight_layout()
plt.show()

Data Preprocessing

In [None]:
# Label-encode the categorical columns, addressed by their position in the
# freshly loaded frame. The positions are only valid while EmpNumber is still
# present, so run this cell before the drop below and do not re-run it after.
# NOTE: the single encoder is refit on every column, so after the loop `enc`
# only remembers the mapping of the last column (position 26).
enc = LabelEncoder()
for i in (2,3,4,5,6,7,16,26):
    col = data.columns[i]
    # Assigning by label replaces the whole column, so it takes the integer
    # dtype returned by the encoder. Writing through data.iloc[:, i] sets the
    # values in place on recent pandas and leaves the column as object dtype.
    data[col] = enc.fit_transform(data[col])
data.head()

In [24]:
# Remove the EmpNumber column (presumably a row identifier - confirm against
# the data dictionary). Rebinding instead of inplace=True, together with
# errors='ignore', lets this cell be re-run without a KeyError once the column
# is already gone.
data = data.drop(columns=['EmpNumber'], errors='ignore')

In [None]:
# Preview the frame again now that EmpNumber has been removed.
data.head()

In [None]:
# Target: the performance rating. Features: nine columns picked by position;
# the positions refer to the frame after EmpNumber has been dropped.
feature_positions = [4, 5, 9, 16, 20, 21, 22, 23, 24]
y = data['PerformanceRating']
X = data.iloc[:, feature_positions]
X.head()

In [27]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [28]:
# (rows, columns) of the training feature matrix.
X_train.shape

(840, 9)

In [None]:
# (rows, columns) of the test feature matrix.
X_test.shape

Model: Random Forest with GridSearchCV

In [None]:
# Tune a random forest with an exhaustive grid search scored on accuracy.
# The forest size and seed are fixed; only the split criterion and the minimum
# split / leaf sizes are searched.
classifier_rfg = RandomForestClassifier(random_state=33, n_estimators=23)
# Every hyper-parameter appears exactly once: a repeated key in a dict literal
# silently overrides the earlier entry.
parameters = [{
    'min_samples_split': [2, 3, 4, 5],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2, 3],
}]
model_gridrf = GridSearchCV(
    estimator=classifier_rfg,
    param_grid=parameters,
    scoring='accuracy',
)
model_gridrf.fit(X_train, y_train)

In [None]:
# Hyper-parameter combination with the best mean cross-validated accuracy.
model_gridrf.best_params_

In [32]:
# Predict the held-out test rows. GridSearchCV forwards predict() to the best
# estimator, which it refits on the whole training set (refit=True by default).
y_predict_rf = model_gridrf.predict(X_test)

In [None]:
# Overall accuracy, then per-class precision / recall / F1 on the test set.
print(accuracy_score(y_true=y_test, y_pred=y_predict_rf))
print(classification_report(y_true=y_test, y_pred=y_predict_rf))

In [None]:
# Confusion matrix on the test set: rows are true classes, columns are predictions.
confusion_matrix(y_true=y_test, y_pred=y_predict_rf)