In [None]:
import pandas as pd

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay

import matplotlib.pyplot as plt
%matplotlib inline

### Reading the dataset

In [None]:
# Load the HR dataset into a DataFrame (comma-separated, which is the pandas default).
rrhh_df = pd.read_csv('./data/rrhh.csv')

In [None]:
# Rename the 'sales' column to 'department'.
# Reassigning the result (instead of inplace = True) avoids mutating the frame
# in place across cells and keeps the step chainable; re-running is a no-op
# because rename ignores keys that are no longer present.
rrhh_df = rrhh_df.rename(columns = {'sales': 'department'})

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
rrhh_df.shape

In [None]:
# Data type of each column, to spot non-numeric columns before modelling.
rrhh_df.dtypes

In [None]:
# Preview the first rows of the dataset.
rrhh_df.head()

### Splitting train and test datasets

In [None]:
# Columns used as model inputs (features) for the classifier.
feature_columns = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
    'Work_accident',
    'left',
    'promotion_last_5years',
]

# Feature matrix: one row per record, one column per feature listed above.
X = rrhh_df[feature_columns]

In [None]:
# Target vector: the 'salary' column, which the classifier learns to predict.
Y = rrhh_df.loc[:, 'salary']

In [None]:
# Hold out 30% of the rows for testing, stratified on Y so that both splits
# keep the same salary-class proportions.
# A fixed random_state makes the split, and every result derived from it,
# reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size = 0.3,
    stratify = Y,
    random_state = 42,
)

In [None]:
# Class proportions in the training split (Y_train is already a Series).
Y_train.value_counts(normalize = True)

In [None]:
# Class proportions in the test split; with a stratified split these should
# closely match the training proportions.
Y_test.value_counts(normalize = True)

### Training the model

In [None]:
# Decision tree limited to depth 5.
# random_state pins how ties between equally good splits are broken, so
# retraining on the same data yields the same tree.
model = DecisionTreeClassifier(max_depth = 5, random_state = 42)

In [None]:
# Fit the tree on the training split only; the test split stays unseen until evaluation.
model.fit(X_train, Y_train)

In [None]:
# Draw the fitted tree on a large canvas so the depth-5 structure stays legible.
plt.figure(figsize = (30, 20))
plot_tree(
    model,
    # Label split nodes with column names instead of positional x[i] indices.
    feature_names = list(X_train.columns),
    # Label nodes with the majority class, in the order the model stores its classes.
    class_names = [str(label) for label in model.classes_],
    # Colour nodes by majority class to make the class regions easier to follow.
    filled = True,
)
plt.show()

### Evaluating the model

In [None]:
# Predict the salary class for every row of the held-out test split.
predictions = model.predict(X_test)

In [None]:
# Confusion matrix of true vs. predicted salary class on the test split.
# The classes are listed in their natural order (low -> medium -> high) rather
# than the default sorted order (high, low, medium), which scrambles the bands.
# NOTE(review): assumes 'salary' only takes these three values (the same ones
# used by the histogram cell); rows with any other label would be left out.
salary_order = ['low', 'medium', 'high']
cm_display = ConfusionMatrixDisplay.from_predictions(
    Y_test,
    predictions,
    labels = salary_order,
)
cm_display.ax_.set_title('Confusion matrix (test split)')
plt.show()

### Showing feature distributions by salary class

In [None]:
# One panel per feature: (column name in rrhh_df, panel title).
histogram_panels = [
    ('satisfaction_level', 'Satisfaction Level'),
    ('last_evaluation', 'Last Evaluation'),
    ('number_project', 'Number of Projects'),
    ('average_montly_hours', 'Average Monthly Hours'),
    ('time_spend_company', 'Time Spend in Company'),
    ('Work_accident', 'Work Accident'),
    ('left', 'Left'),
    ('promotion_last_5years', 'Promotion Last 5 Years'),
]

# Salary classes overlaid in each panel: (value in the 'salary' column, legend label).
salary_classes = [
    ('low', 'Low'),
    ('medium', 'Medium'),
    ('high', 'High'),
]

# 4 x 2 grid, one axes per feature.
fig, axes = plt.subplots(nrows = 4, ncols = 2, figsize = (15, 15))

for ax, (column, title) in zip(axes.flatten(), histogram_panels):
    # Overlay one semi-transparent histogram per salary class so they can be compared.
    for salary_value, legend_label in salary_classes:
        ax.hist(
            rrhh_df.loc[rrhh_df['salary'] == salary_value, column],
            label = legend_label,
            alpha = 0.5,
        )
    ax.set_title(title)
    # Every panel gets its own legend, so each one can be read on its own.
    ax.legend()

plt.show()