In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the employee CSV from the Kaggle input directory and preview its first rows
csv_path = '/kaggle/input/employee-future-prediction/Employee.csv'
data = pd.read_csv(csv_path)
data.head()

In [3]:
# Report the (rows, columns) shape, then the per-column dtype / non-null summary
print(f"The shape of data is  {data.shape}")
data.info()

In [4]:
# Number of missing values in each column (isna is the canonical spelling of isnull)
data.isna().sum()


In [5]:
# Count how many rows take each value of LeaveOrNot (the column later used as the label)
data['LeaveOrNot'].value_counts()


In [6]:
# Count how many rows fall into each Education category
data['Education'].value_counts()

In [7]:
# Count how many rows fall into each Gender category
data['Gender'].value_counts()


In [8]:
# Count how many rows fall into each City category
data['City'].value_counts()

In [9]:
# Count how many rows fall into each EverBenched category
data['EverBenched'].value_counts()

# Data visualization

In [10]:
# Number of records per Education level, split by the LeaveOrNot label
import seaborn as sns
sns.countplot(data=data, x='Education', hue='LeaveOrNot')


In [11]:
# Number of records per Gender, split by the LeaveOrNot label
sns.countplot(data=data, x='Gender', hue='LeaveOrNot')


In [12]:
# Number of records per City, split by the LeaveOrNot label
sns.countplot(data=data, x='City', hue='LeaveOrNot')

In [13]:
# Number of records per Age value, split by the LeaveOrNot label
sns.countplot(data=data, x='Age', hue='LeaveOrNot')

In [14]:
# Number of records per ExperienceInCurrentDomain value, split by the LeaveOrNot label
sns.countplot(data=data, x='ExperienceInCurrentDomain', hue='LeaveOrNot')


# Data Transformation

In [15]:

# Replace the string categories of each listed column with integer codes.
# Outer keys are column names; inner dicts map original value -> code.
category_codes = {
    'EverBenched': {'No': 0, 'Yes': 1},
    'City': {'Bangalore': 0, 'Pune': 1, 'New Delhi': 2},
    'Gender': {'Male': 0, 'Female': 1},
    'Education': {'Bachelors': 0, 'Masters': 1, 'PHD': 2},
}
data.replace(category_codes, inplace=True)


In [16]:
# Look at the first rows again, now that the categorical columns hold numeric codes
data.head()

In [17]:
# Split the frame into the label column (y) and all remaining feature columns (X)
y = data['LeaveOrNot']
X = data.drop(columns='LeaveOrNot')


In [18]:
# Shapes of the feature matrix X and the label vector y
print(" Shape of X is", X.shape)
# Fixed label: this line reports y's shape but was previously printed as "Shape of X"
print(" Shape of y is", y.shape)

# Splitting the data into testing and training data.


In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)


In [21]:
# Print the shape of each of the four train/test pieces
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f" Shape of {split_name} is", split.shape)


# Feature scaling the data 

In [22]:


from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so no test-set information leaks into it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# **Logistic Regression Model**

In [23]:
# Train a logistic regression classifier and compare its train vs. test accuracy

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

logistic_model = LogisticRegression().fit(X_train, y_train)

# y_pred (test-set predictions) is reused by the confusion-matrix / report cells
y_pred = logistic_model.predict(X_test)
train_pred = logistic_model.predict(X_train)

logistic_model_test_acc = accuracy_score(y_test, y_pred)
logistic_model_train_acc = accuracy_score(y_train, train_pred)

print(f"Training Accuracy of Logistic Regression Model is {logistic_model_train_acc}")
print(f"Test Accuracy of Logistic Regression Model is {logistic_model_test_acc}")


In [24]:
# Confusion matrix of the current test-set predictions against the true labels
confusion_matrix(y_true=y_test, y_pred=y_pred)


In [25]:
# Text report of per-class metrics for the current test-set predictions
report = classification_report(y_test, y_pred)
print(report)

# Decision Tree

In [26]:

from sklearn.tree import DecisionTreeClassifier

# Train a decision tree with default hyperparameters as a baseline.
# random_state is pinned (same seed as the train/test split) so the fitted tree,
# and therefore the reported accuracies, are the same on every run.
dtc = DecisionTreeClassifier(random_state = 0)
dtc.fit(X_train, y_train)

# y_pred (test-set predictions) is reused by the confusion-matrix / report cells
y_pred = dtc.predict(X_test)

dtc_train_acc = accuracy_score(y_train, dtc.predict(X_train))
dtc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training Accuracy of Decision Tree Model is {dtc_train_acc}")
print(f"Test Accuracy of Decision Tree Model is {dtc_test_acc}")

In [27]:
# Confusion matrix of the current test-set predictions against the true labels
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [28]:
# Text report of per-class metrics for the current test-set predictions
report = classification_report(y_test, y_pred)
print(report)

In [29]:
# Hyperparameter tuning: 5-fold cross-validated grid search over decision tree settings

from sklearn.model_selection import GridSearchCV

# Candidate values for each tuned hyperparameter; every combination is evaluated
grid_params = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 5, 7, 10],
    min_samples_split=range(2, 10),
    min_samples_leaf=range(2, 10),
)

# n_jobs=-1 runs the candidate fits on all available cores
grid_search = GridSearchCV(estimator=dtc, param_grid=grid_params, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

In [30]:
# Replace the baseline tree with the best estimator found by the grid search
# and re-evaluate it; dtc_test_acc is what the model comparison table reads.
dtc = grid_search.best_estimator_
y_pred = dtc.predict(X_test)
dtc_train_acc = accuracy_score(y_train, dtc.predict(X_train))
dtc_test_acc = accuracy_score(y_test, y_pred)

# Spelling fixed ("Decesion" -> "Decision") to match the baseline tree's output
print(f"Training Accuracy of Decision Tree Model is {dtc_train_acc}")
print(f"Test Accuracy of Decision Tree Model is {dtc_test_acc}")


In [31]:
# Winning hyperparameter combination, then its mean cross-validated score
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

# Gradient Boosting Classifier

In [32]:

from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier()

# Grid of gradient-boosting settings to search.
# The log-loss objective was spelled 'deviance' before scikit-learn 1.1; it was
# then renamed to 'log_loss', and the old spelling was removed in 1.3, where
# passing 'deviance' raises an error. It is the estimator's default loss, so
# reading `gb.loss` yields whichever spelling the installed version accepts.
parameters = {
    'loss': [gb.loss, 'exponential'],
    'learning_rate': [0.001, 0.1, 1, 10],
    'n_estimators': [100, 150, 180, 200]
}

# 5-fold cross-validated search over every combination, using all available cores
grid_search = GridSearchCV(gb, parameters, cv = 5, n_jobs = -1, verbose = 1)
grid_search.fit(X_train, y_train)


In [33]:
# Winning hyperparameter combination, then its mean cross-validated score
for attr in ('best_params_', 'best_score_'):
    print(getattr(grid_search, attr))

In [34]:
# Take the winner of the gradient-boosting grid search directly, the same way the
# tuned decision tree is obtained, instead of re-typing its parameters by hand.
# The hand-copied values could silently drift from what the search actually
# selects, and they included loss='deviance', which scikit-learn >= 1.3 rejects.
# With GridSearchCV's default refit=True, best_estimator_ has already been refit
# on the whole training split, so no extra fit call is needed.
gb = grid_search.best_estimator_

y_pred = gb.predict(X_test)

gb_train_acc = accuracy_score(y_train, gb.predict(X_train))
gb_test_acc = accuracy_score(y_test, y_pred)

print(f"Training Accuracy of Gradient Boosting Classifier Model is {gb_train_acc}")
print(f"Test Accuracy of Gradient Boosting Classifier Model is {gb_test_acc}")

# Stochastic Gradient Boosting (SGB)

In [35]:

# Stochastic gradient boosting: each tree is fit on a random 90% of the rows
# (subsample) and considers a random 75% of the features at each split (max_features).
# - `loss` is left at its default, which is the same log-loss objective that was
#   previously requested as 'deviance'; that spelling was removed in
#   scikit-learn 1.3, while the default works on every version.
# - random_state is pinned because the row/feature sampling is random; without it
#   the reported accuracies change from run to run.
sgbc = GradientBoostingClassifier(learning_rate = 0.1, subsample = 0.9, max_features = 0.75,
                                  n_estimators = 100, random_state = 0)

sgbc.fit(X_train, y_train)

y_pred = sgbc.predict(X_test)

sgbc_train_acc = accuracy_score(y_train, sgbc.predict(X_train))
sgbc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training Accuracy of SGB Model is {sgbc_train_acc}")
print(f"Test Accuracy of SGB Model is {sgbc_test_acc}")

# Cat Boost

In [36]:

from catboost import CatBoostClassifier

# Train a CatBoost classifier with 180 boosting iterations.
# verbose = 0 silences the per-iteration training log, which otherwise fills the
# cell output with one line per iteration and buries the results.
cat = CatBoostClassifier(iterations = 180, learning_rate = 0.1, verbose = 0)
cat.fit(X_train, y_train)

# y_pred (test-set predictions) is read by the accuracy cell that follows
y_pred = cat.predict(X_test)

In [37]:
# Accuracy of the CatBoost model on the test split (y_pred) and on the training split
cat_test_acc = accuracy_score(y_test, y_pred)
cat_train_pred = cat.predict(X_train)
cat_train_acc = accuracy_score(y_train, cat_train_pred)

print(f"Training Accuracy of Cat Boost Classifier Model is {cat_train_acc}")
print(f"Test Accuracy of Cat Boost Classifier Model is {cat_test_acc}")

# Model Selection

In [39]:
# Collect each model's test accuracy into one table (display name -> score)
test_accuracy = {
    'Logistic Regression': logistic_model_test_acc,
    'Decision Tree': dtc_test_acc,
    'Gradient Boosting': gb_test_acc,
    'SGB': sgbc_test_acc,
    'Cat Boost': cat_test_acc,
}
scores = list(test_accuracy.values())

# `models` keeps the insertion order above; only the displayed copy is sorted
models = pd.DataFrame({'Model': list(test_accuracy), 'Score': scores})

models.sort_values(by='Score', ascending=False)

# We can see that the Decision Tree model gives the best accuracy, about 85%.

In [40]:
import matplotlib.pyplot as plt

# Bar chart of test accuracy per model, drawn on an explicit wide figure/axes pair
fig, ax = plt.subplots(figsize=(18, 8))
sns.barplot(data=models, x='Model', y='Score', ax=ax)
plt.show()