In [1]:
# import 'Pandas' 
import pandas as pd 

# import 'Numpy' 
import numpy as np

# import subpackage of Matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# import 'Seaborn' 
import seaborn as sns

# to suppress warnings 
from warnings import filterwarnings
filterwarnings('ignore')

# display all columns of the dataframe
pd.options.display.max_columns = None

# display all rows of the dataframe
pd.options.display.max_rows = None
 
# to display the float values upto 6 decimal places     
pd.options.display.float_format = '{:.6f}'.format

# import train-test split 
from sklearn.model_selection import train_test_split

# import various functions from statsmodels
import statsmodels
import statsmodels.api as sm

# import StandardScaler to perform scaling
from sklearn.preprocessing import StandardScaler 

# import various functions from sklearn 
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import GridSearchCV

  
# import function to perform feature selection
from sklearn.feature_selection import RFE

In [2]:
# set the plot size using 'rcParams'
# once the plot size is set using 'rcParams', it sets the size of all the forthcoming plots in the file
# pass width and height in inches to 'figure.figsize' 
plt.rcParams['figure.figsize'] = [15,8]

In [3]:
# load the csv file
# store the data in 'df_admissions'
df = pd.read_csv('Train_Dataset_(1).csv')

# display first five observations using head()
df.head()

Unnamed: 0,EmployeeID,Attrition,Age,TravelProfile,Department,HomeToWork,EducationField,Gender,HourlnWeek,Involvement,WorkLifeBalance,Designation,JobSatisfaction,ESOPs,NumCompaniesWorked,OverTime,SalaryHikelastYear,WorkExperience,LastPromotion,CurrentProfile,MaritalStatus,MonthlyIncome
0,5110001.0,0.0,35.0,Rarely,Analytics,5.0,CA,Male,69.0,1.0,1.0,Executive,1.0,1.0,1.0,1.0,20.0,7.0,2.0,,M,18932.0
1,5110002.0,1.0,32.0,Yes,Sales,5.0,Statistics,Female,62.0,4.0,3.0,Executive,2.0,0.0,8.0,0.0,20.0,4.0,1.0,,Single,18785.0
2,5110003.0,0.0,31.0,Rarely,Analytics,5.0,Statistics,F,45.0,5.0,3.0,Manager,2.0,1.0,3.0,0.0,26.0,12.0,1.0,3.0,Single,22091.0
3,5110004.0,0.0,34.0,Yes,Sales,10.0,Statistics,Female,32.0,3.0,2.0,Manager,4.0,1.0,1.0,0.0,23.0,5.0,1.0,3.0,Divorsed,20302.0
4,5110005.0,0.0,37.0,No,Analytics,27.0,Statistics,Female,49.0,3.0,4.0,Manager,4.0,1.0,8.0,0.0,21.0,12.0,1.0,9.0,Divorsed,21674.0


In [4]:
# use 'shape' to check the dimension of data
df.shape

(7810, 22)

In [7]:
# use 'dtypes' to check the data type of a variable
df.dtypes

EmployeeID            float64
Attrition             float64
Age                   float64
TravelProfile          object
Department             object
HomeToWork            float64
EducationField         object
Gender                 object
HourlnWeek            float64
Involvement           float64
WorkLifeBalance       float64
Designation            object
JobSatisfaction       float64
ESOPs                 float64
NumCompaniesWorked    float64
OverTime              float64
SalaryHikelastYear    float64
WorkExperience        float64
LastPromotion         float64
CurrentProfile        float64
MaritalStatus          object
MonthlyIncome         float64
dtype: object

In [8]:
# sort the variables on the basis of total null values in the variable
# 'isnull().sum()' returns the number of missing values in each variable
# 'ascending = False' sorts values in the descending order
# the variable with highest number of missing values will appear first
Total = df.isnull().sum().sort_values(ascending=False)          

# calculate percentage of missing values
# 'ascending = False' sorts values in the descending order
# the variable with highest percentage of missing values will appear first
Percent = (df.isnull().sum()*100/df.isnull().count()).sort_values(ascending=False)   

# concat the 'Total' and 'Percent' columns using 'concat' function
# pass a list of column names in parameter 'keys' 
# 'axis = 1' concats along the columns
missing_data = pd.concat([Total, Percent], axis = 1, keys = ['Total', 'Percentage of Missing Values'])    
missing_data

Unnamed: 0,Total,Percentage of Missing Values
Age,2946,37.720871
CurrentProfile,2941,37.65685
HourlnWeek,2917,37.349552
HomeToWork,2885,36.939821
WorkExperience,2817,36.069142
SalaryHikelastYear,2799,35.838668
Department,2754,35.262484
MonthlyIncome,2723,34.865557
LastPromotion,2700,34.571063
Gender,2676,34.263764


In [9]:
df.dtypes

EmployeeID            float64
Attrition             float64
Age                   float64
TravelProfile          object
Department             object
HomeToWork            float64
EducationField         object
Gender                 object
HourlnWeek            float64
Involvement           float64
WorkLifeBalance       float64
Designation            object
JobSatisfaction       float64
ESOPs                 float64
NumCompaniesWorked    float64
OverTime              float64
SalaryHikelastYear    float64
WorkExperience        float64
LastPromotion         float64
CurrentProfile        float64
MaritalStatus          object
MonthlyIncome         float64
dtype: object

In [10]:
df1_num = df.select_dtypes(include = [np.number])
df[df1_num.columns] = df1_num.fillna(df1_num.mean())

In [11]:
df.isnull().sum()

EmployeeID               0
Attrition                0
Age                      0
TravelProfile         2630
Department            2754
HomeToWork               0
EducationField        2630
Gender                2676
HourlnWeek               0
Involvement              0
WorkLifeBalance          0
Designation           2668
JobSatisfaction          0
ESOPs                    0
NumCompaniesWorked       0
OverTime                 0
SalaryHikelastYear       0
WorkExperience           0
LastPromotion            0
CurrentProfile           0
MaritalStatus         2630
MonthlyIncome            0
dtype: int64

In [12]:
for column in df.select_dtypes(include='object'):
    df[column].fillna(df[column].mode()[0], inplace=True)

In [13]:
Total = df.isnull().sum().sort_values(ascending=False)
Total

EmployeeID            0
Attrition             0
MaritalStatus         0
CurrentProfile        0
LastPromotion         0
WorkExperience        0
SalaryHikelastYear    0
OverTime              0
NumCompaniesWorked    0
ESOPs                 0
JobSatisfaction       0
Designation           0
WorkLifeBalance       0
Involvement           0
HourlnWeek            0
Gender                0
EducationField        0
HomeToWork            0
Department            0
TravelProfile         0
Age                   0
MonthlyIncome         0
dtype: int64

In [14]:
df.shape

(7810, 22)

In [15]:
# use astype() to change the data type

df['EmployeeID'] = df['EmployeeID'].astype('int64')
df['Attrition'] = df['Attrition'].astype('int64')


In [16]:
# store the target variable 'Chance of Admit' in a dataframe 'df_target'
df_target = df['Attrition']

# store all the independent variables in a dataframe 'df_feature' 
# drop the column 'Chance of Admit' using drop()
# 'axis = 1' drops the specified column
df_feature = df.drop('Attrition', axis = 1)

In [17]:
# filter the numerical features in the dataset
# 'select_dtypes' is used to select the variables with given data type
# 'include = [np.number]' will include all the numerical variables
df_num = df_feature.select_dtypes(include = [np.number])

# display numerical features
df_num.columns

Index(['EmployeeID', 'Age', 'HomeToWork', 'HourlnWeek', 'Involvement',
       'WorkLifeBalance', 'JobSatisfaction', 'ESOPs', 'NumCompaniesWorked',
       'OverTime', 'SalaryHikelastYear', 'WorkExperience', 'LastPromotion',
       'CurrentProfile', 'MonthlyIncome'],
      dtype='object')

In [18]:
# filter the categorical features in the dataset
# 'select_dtypes' is used to select the variables with given data type
# 'include = [np.object]' will include all the categorical variables
df_cat = df_feature.select_dtypes(include = [object])

# display categorical features
df_cat.columns

Index(['TravelProfile', 'Department', 'EducationField', 'Gender',
       'Designation', 'MaritalStatus'],
      dtype='object')

In [19]:
df['Gender'].replace({'F':'Female'},inplace=True)
df['MaritalStatus'].replace({'M':'Married'},inplace=True)

In [20]:
for column in df.select_dtypes(include=object).columns:
    mapping = df[column].value_counts(normalize=True)
    df[column].replace(mapping.to_dict(),inplace=True)

In [21]:
df.dtypes

EmployeeID              int64
Attrition               int64
Age                   float64
TravelProfile         float64
Department            float64
HomeToWork            float64
EducationField        float64
Gender                float64
HourlnWeek            float64
Involvement           float64
WorkLifeBalance       float64
Designation           float64
JobSatisfaction       float64
ESOPs                 float64
NumCompaniesWorked    float64
OverTime              float64
SalaryHikelastYear    float64
WorkExperience        float64
LastPromotion         float64
CurrentProfile        float64
MaritalStatus         float64
MonthlyIncome         float64
dtype: object

In [22]:
#Defining the Train dataset
X = df.drop('Attrition',axis=1) 

# display first five observations
X.head()

Unnamed: 0,EmployeeID,Age,TravelProfile,Department,HomeToWork,EducationField,Gender,HourlnWeek,Involvement,WorkLifeBalance,Designation,JobSatisfaction,ESOPs,NumCompaniesWorked,OverTime,SalaryHikelastYear,WorkExperience,LastPromotion,CurrentProfile,MaritalStatus,MonthlyIncome
0,5110001,35.0,0.802433,0.764789,5.0,0.199744,0.738796,69.0,1.0,1.0,0.606914,1.0,1.0,1.0,1.0,20.0,7.0,2.0,4.385295,0.305762,18932.0
1,5110002,32.0,0.134571,0.206786,5.0,0.609347,0.261204,62.0,4.0,3.0,0.606914,2.0,0.0,8.0,0.0,20.0,4.0,1.0,4.385295,0.564149,18785.0
2,5110003,31.0,0.802433,0.764789,5.0,0.609347,0.261204,45.0,5.0,3.0,0.22484,2.0,1.0,3.0,0.0,26.0,12.0,1.0,3.0,0.564149,22091.0
3,5110004,34.0,0.134571,0.206786,10.0,0.609347,0.261204,32.0,3.0,2.0,0.22484,4.0,1.0,1.0,0.0,23.0,5.0,1.0,3.0,0.13009,20302.0
4,5110005,37.0,0.062996,0.764789,27.0,0.609347,0.261204,49.0,3.0,4.0,0.22484,4.0,1.0,8.0,0.0,21.0,12.0,1.0,9.0,0.13009,21674.0


In [23]:
df1 = pd.read_csv('Test_Dataset_(1)_(1).csv')

# display first five observations using head()
df1.head()

Unnamed: 0,EmployeeID,Age,TravelProfile,Department,HomeToWork,EducationField,Gender,HourlnWeek,Involvement,WorkLifeBalance,Designation,JobSatisfaction,ESOPs,NumCompaniesWorked,OverTime,SalaryHikelastYear,WorkExperience,LastPromotion,CurrentProfile,MaritalStatus,MonthlyIncome
0,6110001,18.0,No,,9.0,CA,Male,80.0,3,2,Executive,3,1,1,0,17.0,0.0,0.0,0.0,Single,16904.0
1,6110002,20.0,Rarely,Analytics,28.0,Statistics,Female,59.0,1,3,Executive,1,1,2,1,18.0,2.0,1.0,,Single,18994.0
2,6110003,50.0,Rarely,Analytics,19.0,CA,Female,76.0,3,3,Executive,5,0,5,1,22.0,18.0,3.0,3.0,M,18587.0
3,6110004,32.0,Rarely,Sales,23.0,Statistics,Female,73.0,5,2,Manager,3,0,4,1,17.0,5.0,3.0,3.0,Married,20559.0
4,6110005,39.0,Rarely,Analytics,7.0,CA,Male,42.0,4,1,Senior Manager,5,1,4,0,20.0,9.0,1.0,7.0,Married,24991.0


In [24]:
df1.shape

(2630, 21)

In [25]:
df1.dtypes

EmployeeID              int64
Age                   float64
TravelProfile          object
Department             object
HomeToWork            float64
EducationField         object
Gender                 object
HourlnWeek            float64
Involvement             int64
WorkLifeBalance         int64
Designation            object
JobSatisfaction         int64
ESOPs                   int64
NumCompaniesWorked      int64
OverTime                int64
SalaryHikelastYear    float64
WorkExperience        float64
LastPromotion         float64
CurrentProfile        float64
MaritalStatus          object
MonthlyIncome         float64
dtype: object

In [26]:
Total = df1.isnull().sum().sort_values(ascending=False)          

Percent = (df1.isnull().sum()*100/df1.isnull().count()).sort_values(ascending=False)   

missing_data = pd.concat([Total, Percent], axis = 1, keys = ['Total', 'Percentage of Missing Values'])    
missing_data

Unnamed: 0,Total,Percentage of Missing Values
Age,142,5.39924
HourlnWeek,136,5.171103
CurrentProfile,134,5.095057
HomeToWork,126,4.790875
WorkExperience,122,4.638783
SalaryHikelastYear,94,3.574144
Department,58,2.205323
LastPromotion,57,2.1673
MonthlyIncome,33,1.254753
Designation,30,1.140684


In [27]:
df1_num = df1.select_dtypes(include = [np.number])
df1[df1_num.columns]= df1_num.fillna(df1_num.mean())

In [28]:
for column in df1.select_dtypes(include='object'):
    df1[column].fillna(df1[column].mode()[0], inplace=True)

In [29]:
Total = df1.isnull().sum().sort_values(ascending=False)          

Percent = (df1.isnull().sum()*100/df1.isnull().count()).sort_values(ascending=False)   

missing_data = pd.concat([Total, Percent], axis = 1, keys = ['Total', 'Percentage of Missing Values'])    
missing_data

Unnamed: 0,Total,Percentage of Missing Values
EmployeeID,0,0.0
JobSatisfaction,0,0.0
MaritalStatus,0,0.0
CurrentProfile,0,0.0
LastPromotion,0,0.0
WorkExperience,0,0.0
SalaryHikelastYear,0,0.0
OverTime,0,0.0
NumCompaniesWorked,0,0.0
ESOPs,0,0.0


In [30]:
df1['Gender'].unique()

array(['Male', 'Female', 'F'], dtype=object)

In [31]:
df1_num = df1.select_dtypes(include = [np.number])

# display numerical features
df1_num.columns

Index(['EmployeeID', 'Age', 'HomeToWork', 'HourlnWeek', 'Involvement',
       'WorkLifeBalance', 'JobSatisfaction', 'ESOPs', 'NumCompaniesWorked',
       'OverTime', 'SalaryHikelastYear', 'WorkExperience', 'LastPromotion',
       'CurrentProfile', 'MonthlyIncome'],
      dtype='object')

In [32]:
df1_cat = df1.select_dtypes(include = [object])

# display categorical features
df1_cat.columns

Index(['TravelProfile', 'Department', 'EducationField', 'Gender',
       'Designation', 'MaritalStatus'],
      dtype='object')

In [33]:
df1['Gender'].replace({'F':'Female'},inplace=True)
df1['MaritalStatus'].replace({'M':'Married'},inplace=True)

In [34]:
for column in df1.select_dtypes(include=object).columns:
    mapping = df1[column].value_counts(normalize=True)
    df1[column].replace(mapping.to_dict(),inplace=True)

In [35]:
X1 = df1

# display first five observations
X1.head()

Unnamed: 0,EmployeeID,Age,TravelProfile,Department,HomeToWork,EducationField,Gender,HourlnWeek,Involvement,WorkLifeBalance,Designation,JobSatisfaction,ESOPs,NumCompaniesWorked,OverTime,SalaryHikelastYear,WorkExperience,LastPromotion,CurrentProfile,MaritalStatus,MonthlyIncome
0,6110001,18.0,0.094677,0.658935,9.0,0.325856,0.609886,80.0,3,2,0.388973,3,1,1,0,17.0,0.0,0.0,0.0,0.354753,16904.0
1,6110002,20.0,0.704183,0.658935,28.0,0.395437,0.390114,59.0,1,3,0.388973,1,1,2,1,18.0,2.0,1.0,4.506811,0.354753,18994.0
2,6110003,50.0,0.704183,0.658935,19.0,0.325856,0.390114,76.0,3,3,0.388973,5,0,5,1,22.0,18.0,3.0,3.0,0.463878,18587.0
3,6110004,32.0,0.704183,0.301141,23.0,0.395437,0.390114,73.0,5,2,0.34981,3,0,4,1,17.0,5.0,3.0,3.0,0.463878,20559.0
4,6110005,39.0,0.704183,0.658935,7.0,0.325856,0.609886,42.0,4,1,0.148669,5,1,4,0,20.0,9.0,1.0,7.0,0.463878,24991.0


In [36]:
X1.shape

(2630, 21)

In [37]:
# split data into train subset and test subset
# set 'random_state' to generate the same dataset each time you run the code 
# 'test_size' returns the proportion of data to be included in the test set

X_train = X
y_train = df_target

X_test = X1

# check the dimensions of the train & test subset using 'shape'
# print dimension of train set
print('X_train', X_train.shape)
print('y_train', y_train.shape)

# print dimension of test set
print('X_test', X_test.shape)

X_train (7810, 21)
y_train (7810,)
X_test (2630, 21)


In [38]:
df_target.value_counts(normalize=True)

Attrition
0   0.814981
1   0.185019
Name: proportion, dtype: float64

In [39]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='minority')
X_train_resampled, y_train_resampled = smote.fit_resample(X_train,y_train)
from sklearn.model_selection import KFold

# Define the number of splits
k = 10

# Create a KFold object
kf = KFold(n_splits=k)

# Split the data
for train_index, test_index in kf.split(X):
    X_train = X.iloc[train_index]
    Xtest1 = X.iloc[train_index]
    # Now, 'train_data' is the training set and 'test_data' is the test set
    
for train_index, test_index in kf.split(df_target):
    y_train = df_target.iloc[train_index]
    ytest1 = df_target.iloc[train_index]


In [40]:
# instantiate the 'RandomForestClassifier'
# pass the required number of trees in the random forest to the parameter, 'n_estimators'
# pass the 'random_state' to obtain the same samples for each time you run the code
from sklearn.ensemble import RandomForestRegressor
rf_classification = RandomForestClassifier(n_estimators = 120,criterion="gini",min_samples_split=4,max_depth=10, random_state = 42)

# use fit() to fit the model on the train set
rf_model = rf_classification.fit(X_train, y_train)

In [41]:
y_train_pred = rf_model.predict(Xtest1)
accuracy = accuracy_score(ytest1, y_train_pred)
print("Training Accuracy:", accuracy)

# Predictions on Test Data
y_test_pred = rf_model.predict(X_test)

Training Accuracy: 0.9866268316972542


In [40]:
submission_df = pd.DataFrame({'EmployeeID': df1['EmployeeID'], 'Attrition': y_test_pred})
submission_df.to_csv('Submissions1.csv', index=False)

In [42]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
# Define the hyperparameter ranges

tuned_paramaters = [{'criterion': ['entropy', 'gini','log_loss'],
                     'n_estimators': list(range(10,91,5)),
                     'max_depth': list(range(0,11,2)),
                     'max_features': ['sqrt', 'log2'],
                     'min_samples_split': [2, 5, 8, 11],
                     'min_samples_leaf': [1, 5, 9],
                     'max_leaf_nodes': [2, 5, 8, 11]}]
# Create the random search object
rf_random = RandomizedSearchCV(
    estimator = rf_classification,
    param_distributions = tuned_paramaters,
    n_iter = 100, # The number of parameter settings to try
    cv = 3, # The number of cross-validation folds to use
    verbose = 2, # The level of verbosity for the output
    random_state = 100, # The seed for the random number generator
    n_jobs = -1 # The number of jobs to run in parallel
)


In [None]:
# Fit the random search object to the training data
rf_random.fit(X_train_resampled, y_train_resampled)
# Print the best parameters and the best score
print(rf_random.best_params_)
print(rf_random.best_score_)


In [None]:
# Get the best model
best_rf = rf_random.best_estimator_
# Make predictions on the test data
y_pred = best_rf.predict(X_test)
# Evaluate the performance on the test data
y_train_pred = best_rf.predict(X_train_resampled)
accuracy = accuracy_score(y_train_resampled, y_train_pred)
print("Training Accuracy:", accuracy)

In [43]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier(base_estimator=rf_model,n_estimators=400,learning_rate=0.1, random_state=100)

# Fit the classifier
ada.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred = ada.predict(X_test)
y_train_pred = ada.predict(X_train_resampled)
accuracy = accuracy_score(y_train_resampled, y_train_pred)
print("Training Accuracy:", accuracy)

Training Accuracy: 0.998114689709348


In [44]:
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(n_estimators=150, max_depth=3,learning_rate=0.1,criterion='squared_error', random_state=42)

# Fit the classifier
gb_clf.fit(X_train_resampled, y_train_resampled)
# Make predictions
y_pred = gb_clf.predict(X_test)
y_train_pred = gb_clf.predict(X_train_resampled)
accuracy = accuracy_score(y_train_resampled, y_train_pred)
print("Training Accuracy:", accuracy)

Training Accuracy: 0.9547525530243519


In [None]:
submission_df = pd.DataFrame({'EmployeeID': df1['EmployeeID'], 'Attrition': y_pred})
submission_df.to_csv('Submissions1.csv', index=False)

In [None]:
import xgboost as xgb
model = xgb.XGBClassifier(n_estimators = 1200, learning_rate=0.01, gamma=2)
model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train_resampled)
accuracy = accuracy_score(y_train_resampled, y_train_pred)
print("Training Accuracy:", accuracy)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Create kNN classifier
knn = KNeighborsClassifier(n_neighbors=100,weights='distance')

# Fit the model
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)
y_train_pred = knn.predict(X_train)
accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", accuracy)

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter values that should be searched
param_grid = {'criterion': ['entropy', 'gini','log_loss'],
              'n_estimators': list(range(25,45,5)),
              'max_depth': list(range(0,11,2)),
              'max_features': ['sqrt', 'log2'],
              'min_samples_split': [2, 5, 8, 11],
              'min_samples_leaf': [1, 5, 9],
              'max_leaf_nodes': [2, 5, 8, 11]}

# Create a GridSearchCV object
grid = GridSearchCV(rf_model, param_grid, refit=True, verbose=3)

# Fit the model with the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(X_train, y_train)

# You can inspect the best parameters found by GridSearchCV in the best_params_ attribute, and the best estimator in the best_estimator_ attribute
print(grid.best_params_)
print(grid.best_estimator_)

In [None]:
print(grid.best_estimator_)