# Step1: Load Data

In [2]:
import pandas as pd
import warnings
data = pd.read_csv('../../Excel/employee_data.csv')
warnings.filterwarnings("ignore")

In [3]:
data.shape

(4653, 9)

In [4]:
data.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1


In [5]:
data.isna().sum()

Education                    0
JoiningYear                  0
City                         0
PaymentTier                  0
Age                          0
Gender                       0
EverBenched                  0
ExperienceInCurrentDomain    0
LeaveOrNot                   0
dtype: int64

In [6]:
data.duplicated().sum()

np.int64(1889)

In [7]:
# data.drop_duplicates(inplace=True)

In [8]:
data["Gender"] = data["Gender"].apply(lambda x: 1 if x == "Male" else 0)
data["EverBenched"] = data["EverBenched"].apply(lambda x: 1 if x == "Yes" else 0)
data["Education"] = data["Education"].apply(lambda x: 1 if x == "Bachelors" else 2 if x == "Masters" else 3)

In [9]:
encoded = pd.get_dummies(data[['City', 'JoiningYear']], drop_first=True, dtype=int)
data = pd.concat([data, encoded], axis=1)
data.drop(columns=["City", "JoiningYear"], inplace=True, axis=1)
data.head()


data = pd.get_dummies(data, dtype=int)

# Step2: Separate input data and output data

In [10]:
X = data.drop('LeaveOrNot',axis=1)
y = data['LeaveOrNot']

# Step3: Split train and test data

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0, test_size=.39)

# Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
logr = LogisticRegression()
logr.fit(X_train,y_train)

In [21]:
y_pred_train = logr.predict(X_train)
accuracy_score(y_train, y_pred_train) * 100

72.33967582804792

In [22]:
y_pred_test = logr.predict(X_test)
accuracy_score(y_test, y_pred_test) * 100

72.8374655647383

# Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train,y_train)

In [29]:
y_pred_train = rf.predict(X_train)
accuracy_score(y_train, y_pred_train) * 100

87.17059639389736

In [30]:
y_pred_test = rf.predict(X_test)
accuracy_score(y_test, y_pred_test) * 100

73.80878231080659

In [26]:
best_val = 0
best_scores = [0, 0]
best_split = 0

for i in range(30, 70):  # test_size from 0.30 to 0.50
    test_size = i / 100
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)

    logr.fit(X_train, y_train)

    y_pred_train = rf.predict(X_train)
    y_pred_test = rf.predict(X_test)

    a = accuracy_score(y_train, y_pred_train) * 100
    b = accuracy_score(y_test, y_pred_test) * 100

    if a + b > best_val:
        best_val = a + b
        best_scores = [a, b]
        best_split = test_size
        print(f"Train Acc: {a:.2f}%, Test Acc: {b:.2f}%, Split: {best_split}, Combined: {best_val:.2f}%")

Train Acc: 84.40%, Test Acc: 75.21%, Split: 0.3, Combined: 159.62%
Train Acc: 84.45%, Test Acc: 75.40%, Split: 0.31, Combined: 159.85%
Train Acc: 84.86%, Test Acc: 75.13%, Split: 0.33, Combined: 159.99%
Train Acc: 84.95%, Test Acc: 75.24%, Split: 0.34, Combined: 160.19%
Train Acc: 85.05%, Test Acc: 75.32%, Split: 0.35, Combined: 160.38%
Train Acc: 85.19%, Test Acc: 75.36%, Split: 0.36, Combined: 160.54%
Train Acc: 85.33%, Test Acc: 75.38%, Split: 0.37, Combined: 160.71%
Train Acc: 85.40%, Test Acc: 75.52%, Split: 0.38, Combined: 160.93%
Train Acc: 85.45%, Test Acc: 75.70%, Split: 0.39, Combined: 161.15%
Train Acc: 85.52%, Test Acc: 75.83%, Split: 0.4, Combined: 161.36%
Train Acc: 85.46%, Test Acc: 76.15%, Split: 0.41, Combined: 161.62%
Train Acc: 85.47%, Test Acc: 76.37%, Split: 0.42, Combined: 161.84%
Train Acc: 85.44%, Test Acc: 76.61%, Split: 0.43, Combined: 162.06%
Train Acc: 85.49%, Test Acc: 76.76%, Split: 0.44, Combined: 162.25%
Train Acc: 85.54%, Test Acc: 76.89%, Split: 0.45, 