In [20]:
from imblearn.combine import SMOTETomek
from collections import Counter
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE


In [33]:
# Load dataset (replace 'file_path' with the actual path)
data = pd.read_csv('C:/EmotionRecog/src/EmpAttrition/Notebook/NewRAw.csv')

In [35]:
num_attr=[attr for attr in data.columns if data[attr].dtype!='O']
cat_attr=[attr for attr in data.columns if data[attr].dtype=='bool']

print(f"Numerical Attributes:{num_attr}")
print(f"Categorical Attributes:{cat_attr}")

Numerical Attributes:['AGE', 'ATTRITION', 'DAILYRATE', 'DISTANCEFROMHOME', 'EDUCATION', 'ENVIRONMENTSATISFACTION', 'GENDER', 'HOURLYRATE', 'JOBINVOLVEMENT', 'JOBLEVEL', 'JOBSATISFACTION', 'MONTHLYINCOME', 'MONTHLYRATE', 'NUMCOMPANIESWORKED', 'PERCENTSALARYHIKE', 'PERFORMANCERATING', 'RELATIONSHIPSATISFACTION', 'STOCKOPTIONLEVEL', 'TOTALWORKINGYEARS', 'TRAININGTIMESLASTYEAR', 'WORKLIFEBALANCE', 'YEARSATCOMPANY', 'YEARSINCURRENTROLE', 'YEARSSINCELASTPROMOTION', 'YEARSWITHCURRMANAGER', 'BUSINESSTRAVEL_Non-Travel', 'BUSINESSTRAVEL_Travel_Frequently', 'BUSINESSTRAVEL_Travel_Rarely', 'DEPARTMENT_Human Resources', 'DEPARTMENT_Research & Development', 'DEPARTMENT_Sales', 'EDUCATIONFIELD_Human Resources', 'EDUCATIONFIELD_Life Sciences', 'EDUCATIONFIELD_Marketing', 'EDUCATIONFIELD_Medical', 'EDUCATIONFIELD_Other', 'EDUCATIONFIELD_Technical Degree', 'JOBROLE_Healthcare Representative', 'JOBROLE_Human Resources', 'JOBROLE_Laboratory Technician', 'JOBROLE_Manager', 'JOBROLE_Manufacturing Director',

In [36]:

scaler = MinMaxScaler()
Ndata = scaler.fit_transform(data[num_attr])

# Convert the normalized data back to a DataFrame using the original column names
X_normalized_df = pd.DataFrame(Ndata, columns=data[num_attr].columns)

# Display the first few rows of the normalized DataFrame
X_normalized_df.head()

Unnamed: 0,AGE,ATTRITION,DAILYRATE,DISTANCEFROMHOME,EDUCATION,ENVIRONMENTSATISFACTION,GENDER,HOURLYRATE,JOBINVOLVEMENT,JOBLEVEL,...,JOBROLE_Manufacturing Director,JOBROLE_Research Director,JOBROLE_Research Scientist,JOBROLE_Sales Executive,JOBROLE_Sales Representative,MARITALSTATUS_Divorced,MARITALSTATUS_Married,MARITALSTATUS_Single,OVERTIME_No,OVERTIME_Yes
0,0.190476,1.0,0.898354,0.857143,0.5,0.0,1.0,0.257143,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.214286,0.0,0.638511,0.25,0.5,1.0,1.0,0.1,0.666667,0.5,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.285714,0.0,0.443092,0.0,0.25,0.666667,0.0,0.4,0.666667,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.547619,1.0,0.900501,0.392857,0.5,0.333333,0.0,0.271429,0.666667,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.380952,0.0,0.689334,0.785714,0.75,0.333333,1.0,0.6,0.666667,0.25,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


In [38]:
newData = pd.concat([X_normalized_df, data[cat_attr].reset_index(drop=True)], axis=1)
newData.head()

Unnamed: 0,AGE,ATTRITION,DAILYRATE,DISTANCEFROMHOME,EDUCATION,ENVIRONMENTSATISFACTION,GENDER,HOURLYRATE,JOBINVOLVEMENT,JOBLEVEL,...,JOBROLE_Manufacturing Director,JOBROLE_Research Director,JOBROLE_Research Scientist,JOBROLE_Sales Executive,JOBROLE_Sales Representative,MARITALSTATUS_Divorced,MARITALSTATUS_Married,MARITALSTATUS_Single,OVERTIME_No,OVERTIME_Yes
0,0.190476,1.0,0.898354,0.857143,0.5,0.0,1.0,0.257143,0.0,0.0,...,False,False,False,False,False,False,False,True,True,False
1,0.214286,0.0,0.638511,0.25,0.5,1.0,1.0,0.1,0.666667,0.5,...,False,False,False,True,False,False,False,True,True,False
2,0.285714,0.0,0.443092,0.0,0.25,0.666667,0.0,0.4,0.666667,0.25,...,False,False,False,False,False,False,False,True,True,False
3,0.547619,1.0,0.900501,0.392857,0.5,0.333333,0.0,0.271429,0.666667,1.0,...,False,True,False,False,False,False,True,False,True,False
4,0.380952,0.0,0.689334,0.785714,0.75,0.333333,1.0,0.6,0.666667,0.25,...,False,False,False,True,False,False,False,True,True,False


In [39]:
data['ATTRITION'].nunique()

2

In [50]:
#Split the dataset
X = newData.drop('ATTRITION', axis=1)
y = newData['ATTRITION']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [51]:
newData['ATTRITION'].nunique()

2

In [52]:
y_train.nunique()

2

In [53]:

# Counting the number of instances in each class before oversampling
counter = Counter(y_train)
print('Before', counter)

# Oversampling the train dataset using SMOTE + Tomek
smtom = SMOTETomek(random_state=139)
X_train_smtom, y_train_smtom = smtom.fit_resample(X_train, y_train)

# Counting the number of instances in each class after oversampling
counter = Counter(y_train_smtom)
print('After', counter)

Before Counter({0.0: 988, 1.0: 188})
After Counter({0.0: 986, 1.0: 986})


In [54]:
# Variance Threshold
selector = VarianceThreshold(threshold=0.01)
X_train_var = selector.fit_transform(X_train)

In [55]:
# Recursive Feature Elimination (RFE) with Random Forest
model = RandomForestClassifier(random_state=42)
rfe = RFE(model, n_features_to_select=25)
X_train_rfe = rfe.fit_transform(X_train, y_train)

In [56]:
# Selected features
selected_features = X_train.columns[rfe.support_]
print("Selected Features:", selected_features)

Selected Features: Index(['AGE', 'DAILYRATE', 'DISTANCEFROMHOME', 'EDUCATION',
       'ENVIRONMENTSATISFACTION', 'HOURLYRATE', 'JOBINVOLVEMENT', 'JOBLEVEL',
       'JOBSATISFACTION', 'MONTHLYINCOME', 'MONTHLYRATE', 'NUMCOMPANIESWORKED',
       'PERCENTSALARYHIKE', 'RELATIONSHIPSATISFACTION', 'STOCKOPTIONLEVEL',
       'TOTALWORKINGYEARS', 'TRAININGTIMESLASTYEAR', 'WORKLIFEBALANCE',
       'YEARSATCOMPANY', 'YEARSINCURRENTROLE', 'YEARSSINCELASTPROMOTION',
       'YEARSWITHCURRMANAGER', 'BUSINESSTRAVEL_Travel_Frequently',
       'OVERTIME_No', 'OVERTIME_No'],
      dtype='object')
