In [190]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [191]:
# Load dataset (replace 'file_path' with the actual path)
data = pd.read_csv('C:/EmotionRecog/artifact/raw.csv')

In [192]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   AGE                       1470 non-null   int64 
 1   ATTRITION                 1470 non-null   object
 2   BUSINESSTRAVEL            1470 non-null   object
 3   DAILYRATE                 1470 non-null   int64 
 4   DEPARTMENT                1470 non-null   object
 5   DISTANCEFROMHOME          1470 non-null   int64 
 6   EDUCATION                 1470 non-null   int64 
 7   EDUCATIONFIELD            1470 non-null   object
 8   EMPLOYEECOUNT             1470 non-null   int64 
 9   EMPLOYEENUMBER            1470 non-null   int64 
 10  ENVIRONMENTSATISFACTION   1470 non-null   int64 
 11  GENDER                    1470 non-null   object
 12  HOURLYRATE                1470 non-null   int64 
 13  JOBINVOLVEMENT            1470 non-null   int64 
 14  JOBLEVEL                

In [193]:
data.nunique()

AGE                           43
ATTRITION                      2
BUSINESSTRAVEL                 3
DAILYRATE                    886
DEPARTMENT                     3
DISTANCEFROMHOME              29
EDUCATION                      5
EDUCATIONFIELD                 6
EMPLOYEECOUNT                  1
EMPLOYEENUMBER              1470
ENVIRONMENTSATISFACTION        4
GENDER                         2
HOURLYRATE                    71
JOBINVOLVEMENT                 4
JOBLEVEL                       5
JOBROLE                        9
JOBSATISFACTION                4
MARITALSTATUS                  3
MONTHLYINCOME               1349
MONTHLYRATE                 1427
NUMCOMPANIESWORKED            10
OVER18                         1
OVERTIME                       2
PERCENTSALARYHIKE             15
PERFORMANCERATING              2
RELATIONSHIPSATISFACTION       4
STANDARDHOURS                  1
STOCKOPTIONLEVEL               4
TOTALWORKINGYEARS             40
TRAININGTIMESLASTYEAR          7
WORKLIFEBA

In [173]:
attrition_counts = data['ATTRITION'].value_counts()
attrition_percentages = (attrition_counts / len(data)) * 100
print(attrition_counts)
print(attrition_percentages)

ATTRITION
0    1233
1     237
Name: count, dtype: int64
ATTRITION
0    83.877551
1    16.122449
Name: count, dtype: float64


In [194]:
# Standard hours and Employeecount is same for all instances 
data.drop(columns=['EMPLOYEECOUNT','STANDARDHOURS','OVER18','EMPLOYEENUMBER'],axis=1, inplace=True)

In [195]:
data.describe()

Unnamed: 0,AGE,DAILYRATE,DISTANCEFROMHOME,EDUCATION,ENVIRONMENTSATISFACTION,HOURLYRATE,JOBINVOLVEMENT,JOBLEVEL,JOBSATISFACTION,MONTHLYINCOME,...,PERFORMANCERATING,RELATIONSHIPSATISFACTION,STOCKOPTIONLEVEL,TOTALWORKINGYEARS,TRAININGTIMESLASTYEAR,WORKLIFEBALANCE,YEARSATCOMPANY,YEARSINCURRENTROLE,YEARSSINCELASTPROMOTION,YEARSWITHCURRMANAGER
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,2.721769,65.891156,2.729932,2.063946,2.728571,6502.931293,...,3.153741,2.712245,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,1.093082,20.329428,0.711561,1.10694,1.102846,4707.956783,...,0.360824,1.081209,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,1009.0,...,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,2.0,48.0,2.0,1.0,2.0,2911.0,...,3.0,2.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,3.0,66.0,3.0,2.0,3.0,4919.0,...,3.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,4.0,83.75,3.0,3.0,4.0,8379.0,...,3.0,4.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,4.0,100.0,4.0,5.0,4.0,19999.0,...,4.0,4.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [196]:
num_attr=[attr for attr in data.columns if data[attr].dtype!='O']
cat_attr=[attr for attr in data.columns if data[attr].dtype=='O']

print(f"Numerical Attributes:{num_attr}")
print(f"Categorical Attributes:{cat_attr}")

Numerical Attributes:['AGE', 'DAILYRATE', 'DISTANCEFROMHOME', 'EDUCATION', 'ENVIRONMENTSATISFACTION', 'HOURLYRATE', 'JOBINVOLVEMENT', 'JOBLEVEL', 'JOBSATISFACTION', 'MONTHLYINCOME', 'MONTHLYRATE', 'NUMCOMPANIESWORKED', 'PERCENTSALARYHIKE', 'PERFORMANCERATING', 'RELATIONSHIPSATISFACTION', 'STOCKOPTIONLEVEL', 'TOTALWORKINGYEARS', 'TRAININGTIMESLASTYEAR', 'WORKLIFEBALANCE', 'YEARSATCOMPANY', 'YEARSINCURRENTROLE', 'YEARSSINCELASTPROMOTION', 'YEARSWITHCURRMANAGER']
Categorical Attributes:['ATTRITION', 'BUSINESSTRAVEL', 'DEPARTMENT', 'EDUCATIONFIELD', 'GENDER', 'JOBROLE', 'MARITALSTATUS', 'OVERTIME']


In [197]:
label_enc = LabelEncoder()
data['GENDER'] = label_enc.fit_transform(data['GENDER'])
data = pd.get_dummies(data, columns=['BUSINESSTRAVEL', 'DEPARTMENT', 'EDUCATIONFIELD', 'JOBROLE', 'MARITALSTATUS', 'OVERTIME'])

In [198]:
# Convert Attrition to binary (0/1)
data['ATTRITION'] = data['ATTRITION'].apply(lambda x: 1 if x == 'Yes' else 0)
data.head()

Unnamed: 0,AGE,ATTRITION,DAILYRATE,DISTANCEFROMHOME,EDUCATION,ENVIRONMENTSATISFACTION,GENDER,HOURLYRATE,JOBINVOLVEMENT,JOBLEVEL,...,JOBROLE_Manufacturing Director,JOBROLE_Research Director,JOBROLE_Research Scientist,JOBROLE_Sales Executive,JOBROLE_Sales Representative,MARITALSTATUS_Divorced,MARITALSTATUS_Married,MARITALSTATUS_Single,OVERTIME_No,OVERTIME_Yes
0,26,1,1357,25,3,1,1,48,1,1,...,False,False,False,False,False,False,False,True,True,False
1,27,0,994,8,3,4,1,37,3,3,...,False,False,False,True,False,False,False,True,True,False
2,30,0,721,1,2,3,0,58,3,2,...,False,False,False,False,False,False,False,True,True,False
3,41,1,1360,12,3,2,0,49,3,5,...,False,True,False,False,False,False,True,False,True,False
4,34,0,1065,23,4,2,1,72,3,2,...,False,False,False,True,False,False,False,True,True,False


In [200]:
num_attr=[attr for attr in data.columns if data[attr].dtype!='O']
cat_attr=[attr for attr in data.columns if data[attr].dtype=='bool']

print(f"Numerical Attributes:{num_attr}")
print(f"Categorical Attributes:{cat_attr}")

Numerical Attributes:['AGE', 'ATTRITION', 'DAILYRATE', 'DISTANCEFROMHOME', 'EDUCATION', 'ENVIRONMENTSATISFACTION', 'GENDER', 'HOURLYRATE', 'JOBINVOLVEMENT', 'JOBLEVEL', 'JOBSATISFACTION', 'MONTHLYINCOME', 'MONTHLYRATE', 'NUMCOMPANIESWORKED', 'PERCENTSALARYHIKE', 'PERFORMANCERATING', 'RELATIONSHIPSATISFACTION', 'STOCKOPTIONLEVEL', 'TOTALWORKINGYEARS', 'TRAININGTIMESLASTYEAR', 'WORKLIFEBALANCE', 'YEARSATCOMPANY', 'YEARSINCURRENTROLE', 'YEARSSINCELASTPROMOTION', 'YEARSWITHCURRMANAGER', 'BUSINESSTRAVEL_Non-Travel', 'BUSINESSTRAVEL_Travel_Frequently', 'BUSINESSTRAVEL_Travel_Rarely', 'DEPARTMENT_Human Resources', 'DEPARTMENT_Research & Development', 'DEPARTMENT_Sales', 'EDUCATIONFIELD_Human Resources', 'EDUCATIONFIELD_Life Sciences', 'EDUCATIONFIELD_Marketing', 'EDUCATIONFIELD_Medical', 'EDUCATIONFIELD_Other', 'EDUCATIONFIELD_Technical Degree', 'JOBROLE_Healthcare Representative', 'JOBROLE_Human Resources', 'JOBROLE_Laboratory Technician', 'JOBROLE_Manager', 'JOBROLE_Manufacturing Director',

In [160]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 51 columns):
 #   Column                             Non-Null Count  Dtype
---  ------                             --------------  -----
 0   AGE                                1470 non-null   int64
 1   ATTRITION                          1470 non-null   int64
 2   DAILYRATE                          1470 non-null   int64
 3   DISTANCEFROMHOME                   1470 non-null   int64
 4   EDUCATION                          1470 non-null   int64
 5   ENVIRONMENTSATISFACTION            1470 non-null   int64
 6   GENDER                             1470 non-null   int64
 7   HOURLYRATE                         1470 non-null   int64
 8   JOBINVOLVEMENT                     1470 non-null   int64
 9   JOBLEVEL                           1470 non-null   int64
 10  JOBSATISFACTION                    1470 non-null   int64
 11  MONTHLYINCOME                      1470 non-null   int64
 12  MONTHLYRATE         

In [202]:
data.to_csv("NewRAw.csv", index=False)

In [203]:
data['ATTRITION'].nunique()

2

In [120]:
# # Univariate analysis
# for num in :
#     plt.figure(figsize=(10, 6))
#     sns.histplot(data[num], bins=30,kde=True)
#     plt.title(f'Distribution of {num}')
# plt.show()

In [121]:
# # Bivariate analysis: Box plot for numerical variable by attrition
# plt.figure(figsize=(10, 6))
# sns.boxplot(x='ATTRITION', y='DAILYRATE', data=data)
# plt.title('Daily Rate by Attrition')
# plt.show()

Selected Features: Index(['AGE', 'DAILYRATE', 'DISTANCEFROMHOME', 'EDUCATION',
       'ENVIRONMENTSATISFACTION', 'HOURLYRATE', 'JOBINVOLVEMENT', 'JOBLEVEL',
       'JOBSATISFACTION', 'MONTHLYINCOME', 'MONTHLYRATE', 'NUMCOMPANIESWORKED',
       'PERCENTSALARYHIKE', 'RELATIONSHIPSATISFACTION', 'STOCKOPTIONLEVEL',
       'TOTALWORKINGYEARS', 'TRAININGTIMESLASTYEAR', 'WORKLIFEBALANCE',
       'YEARSATCOMPANY', 'YEARSINCURRENTROLE', 'YEARSSINCELASTPROMOTION',
       'YEARSWITHCURRMANAGER', 'OVERTIME_No', 'OVERTIME_Yes',
       'MARITALSTATUS_Single'],
      dtype='object')
