In [229]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, TargetEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [230]:
df = pd.read_csv(r"C:\Users\Maftuna\Downloads\WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [231]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [232]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [233]:
# missing values
df.isnull().sum()

Age                        0
Attrition                  0
BusinessTravel             0
DailyRate                  0
Department                 0
                          ..
WorkLifeBalance            0
YearsAtCompany             0
YearsInCurrentRole         0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
Length: 35, dtype: int64

In [234]:
object_cols = df.select_dtypes(include='object').columns
object_cols

Index(['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
       'JobRole', 'MaritalStatus', 'Over18', 'OverTime'],
      dtype='object')

In [235]:
for col in object_cols:
    unique_count = df[col].nunique()
    print(f"{col}: {unique_count} unique values")

Attrition: 2 unique values
BusinessTravel: 3 unique values
Department: 3 unique values
EducationField: 6 unique values
Gender: 2 unique values
JobRole: 9 unique values
MaritalStatus: 3 unique values
Over18: 1 unique values
OverTime: 2 unique values


In [236]:
object_cols = df.select_dtypes(include=['object']).columns

for col in object_cols:
    uniques = df[col].unique()  # shu ustundagi barcha unikal qiymatlar
    print(f"{col} ({len(uniques)} ta unique qiymat): {list(uniques)}")


Attrition (2 ta unique qiymat): ['Yes', 'No']
BusinessTravel (3 ta unique qiymat): ['Travel_Rarely', 'Travel_Frequently', 'Non-Travel']
Department (3 ta unique qiymat): ['Sales', 'Research & Development', 'Human Resources']
EducationField (6 ta unique qiymat): ['Life Sciences', 'Other', 'Medical', 'Marketing', 'Technical Degree', 'Human Resources']
Gender (2 ta unique qiymat): ['Female', 'Male']
JobRole (9 ta unique qiymat): ['Sales Executive', 'Research Scientist', 'Laboratory Technician', 'Manufacturing Director', 'Healthcare Representative', 'Manager', 'Sales Representative', 'Research Director', 'Human Resources']
MaritalStatus (3 ta unique qiymat): ['Single', 'Married', 'Divorced']
Over18 (1 ta unique qiymat): ['Y']
OverTime (2 ta unique qiymat): ['Yes', 'No']


In [237]:
label_col =  ['Attrition', 'Gender', 'OverTime', 'Over18']
le = LabelEncoder()

for col in label_col:
    df[col] = le.fit_transform(df[col])

In [238]:
target_cols = ['EducationField', 'JobRole']

for col in target_cols:
    mapping = df.groupby(col)['Attrition'].mean()
    df[col + '_encoded'] = df[col].map(mapping)

# Kerak bo'lsa, endi eski (kategorik) ustunlarni o'chiramiz
df.drop(columns=target_cols, inplace=True)



In [239]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,EducationField_encoded,JobRole_encoded
0,41,1,Travel_Rarely,1102,Sales,1,2,1,1,2,0,94,3,2,4,Single,5993,19479,8,0,1,11,3,1,80,0,8,0,1,6,4,0,5,0.15,0.17
1,49,0,Travel_Frequently,279,Research & Development,8,1,1,2,3,1,61,2,2,2,Married,5130,24907,1,0,0,23,4,4,80,1,10,3,3,10,7,1,7,0.15,0.16
2,37,1,Travel_Rarely,1373,Research & Development,2,2,1,4,4,1,92,2,1,3,Single,2090,2396,6,0,1,15,3,2,80,0,7,3,3,0,0,0,0,0.13,0.24
3,33,0,Travel_Frequently,1392,Research & Development,3,4,1,5,4,0,56,3,1,3,Married,2909,23159,1,0,1,11,3,3,80,0,8,3,3,8,7,3,0,0.15,0.16
4,27,0,Travel_Rarely,591,Research & Development,2,1,1,7,1,1,40,3,1,2,Married,3468,16632,9,0,0,12,3,4,80,1,6,3,3,2,2,2,2,0.14,0.24


In [240]:
object_cols = df.select_dtypes(include=['object']).columns

for col in object_cols:
    uniques = df[col].unique()  # shu ustundagi barcha unikal qiymatlar
    print(f"{col} ({len(uniques)} ta unique qiymat): {list(uniques)}")

BusinessTravel (3 ta unique qiymat): ['Travel_Rarely', 'Travel_Frequently', 'Non-Travel']
Department (3 ta unique qiymat): ['Sales', 'Research & Development', 'Human Resources']
MaritalStatus (3 ta unique qiymat): ['Single', 'Married', 'Divorced']


In [241]:
categorical_cols = ['BusinessTravel', 'Department', 'MaritalStatus']

# OneHotEncoder obyektini yaratamiz (sparse=False parametri massivni zich emas formatda qaytaradi):
encoder = OneHotEncoder(sparse_output=False, drop=None)

# Faqat kategorik ustunlarni kodlash:
encoded_array = encoder.fit_transform(df[categorical_cols])

# Yangi ustun nomlarini olish:
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)

# Natijani DataFrame formatiga o'tkazish:
df_encoded = pd.DataFrame(encoded_array, columns=encoded_feature_names)

# Asl DataFrame da qolgan ustunlarni qo'shish:
df_final = pd.concat([df.drop(columns=categorical_cols), df_encoded], axis=1)



In [242]:
df_final.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,EducationField_encoded,JobRole_encoded,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
0,41,1,1102,1,2,1,1,2,0,94,3,2,4,5993,19479,8,0,1,11,3,1,80,0,8,0,1,6,4,0,5,0.15,0.17,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,49,0,279,8,1,1,2,3,1,61,2,2,2,5130,24907,1,0,0,23,4,4,80,1,10,3,3,10,7,1,7,0.15,0.16,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37,1,1373,2,2,1,4,4,1,92,2,1,3,2090,2396,6,0,1,15,3,2,80,0,7,3,3,0,0,0,0,0.13,0.24,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,33,0,1392,3,4,1,5,4,0,56,3,1,3,2909,23159,1,0,1,11,3,3,80,0,8,3,3,8,7,3,0,0.15,0.16,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,27,0,591,2,1,1,7,1,1,40,3,1,2,3468,16632,9,0,0,12,3,4,80,1,6,3,3,2,2,2,2,0.14,0.24,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


In [243]:
X = df_final.drop('Attrition', axis=1)
y = df_final['Attrition']

In [244]:
# Scaling 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [245]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [246]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, lr_predictions)

print("Logistic Regression Accuracy:", lr_acc)

Logistic Regression Accuracy: 0.8752834467120182


In [247]:
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)
dt_acc = accuracy_score(y_test, dt_predictions)

print("Decision Tree Accuracy:", dt_acc)

Decision Tree Accuracy: 0.8049886621315193
