In [1]:
import pandas as pd
import pickle
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report, roc_curve
from imblearn.over_sampling import SMOTE
from collections import Counter

In [2]:
df=pd.read_csv("D:/4th_project/dataset/Employee-Attrition - Employee-Attrition.csv")

In [3]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [5]:
# Select relevant columns
df = df[['Attrition', 'Age', 'Department', 'MonthlyIncome', 'JobSatisfaction', 
         'YearsAtCompany', 'MaritalStatus', 'OverTime']]

In [6]:
df.head()

Unnamed: 0,Attrition,Age,Department,MonthlyIncome,JobSatisfaction,YearsAtCompany,MaritalStatus,OverTime
0,Yes,41,Sales,5993,4,6,Single,Yes
1,No,49,Research & Development,5130,2,10,Married,No
2,Yes,37,Research & Development,2090,3,0,Single,Yes
3,No,33,Research & Development,2909,3,8,Married,Yes
4,No,27,Research & Development,3468,2,2,Married,No


In [7]:
# Encode categorical features
from sklearn.preprocessing import OneHotEncoder, StandardScaler
cat_cols = ['Department', 'MaritalStatus', 'OverTime']
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(df[cat_cols])
encoded_df = pd.DataFrame(encoded_data.astype(int), columns=encoder.get_feature_names_out(cat_cols))


In [8]:
encoded_df.head()

Unnamed: 0,Department_Human Resources,Department_Research & Development,Department_Sales,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,0,0,1,0,0,1,0,1
1,0,1,0,0,1,0,1,0
2,0,1,0,0,0,1,0,1
3,0,1,0,0,1,0,0,1
4,0,1,0,0,1,0,1,0


In [9]:
# Select numerical columns
num_cols = ['Age', 'MonthlyIncome', 'YearsAtCompany', 'JobSatisfaction']
num_df = df[num_cols]

# Scale numerical features
scaler = StandardScaler()
num_df_scaled = pd.DataFrame(scaler.fit_transform(num_df), columns=num_cols)

In [10]:
num_df_scaled.head()

Unnamed: 0,Age,MonthlyIncome,YearsAtCompany,JobSatisfaction
0,0.44635,-0.10835,-0.164613,1.153254
1,1.322365,-0.291719,0.488508,-0.660853
2,0.008343,-0.937654,-1.144294,0.2462
3,-0.429664,-0.763634,0.161947,0.2462
4,-1.086676,-0.644858,-0.817734,-0.660853


In [11]:
# Save the scaler to a pickle file
with open("D:/4th_project/model/attrition_scaler_new.pkl", "wb") as f:
    pickle.dump(scaler, f)

In [12]:
# Combine all features
X = pd.concat([num_df_scaled, encoded_df], axis=1)

# Encode target
y = df['Attrition'].replace({'Yes': 1, 'No': 0})

  y = df['Attrition'].replace({'Yes': 1, 'No': 0})


In [13]:
# Print class distribution before SMOTE
print("Class distribution before SMOTE:", Counter(y))

# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Print class distribution after SMOTE
print("Class distribution after SMOTE:", Counter(y_resampled))

Class distribution before SMOTE: Counter({0: 1233, 1: 237})
Class distribution after SMOTE: Counter({1: 1233, 0: 1233})


In [14]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.3, random_state=42
)

In [15]:
from sklearn.linear_model import LogisticRegression

# Train model
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train, y_train)


# Evaluate
y_pred = lr_model.predict(X_test)
y_proba = lr_model.predict_proba(X_test)[:, 1]

print("\n📈 Logistic Regression Metrics:")
print(f"✅ Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"✅ F1-Score: {f1_score(y_test, y_pred):.4f}")
print(f"✅ AUC-ROC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))



📈 Logistic Regression Metrics:
✅ Accuracy: 0.7878
✅ F1-Score: 0.7887
✅ AUC-ROC: 0.8534

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.79      0.79       369
           1       0.79      0.79      0.79       371

    accuracy                           0.79       740
   macro avg       0.79      0.79      0.79       740
weighted avg       0.79      0.79      0.79       740



In [16]:
# Train model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)


# Evaluate
y_pred = dt_model.predict(X_test)
y_proba = dt_model.predict_proba(X_test)[:, 1]

print("\n🌳 Decision Tree Metrics:")
print(f"✅ Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"✅ F1-Score: {f1_score(y_test, y_pred):.4f}")
print(f"✅ AUC-ROC: {roc_auc_score(y_test, y_proba):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


🌳 Decision Tree Metrics:
✅ Accuracy: 0.8176
✅ F1-Score: 0.8231
✅ AUC-ROC: 0.8175

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.79      0.81       369
           1       0.80      0.85      0.82       371

    accuracy                           0.82       740
   macro avg       0.82      0.82      0.82       740
weighted avg       0.82      0.82      0.82       740



In [20]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

# Train SVM model (with probability estimates)
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train, y_train)

# Evaluate
y_pred_svm = svm_model.predict(X_test)
y_proba_svm = svm_model.predict_proba(X_test)[:, 1]

print("\n🤖 SVM Metrics:")
print(f"✅ Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}")
print(f"✅ F1-Score: {f1_score(y_test, y_pred_svm):.4f}")
print(f"✅ AUC-ROC: {roc_auc_score(y_test, y_proba_svm):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))



🤖 SVM Metrics:
✅ Accuracy: 0.7986
✅ F1-Score: 0.8000
✅ AUC-ROC: 0.8775

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.79      0.80       369
           1       0.80      0.80      0.80       371

    accuracy                           0.80       740
   macro avg       0.80      0.80      0.80       740
weighted avg       0.80      0.80      0.80       740



In [None]:
# Save model
with open("D:/4th_project/model/new_attrition_decision_tree.pkl", "wb") as f:
    pickle.dump(dt_model, f)