In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV
import joblib

# Step 1: Load the raw HR attrition CSV into a DataFrame
# NOTE(review): absolute path under one user's Windows profile; adjust it when
# running on another machine.
data_path = r"C:\Users\Hiremath\OneDrive\Desktop\WA_Fn-UseC_-HR-Employee-Attrition.csv"
data = pd.read_csv(filepath_or_buffer=data_path)

# Step 2: Exploratory Data Analysis (EDA)
# Perform EDA here to understand the data, check for missing values, data distributions, etc.

# Step 3: Data Preprocessing and Feature Engineering
# Remove the EmployeeNumber column before modelling.
data.drop(columns=['EmployeeNumber'], inplace=True)

# Encode the target as integers: 'Yes' -> 1, 'No' -> 0.
# Series.map turns any label missing from the dict into NaN.
attrition_codes = {'Yes': 1, 'No': 0}
data['Attrition'] = data['Attrition'].map(attrition_codes)

# One-hot encode the remaining string/categorical columns.
data = pd.get_dummies(data)

# Separate the target from the features, then hold out 20% of the rows for
# testing with a fixed seed so the split is repeatable.
y = data['Attrition']
X = data.drop(columns=['Attrition'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Model Building and Testing (Random Forest Classifier)
# Train a forest with default hyperparameters and a fixed seed on the training
# split. fit() returns the estimator itself, so construction and training chain.
classification_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predicted class labels for the held-out rows.
y_pred = classification_model.predict(X_test)

# Step 5: Evaluate the Model
# Score the held-out predictions three ways: overall accuracy, the confusion
# matrix, and the per-class precision/recall/F1 text report.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Display the metrics so the evaluation step actually reports its results.
print("Test Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report_str)

# Step 6: Check for Underfitting/Overfitting
# 5-fold cross-validation run on the training split only; the test rows are
# not passed in here.
cv_scores = cross_val_score(
    estimator=classification_model,
    X=X_train,
    y=y_train,
    cv=5,
)
mean_cv_score = cv_scores.mean()
print("Cross-validation Scores:", cv_scores)
print("Mean CV Score:", mean_cv_score)

# Step 7: Hyperparameter Tuning (Optional)
# Exhaustive grid: 3 * 3 * 3 * 3 = 81 parameter combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 81 combinations x 5 folds = 405 forest fits. n_jobs=-1 spreads those fits
# across all available CPU cores instead of running them one after another;
# it only changes how the fits are scheduled, not which candidates are scored.
grid_search = GridSearchCV(classification_model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# The winning parameter set, and the estimator GridSearchCV refit with it
# (refit is on by default) on the data passed to fit().
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Step 8: Save the Best Model
# Persist the tuned estimator with joblib; the relative filename resolves
# against the current working directory. The call is left as the final bare
# expression so the notebook echoes its return value (the written filenames).
model_filename = "best_model.pkl"
joblib.dump(best_model, model_filename)


Cross-validation Scores: [0.84745763 0.88085106 0.84255319 0.85106383 0.85531915]
Mean CV Score: 0.8554489722322394


['best_model.pkl']