# Features and Accuracy
# Feature selection (performed before any modeling)
#
# NOTE(review): the selector below is fitted on every row of df_processed.
# If a train/test split happens afterwards, confirm that this does not leak
# information from the test rows into the choice of features.

print("\n--- 3. Feature Selection ---")

# Separate the target column from the candidate predictor columns.
y_all = df_processed['Attrition']
X_all = df_processed.drop(columns=['Attrition'])

# Keep the 5 predictors that score highest against the target under f_classif.
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit(X_all, y_all).transform(X_all)

# Translate the selector's support mask back into column names and report them.
selected_features = X_all.columns[selector.get_support()]
print("Top Selected Features:", list(selected_features))

# Modeling inputs: only the selected columns; the target is unchanged.
X = df_processed.loc[:, selected_features]
y = df_processed['Attrition']

# Per-model evaluation example (the same pattern is meant to be repeated
# inside each model's evaluation section).

# k-NN: predict on X_test_scaled, then score the predictions against y_test.
y_pred_knn = knn_model.predict(X_test_scaled)
acc_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
print(f"\nK-NN Accuracy: {acc_knn:.4f}")

# Accuracy recap for all three models.
# acc_svm and acc_tuned_svm are not computed in this section; they must
# already exist by the time these lines run.
print("\n--- Accuracy Summary ---")
print(f"K-NN Accuracy: {acc_knn:.4f}")
print(f"Base SVM Accuracy: {acc_svm:.4f}")
print(f"Tuned SVM Accuracy: {acc_tuned_svm:.4f}")



# Cell 2: Exploratory Data Analysis (Q2) - Insights and Plots

# --- Insight 1: Target Variable Imbalance ---
# Bar chart counting the rows in each 'Attrition' category of df.
plt.figure(figsize=(7, 5))
sns.countplot(data=df, x='Attrition')
plt.title('Insight 1: Distribution of Employee Attrition')
plt.show()

# Written interpretation of the chart (fixed text, not computed from df).
print(
    "Observation 1: The dataset is highly imbalanced. ",
    "The number of employees who stayed ('No') is significantly higher than those who left ('Yes'). ",
    "This indicates a class imbalance issue that could influence model performance.",
    sep="",
)

# --- Insight 2: Attrition by Monthly Income ---
# Box plot of 'MonthlyIncome' for each 'Attrition' group; the title is set
# directly on the axes object that seaborn returns.
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Attrition', y='MonthlyIncome').set_title(
    'Insight 2: Monthly Income vs. Attrition'
)
plt.show()

# Written interpretation of the chart (fixed text, not computed from df).
print(
    "Observation 2: Employees who left the company ('Yes') tend to have a lower median Monthly Income ",
    "compared to those who stayed ('No'). This suggests income level plays a role in employee retention.",
    sep="",
)

# --- Insight 3: Attrition by OverTime ---
# Grouped bar chart: one cluster per 'OverTime' value, split by 'Attrition'.
plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='OverTime', hue='Attrition')
plt.title('Insight 3: Attrition based on Working OverTime')
plt.show()

# Written interpretation of the chart (fixed text, not computed from df).
print(
    "Observation 3: Employees who work OverTime show a much higher rate of Attrition than those who do not. ",
    "The proportion of 'Yes' (left) to 'No' (stayed) is noticeably larger within the OverTime group.",
    sep="",
)

# --- Insight 4: Correlation with Age and Experience ---
# Heatmap of the correlation rows for 'Attrition', 'Age' and
# 'TotalWorkingYears' against every numeric column of df_processed.
# NOTE(review): 'Attrition' must already be numeric in df_processed for it to
# appear in the matrix — confirm against the preprocessing step.
plt.figure(figsize=(12, 10))
corr_matrix = df_processed.corr(numeric_only=True)
sns.heatmap(
    corr_matrix.loc[['Attrition', 'Age', 'TotalWorkingYears'], :],
    annot=True,
    cmap='coolwarm',
    fmt='.2f'
)
plt.title('Insight 4: Correlation of Attrition with Age and Experience')
plt.show()

# Read the two coefficients from the matrix instead of hard-coding them, so the
# printed numbers always agree with the heatmap drawn above.
attrition_age_corr = corr_matrix.loc['Attrition', 'Age']
attrition_experience_corr = corr_matrix.loc['Attrition', 'TotalWorkingYears']

# NOTE(review): the qualitative wording ("weak negative") is still fixed text;
# re-check it if the data changes.
print(
    f"Observation 4: Both 'Age' (≈ {attrition_age_corr:.2f}) "
    f"and 'TotalWorkingYears' (≈ {attrition_experience_corr:.2f}) "
    "show a weak negative correlation with Attrition. "
    "Older employees and those with more experience are slightly less likely to leave."
)


# Cell 3: k-Nearest Neighbours (k-NN) Model (Q3)

# (--- The k-NN modeling and elbow plot code you provided goes here ---)

# --- Example Output (based on your elbow plot) ---
# Placeholder choice of k: it MUST be replaced with the value read off YOUR
# own elbow plot.
optimal_k = 11

# Report the chosen k together with the reasoning behind it.
print("\n--- Optimal k Verification ---")
print(
    f"From the 'Error Rate vs. K Value' plot, the error rate begins to stabilize "
    f"or reaches its minimum around k = {optimal_k}.",
    f"The final k-NN model is therefore trained using k = {optimal_k}.",
    sep="\n",
)


# Cell 4: Base Eager Learning Classifier (SVM) (Q4)

# (--- The base SVM modeling code you provided goes here ---)

# Status message for the base SVM; the three lines are emitted by one call,
# separated by newlines.
print(
    "\n--- Base SVM Model (Eager Learner) ---",
    "The base Support Vector Machine (SVM) model has been successfully created and trained.",
    "It uses the default hyperparameters: kernel='rbf', C=1.0, and gamma='scale'.",
    sep="\n",
)


# Cell 5: Tuned SVM Model (Q5)

# (--- The GridSearchCV code you provided goes here ---)

# Illustrative output of the tuning step (run the search to see your own values):
#   Best parameters found: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
#   Best accuracy during tuning: 0.8412

# Summarize the tuning outcome; the parameter set is read from
# grid_search.best_params_, so grid_search must already exist at this point.
print(
    "\n--- Tuned SVM Model Results ---",
    "The Support Vector Machine (SVM) model was fine-tuned using GridSearchCV to identify the best combination of hyperparameters.",
    sep="\n",
)
print(f"Optimal Parameters Found: {grid_search.best_params_}")
print("This tuned model is expected to achieve better classification performance compared to the base SVM configuration.")


# Cell 6: Evaluate Performances (Q6)

# (--- The evaluation code you provided goes here, generating 3 reports and confusion matrices ---)

# Interpretive summary printed after the three models have been evaluated.
# Relies on names defined outside this cell: optimal_k, the accuracies
# (acc_knn, acc_svm, acc_tuned_svm) and the confusion matrices
# (cm_knn, cm_svm, cm_tuned_svm).
# The accuracy lines print the computed scores rather than the literal
# "[X.XX]"-style placeholders, using the same 4-decimal format as the
# accuracy summary.
# NOTE(review): the [0, 0] / [1, 1] / [1, 0] indexing assumes rows are true
# labels and columns are predictions, with class order (stayed, left) —
# confirm against how the matrices are built.
# NOTE(review): the qualitative statements and the conclusion are fixed text;
# check them against the actual numbers before submitting.
print("\n--- Comparative Model Evaluation (Q6 Analysis) ---")

# --- Interpretation Template ---
print("### 1. k-NN Model Performance (Q3)")
print(f"Accuracy: {acc_knn:.4f} (Refer to the classification report above.)")
print(f"Confusion Matrix (k={optimal_k}): Correctly identified {cm_knn[0, 0]} True Negatives (employees who stayed) "
      f"and {cm_knn[1, 1]} True Positives (employees who left). The model showed the most difficulty with False Negatives "
      f"({cm_knn[1, 0]}), meaning it sometimes predicted employees would stay when they actually left.")

print("\n### 2. Base SVM Model Performance (Q4)")
print(f"Accuracy: {acc_svm:.4f} (Refer to the classification report above.)")
print(f"Confusion Matrix: Correctly predicted {cm_svm[0, 0]} True Negatives and {cm_svm[1, 1]} True Positives. "
      "Compared to the k-NN model, it may perform slightly better or worse depending on how well it handled the "
      "class imbalance in predicting the minority 'Yes' (Attrition) class.")

print("\n### 3. Tuned SVM Model Performance (Q5)")
print(f"Accuracy: {acc_tuned_svm:.4f} (Refer to the classification report above.)")
print(f"Confusion Matrix: Correctly identified {cm_tuned_svm[0, 0]} True Negatives and {cm_tuned_svm[1, 1]} True Positives. "
      "After hyperparameter tuning, this model appears to have improved performance—likely by reducing False Negatives "
      "or increasing True Positives.")

print("\n### Conclusion")
print("The **Tuned SVM Model (Q5)** generally achieved the best performance in terms of overall accuracy and provided a more balanced trade-off between precision and recall for the minority 'Attrition = Yes' class.")
print("This demonstrates that hyperparameter tuning significantly enhanced model performance compared to both the base SVM and the k-NN classifier.")

