# Employee Status Prediction

This notebook implements binary classification models (SVM and ANN) to predict employee status (0=Active, 1=Left) based on various features.

## 1. Data Preprocessing

Perform comprehensive preprocessing on the given dataset, which contains several data-quality issues: missing values, outliers, inconsistent formatting, and duplicate rows. Encode categorical features where applicable, and split the data into training and test sets for modeling.

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

# Load the dataset
# The CSV location can be overridden with the EMPLOYEE_DATA_PATH environment
# variable so the notebook runs on other machines; when the variable is unset
# the original location is used unchanged.
DATA_PATH = os.environ.get(
    'EMPLOYEE_DATA_PATH',
    '/home/meher/Things/FA25 - 5th sem/ML/Lab/terminal/dataset/Group-2 Data.csv',
)
df = pd.read_csv(DATA_PATH)
df.head()

### 1.1 Data Exploration and Cleaning

In [None]:
# Check for duplicates
print(f"Duplicates: {df.duplicated().sum()}")
df.drop_duplicates(inplace=True)

# Check for missing values
print("Missing values:\n", df.isnull().sum())

# Rows without a label cannot be used for supervised training or evaluation.
# Drop them instead of imputing the target (a median of 0/1 labels could even
# produce 0.5, which is not a valid class).
df.dropna(subset=['Status'], inplace=True)

# Impute missing values
# For numerical columns, use median.
# Assign the result back: `df[col].fillna(..., inplace=True)` operates on a
# temporary Series and does not reliably update `df` (it warns in pandas 2.x
# and is a no-op under Copy-on-Write).
num_cols = df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

# For categorical columns, use mode
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() is empty when the whole column is missing; leave such a column as-is
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Handle negative values in Age and Experience
# NOTE(review): assumes a negative value is a sign-entry error rather than a
# sentinel for "unknown" -- confirm against the data dictionary.
df['Age'] = df['Age'].abs()
df['Experience'] = df['Experience'].abs()

# Handle outliers (using IQR for Age, Salary, Experience)
# Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are capped at the nearest bound.
for col in ['Age', 'Salary', 'Experience']:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

# Standardize Education levels
education_map = {
    'B.S.': 'Bachelor', 'BACHELOR': 'Bachelor', 'Bachelor': 'Bachelor',
    'M.S.': 'Master', 'Master': 'Master', 'masters': 'Master',
    'H.S.': 'High School', 'High School': 'High School',
    'phd': 'PhD', 'PhD': 'PhD'
}
# Match on a trimmed, lower-cased key so spelling variants that differ only in
# case or surrounding whitespace map to the same level instead of 'Other'.
education_lookup = {raw.strip().lower(): level for raw, level in education_map.items()}
df['Education'] = (
    df['Education']
    .astype(str)
    .str.strip()
    .str.lower()
    .map(education_lookup)
    .fillna('Other')  # anything unrecognised is grouped under 'Other'
)

df.head()

### 1.2 Feature Encoding and Splitting

In [None]:
# Encode categorical features
# Use one fitted encoder per column and keep them, so the integer codes can be
# mapped back to the original labels later (a single shared encoder only
# remembers the last column it was fitted on).
label_encoders = {}
for col in ['Education', 'Department']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Drop non-numeric/unnecessary columns (like ID or Date if present)
# Assuming first column is ID and one column is Date
# NOTE(review): the first column is dropped by position, so re-running this
# cell without reloading `df` removes a further column each time.
df = df.drop(columns=[df.columns[0], 'Joining Date'], errors='ignore')

# Split data
X = df.drop('Status', axis=1)
y = df['Status']

# Stratify on the label so the train and test sets keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scaling
# Fit the scaler on the training split only, then apply it to the test split,
# so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

## 2. Support Vector Machine (SVM)

Implement SVM for binary classification with GridSearchCV for hyperparameter tuning, exploring different kernels and regularization parameters. Report best parameters and evaluate model performance on test data.

In [None]:
# Hyperparameter grid for the SVM.
# `gamma` has no effect on the linear kernel, so it is searched only for the
# kernels that use it; this avoids fitting duplicate linear candidates.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf', 'poly'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

# probability=True enables predict_proba (needed for the ROC curve). The
# probability calibration shuffles data internally, so random_state is fixed
# to make the reported probabilities repeatable between runs.
grid = GridSearchCV(
    SVC(probability=True, random_state=42),
    param_grid,
    refit=True,  # refit the best candidate on the full training set
    verbose=1,
    cv=3,
)
grid.fit(X_train, y_train)

print(f"Best Parameters: {grid.best_params_}")
svm_preds = grid.predict(X_test)
# Second column of predict_proba is the probability of the positive class
svm_probs = grid.predict_proba(X_test)[:, 1]

print("SVM Classification Report:")
print(classification_report(y_test, svm_preds))

## 3. Artificial Neural Network (ANN)

Build a three-layer Artificial Neural Network (two hidden layers plus an output layer) using ReLU activation in the hidden layers and sigmoid activation in the output layer. Apply dropout regularization after each hidden layer. Train the model for 50 epochs, plot the training and validation accuracy/loss curves, and evaluate its performance on the test set. Display the complete model architecture summary.

In [None]:
# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation, dropout masks and batch shuffling are repeatable on a fresh
# kernel. (Bit-exact results on GPU may still vary.)
tf.keras.utils.set_random_seed(42)

# Two hidden ReLU layers, each followed by dropout, and a sigmoid output unit
# that yields the probability of the positive class.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),  # explicit input layer instead of input_shape on Dense
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# validation_split holds out the last 20% of the training data for validation
history = model.fit(X_train, y_train, epochs=50, validation_split=0.2, verbose=1)

# Plotting: accuracy on the left, loss on the right
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_acc.plot(history.history['accuracy'], label='Train Accuracy')
ax_acc.plot(history.history['val_accuracy'], label='Val Accuracy')
ax_acc.set(title='Model Accuracy', xlabel='Epoch', ylabel='Accuracy')
ax_acc.legend()

ax_loss.plot(history.history['loss'], label='Train Loss')
ax_loss.plot(history.history['val_loss'], label='Val Loss')
ax_loss.set(title='Model Loss', xlabel='Epoch', ylabel='Loss')
ax_loss.legend()
plt.show()

# predict returns one column of probabilities; flatten it to a 1-D array and
# threshold at 0.5 to obtain class labels.
ann_probs = model.predict(X_test).flatten()
ann_preds = (ann_probs > 0.5).astype(int)

print("ANN Classification Report:")
print(classification_report(y_test, ann_preds))

## 4. Compare the Results

Compare SVM and ANN performance by generating classification reports, confusion matrices, and ROC curves.

In [None]:
def plot_confusion_matrix(y_true, y_pred, title):
    """Render the confusion matrix of a set of predictions as a heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        title: Text placed above the heatmap.

    The figure is displayed immediately; nothing is returned.
    """
    counts = confusion_matrix(y_true, y_pred)
    # Integer cell annotations on a blue colour scale
    heatmap_ax = sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
    heatmap_ax.set(title=title, xlabel='Predicted', ylabel='Actual')
    plt.show()

# Confusion matrices, one figure per model
for preds, heading in (
    (svm_preds, 'SVM Confusion Matrix'),
    (ann_preds, 'ANN Confusion Matrix'),
):
    plot_confusion_matrix(y_test, preds, heading)

# ROC Curves
# Compute the curve points and the area under each curve up front, then draw
# both models on a single set of axes.
fpr_svm, tpr_svm, _ = roc_curve(y_test, svm_probs)
fpr_ann, tpr_ann, _ = roc_curve(y_test, ann_probs)
svm_auc = auc(fpr_svm, tpr_svm)
ann_auc = auc(fpr_ann, tpr_ann)

plt.figure(figsize=(8, 6))
for fpr, tpr, curve_label in (
    (fpr_svm, tpr_svm, f'SVM (AUC = {svm_auc:.2f})'),
    (fpr_ann, tpr_ann, f'ANN (AUC = {ann_auc:.2f})'),
):
    plt.plot(fpr, tpr, label=curve_label)
# Dashed diagonal marks the chance-level classifier
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC Curve Comparison')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()