<a href="https://colab.research.google.com/github/nirupamgpta/Assignments/blob/main/EmployeeAttritionL3_P2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import io

# --- 1. Upload and Load the Dataset in Google Colab ---
from google.colab import files

print("Please upload your employee_data.csv file")
uploaded = files.upload()

# `uploaded` maps each uploaded filename to that file's raw bytes. It can come
# back empty (e.g. when the upload dialog is dismissed without picking a file),
# in which case next(iter(...)) below would fail with a bare StopIteration.
# Stop here with an actionable message instead.
if not uploaded:
    raise ValueError(
        "No file was uploaded. Re-run this cell and select employee_data.csv."
    )

# Only the first uploaded file is used; any additional files are ignored.
file_name = next(iter(uploaded))

# Wrap the raw bytes in an in-memory buffer so pandas can parse them as CSV.
df = pd.read_csv(io.BytesIO(uploaded[file_name]))


# Quick look at the first rows to confirm the file was parsed as expected.
print("\n--- Initial Data ---")
print(df.head())
print("\n" + "="*30 + "\n")


# --- 2. Preprocessing the Data ---

# Separate the features (X) from the target variable (y).
X = df.drop('Attrition', axis=1)
y = df['Attrition']

# Columns handled by the preprocessor, grouped by the transformation they need.
categorical_features = ['JobRole']
numerical_features = ['Age', 'MonthlyIncome', 'JobSatisfaction', 'YearsAtCompany']

# ColumnTransformer applies a different transformer to each column group:
# - StandardScaler rescales the numerical columns to mean 0 and standard
#   deviation 1.
# - OneHotEncoder turns the categorical column into indicator columns.
#   handle_unknown='ignore' makes a category that was not present when the
#   encoder was fitted encode as all zeros; with the default ('error') the
#   later transform() call would raise as soon as such a value appears.
# Columns that are not listed in either group are dropped (the ColumnTransformer
# default, remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)


# --- 3. Splitting the Data ---
# Hold out part of the rows for evaluation; the model only ever sees the
# training part while it is being fitted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # 20% of the rows go to the test set, 80% to training
    random_state=42,  # fixed seed so the split is reproducible
    stratify=y,       # stratify the split on the target labels
)


# --- 4. Applying Preprocessing ---
# The preprocessor is fitted on the training rows only and that same fitted
# transformation is then applied to the test rows. Fitting on the test rows as
# well would leak information from the evaluation data into the model inputs.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)


# --- 5. Building and Training the KNN Model ---
# n_neighbors (k) is a hyperparameter worth tuning; 3 is only a starting value.
# fit() returns the estimator itself, so building and training are chained.
knn_model = KNeighborsClassifier(n_neighbors=3).fit(X_train_processed, y_train)


# --- 6. Making Predictions ---
# Predict a label for every row of the processed test data.
y_pred = knn_model.predict(X_test_processed)


# --- 7. Evaluating the Model ---
# Fraction of test rows whose predicted label matches the actual label.
accuracy = accuracy_score(y_test, y_pred)

print("--- Model Evaluation ---")
print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
# Per-class precision, recall and f1-score. The classes listed are whatever
# values the Attrition column holds.
# NOTE(review): the original author states 1 = attrition, 0 = no attrition --
# confirm against the uploaded data.
print(classification_report(y_test, y_pred))

print("\n" + "="*30 + "\n")
print("--- Predictions on Test Data ---")
# Side-by-side view of the actual and predicted labels next to the original
# (unprocessed) test features. assign() works on a copy, so X_test itself is
# left untouched.
test_results = X_test.assign(
    Actual_Attrition=y_test,
    Predicted_Attrition=y_pred,
)
print(test_results)