In [None]:
# EXPLORATORY DATA ANALYSIS

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

# Load the HR dataset from disk
data = pd.read_csv('HR_Analytics.csv')

# Drop the EmpID column before any further processing
data = data.drop(columns='EmpID')

# Recode Attrition as a number: 'Yes' -> 1, 'No' -> 0
attrition_codes = {'Yes': 1, 'No': 0}
data['Attrition'] = data['Attrition'].map(attrition_codes)

# Split the column labels by dtype: numeric vs. everything else
numeric_columns = data.select_dtypes(include=[np.number]).columns
categorical_columns = data.select_dtypes(exclude=[np.number]).columns

# One-hot encode the non-numeric columns (dense output, first level of each column dropped)
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_cats = encoder.fit_transform(data[categorical_columns])
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)

# Wrap the encoded array in a dataframe that shares the row index of `data`
encoded_df = pd.DataFrame(
    encoded_cats,
    columns=encoded_feature_names,
    index=data.index,
)

# Place the numeric columns and the encoded columns side by side
numeric_part = data[numeric_columns]
X_encoded = pd.concat([numeric_part, encoded_df], axis=1)

# Pairwise correlations between every column of the combined frame
corr_matrix = X_encoded.corr()

# Show the complete correlation matrix
print("Full Correlation Matrix:")
print(corr_matrix)

# Function to get the significant correlations
def get_significant_correlations(corr_matrix, threshold=0.5):
    """Collect the strongly correlated pairs from a correlation matrix.

    Only the cells strictly below the diagonal are inspected, so each pair is
    reported once and the diagonal itself is skipped.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Matrix of pairwise correlations; positions are walked using the
        number of columns for both axes.
    threshold : float, default 0.5
        A pair is kept when the absolute value of its cell is strictly
        greater than this number.

    Returns
    -------
    list of tuple
        ``(row_label, column_label, value)`` for every kept cell, ordered
        row by row and, within a row, left to right.
    """
    size = len(corr_matrix.columns)
    return [
        (corr_matrix.index[row], corr_matrix.columns[col], corr_matrix.iloc[row, col])
        for row in range(size)
        for col in range(row)
        if abs(corr_matrix.iloc[row, col]) > threshold
    ]

# Collect the pairs whose |correlation| exceeds 0.5 and list them, strongest first
significant_correlations = get_significant_correlations(corr_matrix, threshold=0.5)
print("\nSignificant Correlations (|correlation| > 0.5):")
ranked_pairs = sorted(significant_correlations, key=lambda x: abs(x[2]), reverse=True)
for var1, var2, corr in ranked_pairs:
    print(f"{var1} - {var2}: {corr:.2f}")


# Absolute correlation of every column with PerformanceRating, largest first
perf_rating_abs = corr_matrix['PerformanceRating'].abs()
perf_rating_corr = perf_rating_abs.sort_values(ascending=False)

# Show 11 rows: PerformanceRating's correlation with itself (1.0) is part of the listing
print("Top 10 features correlated with Performance Rating:")
print(perf_rating_corr.head(11))

# Absolute correlation of every column with Attrition, largest first
attrition_abs = corr_matrix['Attrition'].abs()
attrition_corr = attrition_abs.sort_values(ascending=False)

# Show 11 rows: Attrition's correlation with itself (1.0) is part of the listing
print("Top 10 features correlated with Attrition:")
print(attrition_corr.head(11))






In [None]:

# predictive model

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import fbeta_score, make_scorer
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import optuna


# Load the data
data = pd.read_csv('HR_Analytics.csv')
df = pd.DataFrame(data)

# Separate the features from the target; EmpID is removed from the features
target = 'Attrition'
X = df.drop(columns=[target, 'EmpID'])

# Map the target to binary first, so that any label other than 'Yes'/'No'
# becomes NaN and is removed together with the other incomplete rows below
y = df[target].map({'Yes': 1, 'No': 0})

# One-hot encode the object-dtype columns
encoder = OneHotEncoder(sparse_output=False)
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()
one_hot_encoded = encoder.fit_transform(X[categorical_columns])
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=X.index,  # keep rows aligned with X when concatenating
)

# Replace the original categorical columns with their one-hot encoded version
X_encoded = pd.concat([X.drop(columns=categorical_columns), one_hot_df], axis=1)

# Drop incomplete rows from features and target together so they stay aligned
combined_df = pd.concat([X_encoded, y], axis=1).dropna()

# Split back into X and y
X = combined_df.drop(columns=[target])
y = combined_df[target].astype(int)

# Split BEFORE resampling and scaling. Fitting SMOTE or the scaler on the full
# dataset lets information from the held-out rows leak into training
# (synthetic samples interpolated from test rows, scaling statistics that
# include them) and makes the test metrics optimistic.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to the training portion only; the test set keeps its original class balance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Fit the scaler on the (resampled) training data and reuse it for the test data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_resampled)
y_train = y_resampled
X_test = scaler.transform(X_test_raw)

# Define Optuna objective
def objective(trial):
    """Score one Optuna trial for the XGBoost classifier.

    Eight hyperparameters are drawn from ``trial``, an ``XGBClassifier`` is
    built from them with ``random_state=42``, and the classifier is scored
    with 5-fold cross-validation on the module-level ``X_train`` / ``y_train``
    using an F-beta scorer with ``beta=2``.

    Parameters
    ----------
    trial
        Object providing ``suggest_int`` / ``suggest_float``.

    Returns
    -------
    The mean of the cross-validation scores.
    """
    params = dict(
        max_depth=trial.suggest_int('max_depth', 1, 10),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 0.5, log=True),
        n_estimators=trial.suggest_int('n_estimators', 50, 1000),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        gamma=trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        scale_pos_weight=trial.suggest_float('scale_pos_weight', 1, 10),
    )

    classifier = xgb.XGBClassifier(random_state=42, **params)

    # beta=2 makes the scorer an F2 score
    f2_scorer = make_scorer(fbeta_score, beta=2)
    fold_scores = cross_val_score(classifier, X_train, y_train, scoring=f2_scorer, cv=5)
    return fold_scores.mean()

# Create and run Optuna study.
# The sampler is seeded so that re-running the notebook proposes the same
# sequence of trials instead of a different search every time.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

# Get best parameters and train final model
best_params = study.best_params
best_model = xgb.XGBClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict

# Choose the decision threshold from out-of-fold predictions on the TRAINING
# data. Tuning it on the test set and then scoring that same test set would
# make the reported metrics optimistic.
oof_proba = cross_val_predict(
    xgb.XGBClassifier(**best_params, random_state=42),
    X_train,
    y_train,
    cv=5,
    method='predict_proba',
)[:, 1]

# Youden's J statistic (tpr - fpr): take the threshold where it is largest
fpr, tpr, thresholds = roc_curve(y_train, oof_proba)
J = tpr - fpr
optimal_idx = np.argmax(J)
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal threshold: {optimal_threshold}")

# Get predicted probabilities of the positive class on the held-out test set
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Make predictions using the optimal threshold
y_pred = (y_pred_proba >= optimal_threshold).astype(int)

# Evaluate the model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))




In [None]:
# Class predictions from the fitted model (via predict, not the custom threshold)
# and the resulting accuracy, first for the training set ...
y_train_pred = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

# ... then for the testing set
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Report both numbers side by side so the two can be compared
print("The accuracy on the training set is : ", train_accuracy)
print("The accuracy on the testing set is : ", test_accuracy)


In [None]:
#  MODEL VISUALIZATIONS

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc

# Feature importance plot: horizontal bars ordered by ascending importance
feature_importance = best_model.feature_importances_
sorted_idx = np.argsort(feature_importance)
# Bar positions, offset by 0.5
pos = np.arange(sorted_idx.shape[0]) + .5

fig, ax = plt.subplots(figsize=(20, 15))
ax.barh(pos, feature_importance[sorted_idx], align='center')
ax.set_yticks(pos)
ax.set_yticklabels(X.columns[sorted_idx])
ax.set_xlabel('Feature Importance')
ax.set_title('Feature Importance in XGBoost Model')
fig.tight_layout()
plt.show()


# ROC curve of the test-set probabilities, with its area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')

# Mark the point on this curve where tpr - fpr is largest
optimal_idx = np.argmax(tpr - fpr)
plt.plot(fpr[optimal_idx], tpr[optimal_idx], 'ro', markersize=8,
         label=f'Optimal threshold: {thresholds[optimal_idx]:.2f}')

# Build the legend once, after every labelled artist exists. A second bare
# plt.legend() call would replace the first legend and drop its placement.
plt.legend(loc="lower right")

plt.show()

# Table of feature names and their importances, most important first
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': best_model.feature_importances_,
})
feature_importance_df = importance_table.sort_values(by='importance', ascending=False)

# Show the ten highest-ranked rows
print("Top 10 Important Features:")
print(feature_importance_df.head(10))