### Reading and Inspecting the Data

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Set a professional style for plots
# Apply matplotlib's 'seaborn-v0_8-whitegrid' style sheet to every figure created below.
# NOTE(review): the 'seaborn-v0_8-*' style names require a reasonably recent matplotlib —
# confirm against the pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
# Make 'viridis' the default seaborn colour palette for subsequent plots.
sns.set_palette('viridis')

In [None]:
# Load the training dataset. All our analysis and model training will be done on this file.
# The location can be overridden through the HR_TRAIN_CSV environment variable so the
# notebook also runs on machines other than the author's. When the variable is unset the
# original absolute path is used, so existing setups keep working unchanged.
TRAIN_CSV_PATH = os.environ.get(
    'HR_TRAIN_CSV',
    'C:/Mahdi/AI project/HR_project/archive (2)/aug_train.csv',
)
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Preview the first rows to sanity-check that the CSV parsed as expected.
df.head()

In [None]:
# Print column names, dtypes and non-null counts; the counts reveal which columns have missing values.
df.info()

In [None]:
# Summary statistics (count, mean, std, min/max, quartiles) for the numeric columns.
df.describe()

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

### Counting the values of each category in selected columns

In [None]:
# Frequency of each distinct value, most frequent first; value_counts() leaves NaN out unless dropna=False is passed.
df["city_development_index"].value_counts()

In [None]:
# Frequency of each company-size bucket (NaN rows are not counted by default).
df["company_size"].value_counts()

In [None]:
# Frequency of each education level (NaN rows are not counted by default).
df["education_level"].value_counts()

### Preprocessing and Exploratory Data Analysis (EDA)

#### Cleaning Experience and Last New Job Columns for train data

In [None]:
print("Cleaning 'experience' and 'last_new_job' columns...")

# Open-ended string buckets are swapped for numeric stand-ins so the columns
# can be cast to numbers:
#   experience:   '>20' -> 21,  '<1'    -> 0
#   last_new_job: '>4'  -> 5,   'never' -> 0
bucket_replacements = {
    'experience': {'>20': '21', '<1': '0'},
    'last_new_job': {'>4': '5', 'never': '0'},
}

for column_name, replacements in bucket_replacements.items():
    # Cast to float rather than int so that missing entries survive as NaN.
    df[column_name] = df[column_name].replace(replacements).astype(float)

print("Columns cleaned.")

#### Exploratory Data Analysis (EDA)

In [None]:
# 1. Target Variable Distribution (Imbalance Check)
plt.figure(figsize=(8, 6))
sns.countplot(x='target', data=df)
plt.title('Distribution of Target Variable\n(0: Not Looking, 1: Looking for Job)', fontsize=14)
plt.xlabel('Target', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.show()

# 2. The Training Conundrum
plt.figure(figsize=(12, 7))
sns.histplot(data=df, x='training_hours', hue='target', kde=True, bins=30)
plt.title('Training Hours vs. Job Change Intent', fontsize=16)
plt.xlabel('Training Hours', fontsize=12)
# histplot uses stat='count' by default (the KDE curve is rescaled to match),
# so the y-axis shows counts per bin, not densities.
plt.ylabel('Count', fontsize=12)
# NOTE(review): the labels here are ['Yes', 'No'] while the count plots below use
# ['No', 'Yes']. plt.legend(labels=...) assigns labels by artist order, so verify
# against the rendered figure that each colour carries the intended label.
plt.legend(title='Looking for Job', labels=['Yes', 'No'])
plt.show()

# 3. The Experience Factor
# Drop NaN before sorting: comparisons against NaN are always False, so sorted()
# on an array containing NaN can return an arbitrarily ordered list, and NaN
# would also end up as a category on the x-axis.
experience_order = sorted(df['experience'].dropna().unique())
plt.figure(figsize=(14, 8))
sns.countplot(x='experience', data=df, hue='target', order=experience_order)
plt.title('Experience Level vs. Job Change Intent', fontsize=16)
plt.xlabel('Years of Experience', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=45)
plt.legend(title='Looking for Job', labels=['No', 'Yes'])
plt.show()

# 4. Company Loyalty
# Same NaN-safe ordering as for the experience plot above.
last_new_job_order = sorted(df['last_new_job'].dropna().unique())
plt.figure(figsize=(12, 7))
sns.countplot(x='last_new_job', data=df, hue='target', order=last_new_job_order)
plt.title('Years Since Last Job vs. Job Change Intent', fontsize=16)
plt.xlabel('Years Since Last Job Change', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.legend(title='Looking for Job', labels=['No', 'Yes'])
plt.show()

#### Encoding Categorical Variables

In [None]:
# Encode on a copy so that `df` keeps the cleaned but un-encoded columns.
df_encoded = df.copy()

# --- Ordinal encoding: categories with a natural order become increasing integers.
# Values that are not a key of the mapping (including missing values) become NaN.
ordinal_map = {
    'education_level': {
        'Primary School': 0,
        'High School': 1,
        'Graduate': 2,
        'Masters': 3,
        'Phd': 4,
    },
    'company_size': {
        '<10': 0,
        '10/49': 1,
        '50-99': 2,
        '100-500': 3,
        '500-999': 4,
        '1000-4999': 5,
        '5000-9999': 6,
        '10000+': 7,
    },
}
for ordinal_col, level_codes in ordinal_map.items():
    df_encoded[ordinal_col] = df_encoded[ordinal_col].map(level_codes)

# --- Label encoding: every distinct value gets an integer code.
# NOTE(review): these columns were described as binary; confirm that each really has
# only two levels and how missing values should be treated before relying on the codes.
from sklearn.preprocessing import LabelEncoder

label_cols = ['relevent_experience', 'enrolled_university']
for col in label_cols:
    encoder = LabelEncoder()
    df_encoded[col] = encoder.fit_transform(df_encoded[col])

# --- One-hot encoding: nominal (unordered) categories become indicator columns.
# The first level of each feature is dropped to avoid multicollinearity.
nominal_cols = ['city', 'gender', 'major_discipline', 'company_type']
df_encoded = pd.get_dummies(df_encoded, columns=nominal_cols, drop_first=True)

print("--- Data after Encoding ---")
df_encoded.head()

### Building and Comparing Models

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

In [None]:
# 1. Prepare Data for Modeling
# Drop the ID column as it's not a feature.
# errors='ignore' keeps this cell re-runnable: df_encoded is reassigned in place, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
df_encoded = df_encoded.drop(columns='enrollee_id', errors='ignore')

In [None]:
# Separate the label column from the predictors.
y = df_encoded['target']
X = df_encoded.drop(columns='target')

In [None]:
# 2. Hold out 20% of the rows for evaluation.
# stratify=y keeps the class proportions the same in both splits; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# 3. Scale numerical features

# Take explicit copies first: the frames returned by train_test_split are selections
# of X, and assigning columns into them can raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
numerical_features = ['city_development_index', 'training_hours', 'experience', 'last_new_job']
# Fit on the training split only, then reuse those statistics for the test split,
# so no information from the held-out rows leaks into preprocessing.
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])


#### XGBoost

In [None]:
# Ratio of class-0 to class-1 rows in the training split, passed to XGBoost below
# as scale_pos_weight to compensate for class imbalance.
train_class_counts = y_train.value_counts()
scale_pos_weight = train_class_counts[0] / train_class_counts[1]
print(f"Calculated scale_pos_weight for XGBoost: {scale_pos_weight:.2f}")

In [None]:
# Hyper-parameters gathered in one place so they are easy to scan and tweak.
# NOTE(review): `use_label_encoder` is reported as deprecated/unused by recent XGBoost
# releases — confirm against the installed version before removing it.
xgb_params = dict(
    random_state=42,
    use_label_encoder=False,
    eval_metric='aucpr',
    scale_pos_weight=scale_pos_weight,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

In [None]:
# Evaluate the XGBoost model on the held-out split.
# `xgb` was already fitted in the previous cell; refitting the identical,
# seeded model here only repeated the training work, so it is not done again.
y_pred_xgb = xgb.predict(X_test)
# Column 1 of predict_proba holds the predicted probability of the positive class.
y_proba_xgb = xgb.predict_proba(X_test)[:, 1]

accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
# ROC AUC is computed from the probabilities rather than the hard labels.
roc_auc_xgb = roc_auc_score(y_test, y_proba_xgb)
report_xgb = classification_report(y_test, y_pred_xgb)

print("--- XGBoost ---")
print(f"Accuracy: {accuracy_xgb:.4f}")
print(f"ROC AUC Score: {roc_auc_xgb:.4f}")
print("Classification Report:")
print(report_xgb)

In [None]:
# Confusion matrix for XGBoost: rows are the actual classes, columns the predicted ones.
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
class_names = ['Not Looking', 'Looking']

plt.figure(figsize=(6, 4))
sns.heatmap(
    cm_xgb,
    annot=True,   # write the cell counts into the heatmap
    fmt='d',      # ...formatted as integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix - XGBoost')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

#### Random Forest

In [None]:
# Random forest baseline; class_weight='balanced' re-weights the classes
# inversely to their frequency in the training labels.
rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)
rf.fit(X_train, y_train)

In [None]:
# Evaluate the random forest on the held-out split.
# Column 1 of predict_proba holds the predicted probability of the positive class.
y_proba_rf = rf.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)

report_rf = classification_report(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_proba_rf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(
    "--- Random Forest ---",
    f"Accuracy: {accuracy_rf:.4f}",
    f"ROC AUC Score: {roc_auc_rf:.4f}",
    "Classification Report:",
    report_rf,
    sep="\n",
)

In [None]:
# Confusion matrix for the random forest: rows are the actual classes, columns the predicted ones.
cm_rf = confusion_matrix(y_test, y_pred_rf)
class_names = ['Not Looking', 'Looking']

plt.figure(figsize=(6, 4))
sns.heatmap(
    cm_rf,
    annot=True,   # write the cell counts into the heatmap
    fmt='d',      # ...formatted as integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix - Random Forest')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

### Feature Importance - Uncovering the Key Predictors

In [None]:
# --- Feature Importance Analysis (using XGBoost) ---
# Alias the fitted XGBoost classifier; the importance cells below read from `best_model`,
# so pointing this name at another fitted estimator switches the analysis to that model.
best_model = xgb

In [None]:
# Pair up the training columns with the importance score the model assigned to each.
feature_names = X_train.columns
importances = best_model.feature_importances_

In [None]:
# Tabulate feature -> importance, ordered from most to least important.
feature_importance_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)


In [None]:
# Horizontal bar chart of the 15 highest-ranked features.
# NOTE(review): newer seaborn releases warn when `palette` is passed without `hue` —
# confirm against the installed version.
top_features = feature_importance_df.head(15)

plt.figure(figsize=(12, 10))
sns.barplot(
    data=top_features,
    x='importance',
    y='feature',
    palette='inferno',
)
plt.title('Top 15 Most Important Features for Predicting Job Change', fontsize=16)
plt.xlabel('Importance Score', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.tight_layout()
plt.show()

In [None]:
# Text summary of the five highest-ranked features.
top_five_predictors = feature_importance_df.head(5)
print("\n--- Top 5 Predictors ---")
print(top_five_predictors)