<a href="https://colab.research.google.com/github/mohammedidriss/hiring-system-GGU-Group9/blob/main/Course4_Hiring_system_v1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Cell 1

In [1]:
# Install the third-party packages the rest of the notebook imports:
# gradio (web UI), xgboost (classifier), shap (explanations),
# openpyxl (Excel reader backend used by pandas) and httpx (HTTP client
# used for the API calls further down).
# NOTE(review): `%pip install ...` targets the running kernel's environment
# explicitly, and pinning versions (pkg==x.y.z) would make re-runs
# reproducible — consider both.
print("Installing required libraries: gradio, xgboost, shap, openpyxl, httpx...")
!pip install -q gradio xgboost shap openpyxl httpx
print("Installations complete.")

Installing required libraries: gradio, xgboost, shap, openpyxl, httpx...
Installations complete.


Cell 2

In [2]:
# This cell imports all the tools.
# I have ADDED 'httpx' (for API calls) and 'asyncio'.

# For data handling
import pandas as pd
import numpy as np
import os
import json # Make sure json is imported

# For the Web App (UI)
import gradio as gr

# For saving/loading models and files
import joblib
import datetime

# For AI Model Building (Preprocessing)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# For AI Model Training (XGBoost)
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# For AI Model Explainability (SHAP)
import shap

# For making API calls (replaces fetch)
import httpx
import asyncio

# To hide common warnings
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported. Ready to mount Google Drive.")

All libraries imported. Ready to mount Google Drive.


Cell 3

In [3]:
# Mount Google Drive into the Colab filesystem so the data files and the
# saved model artefacts under /content/drive/ become readable and writable.

print("Mounting Google Drive...")
# drive.mount() opens an interactive authorization prompt the first time
# it runs in a session.
from google.colab import drive
drive.mount('/content/drive')
print("Google Drive mounted successfully at /content/drive/")

Mounting Google Drive...
Mounted at /content/drive
Google Drive mounted successfully at /content/drive/


Cell 4

In [4]:
# Phase 1 configuration.
# The source workbook holds two sheets: one with candidate features and one
# with outcomes. The merged, labelled table built from them is written back
# to Drive as a CSV.

print("--- Phase 1: Creating Master Training Dataset ---")

# --- 1. Define File Paths ---
# Root folder on the mounted Drive (keeps its trailing slash so file names
# can simply be appended).
DRIVE_PATH = "/content/drive/MyDrive/"

# Input workbook and the names of the two sheets read from it.
FEATURES_FILE_PATH = DRIVE_PATH + "SampleData.xlsx"
FEATURES_SHEET_NAME = "sample_jsdrilldown"
OUTCOMES_SHEET_NAME = "Sample_Workshop_app"

# Output location of the merged training table.
MASTER_DATASET_PATH = DRIVE_PATH + "master_training_dataset.csv"

# --- 2. Define the Function ---
def create_master_dataset():
    """
    Build the labelled "master" training table and save it as a CSV.

    Reads the features sheet and the outcomes sheet from FEATURES_FILE_PATH,
    derives a binary TARGET_HIRED label (1 when 'Status' == 'In work',
    otherwise 0), left-joins that label onto the features by 'Journey id',
    labels candidates without any outcome row as 0, and writes the result
    to MASTER_DATASET_PATH.

    Returns:
        pandas.DataFrame: the merged dataset, or None when either sheet
        cannot be read (the error is printed instead of raised).
    """
    print(f"Loading features from: {FEATURES_SHEET_NAME}")
    try:
        df_features = pd.read_excel(FEATURES_FILE_PATH, sheet_name=FEATURES_SHEET_NAME)
        print(f"Loaded {len(df_features)} feature rows.")
    except Exception as e:
        print(f"---!!! ERROR loading features sheet: {e} !!!---")
        print("Please check: 1. File path is correct. 2. Sheet name is correct. 3. 'openpyxl' is installed.")
        return None

    print(f"Loading outcomes from: {OUTCOMES_SHEET_NAME}")
    try:
        # Only the join key and the status column are needed from this sheet.
        df_outcomes = pd.read_excel(FEATURES_FILE_PATH, sheet_name=OUTCOMES_SHEET_NAME, usecols=['Journey id', 'Status'])
        print(f"Loaded {len(df_outcomes)} outcome rows.")
    except Exception as e:
        print(f"---!!! ERROR loading outcomes sheet: {e} !!!---")
        return None

    # --- 3. Create the Classification Label (The "Prediction") ---
    # One outcome row per candidate, so the left merge below cannot duplicate
    # feature rows. The label is added with .assign(), which builds a new
    # frame instead of writing a column into the de-duplicated result
    # (the original assignment triggered pandas' SettingWithCopyWarning).
    # NOTE(review): drop_duplicates keeps the FIRST row per 'Journey id'; if a
    # candidate's 'Status' differs between rows the label depends on sheet
    # order — confirm this is intended.
    df_outcomes_clean = (
        df_outcomes
        .drop_duplicates(subset=['Journey id'])
        .assign(TARGET_HIRED=lambda frame: (frame['Status'] == 'In work').astype(int))
    )

    # --- 4. Merge Features and Labels ---
    # A left merge keeps every candidate from the features sheet, whether or
    # not an outcome exists for them.
    print("Merging features and labels...")
    df_master = pd.merge(
        df_features,
        df_outcomes_clean[['Journey id', 'TARGET_HIRED']],
        on='Journey id',
        how='left'
    )

    # Candidates with no outcome row come out of the merge as NaN; they are
    # treated as 0 (Not Hired).
    df_master['TARGET_HIRED'] = df_master['TARGET_HIRED'].fillna(0).astype(int)

    print(f"Master dataset created with {len(df_master)} rows.")

    # --- 5. Save the new master file ---
    df_master.to_csv(MASTER_DATASET_PATH, index=False)
    print(f"Master dataset saved to: {MASTER_DATASET_PATH}")
    return df_master

# --- 6. RUN THE FUNCTION ---
# Build the dataset now. create_master_dataset() returns None when a sheet
# could not be read, in which case the summary line is skipped.
master_df = create_master_dataset()
if master_df is not None:
    hired_total = master_df['TARGET_HIRED'].sum()
    print(f"We have {hired_total} 'Hired' users to train on.")

--- Phase 1: Creating Master Training Dataset ---
Loading features from: sample_jsdrilldown
Loaded 2000 feature rows.
Loading outcomes from: Sample_Workshop_app
Loaded 5375 outcome rows.
Merging features and labels...
Master dataset created with 2000 rows.
Master dataset saved to: /content/drive/MyDrive/master_training_dataset.csv
We have 1000 'Hired' users to train on.


Cell 5

In [5]:
# This cell re-builds the "AI Brain" (XGBoost model).
# The "leaky" columns 'Work status' and 'Stream' are deliberately NOT part of
# AI_FEATURE_COLUMNS, so the model has to learn from candidate attributes
# rather than from the answer itself.

print("--- Phase 2: Building the AI Brain (XGBoost Model) ---")

# Where the fitted pipeline is persisted (same Drive folder as MASTER_DATASET_PATH).
BRAIN_FILE_PATH = f"{DRIVE_PATH}ai_brain_pipeline.joblib"

# Cell 4 leaves master_df set to None when a sheet fails to load, so None has
# to be rejected here as well as a missing or empty frame (calling .empty on
# None would raise AttributeError).
if globals().get('master_df') is None or master_df.empty:
    print("ERROR: 'master_df' not found. Please run Cell 4 first.")
else:
    TARGET_COLUMN = 'TARGET_HIRED'

    # Candidate attributes used for training.
    AI_FEATURE_COLUMNS = [
        'gender', 'Age', 'Marital status', 'JS Town', 'JS Town distrinct',
        # 'Stream', (REMOVED - This is a program, not a candidate attribute)
        # 'Work status', (REMOVED - This is the answer!)
        'Salary expectations', 'Highest Qualification', 'Highest major',
        'Shifts', 'Working environment', 'Skills English', 'Skills MS Word',
        'Skills MS Excel', 'Skills MS Powerpoint', 'Transport available', 'Driving license',
        'Job goal 1', 'Job goal 2', 'Job goal 3', 'Years workexperience',
        'Months workexperience', 'Communication skills', 'Has CV (SWS)'
    ]

    # Fail fast if the sheet is missing any expected column.
    missing_cols = [col for col in AI_FEATURE_COLUMNS if col not in master_df.columns]
    if missing_cols:
        print(f"---!!! ERROR: The following columns are missing from 'master_df': {missing_cols} !!!---")
        print("Please check the 'sample_jsdrilldown' sheet and the 'AI_FEATURE_COLUMNS' list.")
        raise ValueError("Missing feature columns in master_df")

    # Split the feature list by dtype.
    numeric_features = master_df[AI_FEATURE_COLUMNS].select_dtypes(include=np.number).columns.tolist()
    categorical_features = master_df[AI_FEATURE_COLUMNS].select_dtypes(include=['object', 'bool']).columns.tolist()

    # Ordered skill ratings are encoded as ranks rather than one-hot columns.
    ordinal_features = [
        'Skills English', 'Skills MS Word', 'Skills MS Excel', 'Skills MS Powerpoint', 'Communication skills'
    ]
    skill_levels = ['N_A', 'Beginner', 'Good', 'Excellent']

    # Ordinal columns must not also be one-hot encoded.
    categorical_features = [col for col in categorical_features if col not in ordinal_features]

    print(f"Identified {len(numeric_features)} numeric, {len(ordinal_features)} ordinal, {len(categorical_features)} categorical features.")
    print(f"Total features being used for training: {len(AI_FEATURE_COLUMNS)}")

    # --- Build Preprocessing Pipelines ---
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

    # Missing ratings become 'N_A' (rank 0); ratings outside skill_levels are
    # encoded as -1 instead of raising.
    ordinal_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='N_A')),
        ('encoder', OrdinalEncoder(categories=[skill_levels] * len(ordinal_features), handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    # Dense one-hot output, so downstream code never has to call .toarray().
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    # --- Combine Pipelines ---
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('ord', ordinal_transformer, ordinal_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='passthrough'
    )

    # --- Create the Full AI Brain Pipeline ---
    # Derive the class balance from the data instead of asserting a fixed
    # 1000 / 1000 split; negatives / positives is the weighting the XGBoost
    # documentation suggests, and it equals 1 for a balanced dataset.
    class_counts = master_df[TARGET_COLUMN].value_counts()
    n_hired = int(class_counts.get(1, 0))
    n_not_hired = int(class_counts.get(0, 0))
    scale_pos_weight = (n_not_hired / n_hired) if n_hired else 1.0
    print(f"\nClass balance: {n_hired} Hired / {n_not_hired} Not Hired.")
    print(f"Using 'scale_pos_weight={scale_pos_weight:.2f}'.")

    # 'use_label_encoder' is intentionally not passed: current XGBoost
    # releases no longer use it and only warn about the unused parameter.
    ai_brain_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            scale_pos_weight=scale_pos_weight,
            random_state=42
        ))
    ])

    # --- Train the AI Brain ---
    print("\nSplitting data and training model...")
    X = master_df[AI_FEATURE_COLUMNS]
    y = master_df[TARGET_COLUMN]
    # Stratified 80/20 split with a fixed seed; Cell 6 repeats this exact call
    # to recover the same training rows.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    ai_brain_pipeline.fit(X_train, y_train)
    print("...Training complete.")

    # --- Evaluate the AI Brain ---
    print("\n--- Model Performance Evaluation (No Leakage) ---")
    preds = ai_brain_pipeline.predict(X_test)
    print(f"Accuracy on Test Data: {accuracy_score(y_test, preds):.2%}")
    print(classification_report(y_test, preds))

    # --- Save the AI Brain to Google Drive ---
    joblib.dump(ai_brain_pipeline, BRAIN_FILE_PATH)
    print("\n---------------------------------")
    print(f"SUCCESS: AI Brain (new NON-LEAKING version) saved to: {BRAIN_FILE_PATH}")
    print("---------------------------------")

--- Phase 2: Building the AI Brain (XGBoost Model) ---
Identified 3 numeric, 5 ordinal, 15 categorical features.
Total features being used for training: 23

Dataset is balanced (1000 Hired / 1000 Not Hired).
Using default 'scale_pos_weight=1'.

Splitting data and training model...
...Training complete.

--- Model Performance Evaluation (No Leakage) ---
Accuracy on Test Data: 66.00%
              precision    recall  f1-score   support

           0       0.66      0.67      0.66       200
           1       0.66      0.65      0.66       200

    accuracy                           0.66       400
   macro avg       0.66      0.66      0.66       400
weighted avg       0.66      0.66      0.66       400


---------------------------------
SUCCESS: AI Brain (new NON-LEAKING version) saved to: /content/drive/MyDrive/ai_brain_pipeline.joblib
---------------------------------


Cell 6

In [7]:
# This cell builds the "Action Plan Generator" (SHAP explainer).
# It reloads the pipeline saved by Cell 5, builds a SHAP TreeExplainer for
# the classifier inside it, and saves the explainer to Drive.

print("--- Phase 3: Building the Action Plan Generator (SHAP) ---")

# --- 1. Load the Saved AI Brain ---
# BRAIN_FILE_PATH comes from Cell 5; EXPLAINER_FILE_PATH is defined here,
# next to it in the same Drive folder.
EXPLAINER_FILE_PATH = f"{DRIVE_PATH}shap_explainer.joblib"

# Loading failures are reported and then re-raised: nothing below can run
# without the pipeline.
try:
    ai_brain_pipeline = joblib.load(BRAIN_FILE_PATH)
    print(f"Successfully loaded AI Brain from: {BRAIN_FILE_PATH}")
except Exception as e:
    print(f"---!!! ERROR loading 'ai_brain_pipeline.joblib': {e} !!!---")
    raise

# --- 2. Separate Pipeline Components ---
# SHAP explains the classifier alone, so inputs have to be run through the
# preprocessor separately.
preprocessor = ai_brain_pipeline.named_steps['preprocessor']
model = ai_brain_pipeline.named_steps['classifier']

# --- 3. Get Feature Names ---
# Ask the fitted preprocessor for the names of its output columns
# (prefixed num__/ord__/cat__ by the ColumnTransformer step names).
print("Getting feature names directly from the preprocessor...")
try:
    all_transformed_feature_names = preprocessor.get_feature_names_out().tolist()
    print(f"Successfully got {len(all_transformed_feature_names)} feature names.")
except Exception as e:
    print(f"Error getting feature names: {e}.")
    raise

# --- 4. Transform Training Data ---
print("Transforming training data for SHAP explainer...")
# Re-create the split with the same arguments as Cell 5 so X_train holds the
# rows the model was fitted on; they serve as the explainer's background data.
# Requires AI_FEATURE_COLUMNS, TARGET_COLUMN and master_df from Cells 4-5.
X = master_df[AI_FEATURE_COLUMNS]
y = master_df[TARGET_COLUMN]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# No .toarray() needed: the one-hot encoder was configured with
# sparse_output=False, so the preprocessor output is already dense.
X_train_transformed = preprocessor.transform(X_train)
print("...Data transformed successfully (dense array).")

# --- 5. Build the SHAP Explainer ---
print("Building SHAP TreeExplainer...")
# The transformed training rows are passed as background data.
explainer = shap.TreeExplainer(model, X_train_transformed)
print("...Explainer built successfully.")

# --- 6. Save the SHAP Explainer ---
joblib.dump(explainer, EXPLAINER_FILE_PATH)
print(f"SUCCESS: SHAP Explainer saved to: {EXPLAINER_FILE_PATH}")

# --- 7. Define Test Function ---
def generate_action_plan_test(new_user_data_df):
    """
    Produce a plain-text test report for one candidate.

    Scores the single-row frame with the module-level pipeline, computes SHAP
    values for its transformed features, and lists the five features with the
    largest absolute SHAP value, followed by the predicted class and the hire
    probability.

    Args:
        new_user_data_df: one-row DataFrame with the training feature columns.

    Returns:
        str: the formatted action-plan text.
    """
    hire_probability = ai_brain_pipeline.predict_proba(new_user_data_df)[0][1]
    prediction_raw = ai_brain_pipeline.predict(new_user_data_df)[0]

    # The preprocessor output is already dense, so it goes straight to SHAP.
    shap_values = explainer.shap_values(preprocessor.transform(new_user_data_df))

    # One row per transformed feature: its name and its SHAP value.
    shap_table = (
        pd.DataFrame(shap_values, columns=all_transformed_feature_names)
        .iloc[0]
        .rename_axis('Feature')
        .reset_index(name='SHAP_Value')
    )
    shap_table['abs_impact'] = shap_table['SHAP_Value'].abs()
    top_factors = shap_table.sort_values(by='abs_impact', ascending=False).head(5)

    plan_parts = ["--- (TEST) Recommended Action Plan ---\n"]
    for feature, shap_value in zip(top_factors['Feature'], top_factors['SHAP_Value']):
        marker = "[+]" if shap_value > 0 else "[-]"
        plan_parts.append(f"  {marker} {feature} (Impact: {shap_value:.2f})\n")

    verdict = 'Hired' if prediction_raw == 1 else 'Not Hired'
    plan_parts.append(f"\n  Prediction: {verdict} ({hire_probability:.1%})")
    return "".join(plan_parts)

# --- 8. Test the Explainer ---
print("\n--- TESTING THE ACTION PLAN GENERATOR ---")
# Smoke test on the first held-out row from the split made earlier in this cell.
sample_user_df = X_test.iloc[:1]
true_label = y_test.iloc[0]
true_label_text = 'Hired' if true_label == 1 else 'Not Hired'
print(f"Generating plan for a sample user. (True Label: {true_label_text})...")
print(generate_action_plan_test(sample_user_df))
print("---------------------------------")

--- Phase 3: Building the Action Plan Generator (SHAP) ---
Successfully loaded AI Brain from: /content/drive/MyDrive/ai_brain_pipeline.joblib
Getting feature names directly from the preprocessor...
Successfully got 297 feature names.
Transforming training data for SHAP explainer...
...Data transformed successfully (dense array).
Building SHAP TreeExplainer...
...Explainer built successfully.
SUCCESS: SHAP Explainer saved to: /content/drive/MyDrive/shap_explainer.joblib

--- TESTING THE ACTION PLAN GENERATOR ---
Generating plan for a sample user. (True Label: Not Hired)...
--- (TEST) Recommended Action Plan ---
  [+] num__Age (Impact: 0.39)
  [-] cat__JS Town_Dammam (Impact: -0.31)
  [-] num__Months workexperience (Impact: -0.29)
  [+] cat__JS Town distrinct_Central (Impact: 0.24)
  [-] ord__Skills English (Impact: -0.23)

  Prediction: Not Hired (43.0%)
---------------------------------


Cell 7

In [21]:
# This is the FINAL, COMBINED cell for the application.
# It REPLACES all previous versions.
# Fixed: Removed invalid 'scale' argument from gr.Markdown.
# Fixed: Cleaned up layout logic.

import datetime
import joblib
import pandas as pd
import json
import httpx
import asyncio
from google.colab import userdata
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import gradio as gr

print("--- Phase 4 & 5: Defining App Logic and Launching Multi-Page UI ---")

# --- 1. Load API Key from Colab Secrets ---
# The key is read from Colab's secret store so it never appears in the
# notebook. A missing or empty key is reported with setup instructions and
# then re-raised, because the app cannot call the API without it.
try:
    MY_GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    if not MY_GOOGLE_API_KEY:
        raise ValueError("API Key is empty or not found.")
    print("Successfully loaded Google API Key from Colab Secrets.")
except Exception as e:
    print(f"---!!! CRITICAL ERROR: API KEY NOT FOUND: {e} !!!---")
    # The key emoji was previously stored as mis-decoded bytes and rendered as garbage.
    print("Please go to the 'Key' (🔑) icon on the left and add your API key as a secret named 'GOOGLE_API_KEY'")
    raise

# --- 2. Define Global Variables & AI Models ---
# Filled in by on_app_load(); every entry stays None until loading succeeds.
ai_models = {"pipeline": None, "explainer": None, "feature_names": None}
# Training table shown on the global dashboard; empty until on_app_load() reads it.
global_user_df = pd.DataFrame()

# Must list the same columns, in the same order, as AI_FEATURE_COLUMNS in Cell 5.
ALL_FEATURE_COLUMNS = [
    'gender', 'Age', 'Marital status', 'JS Town', 'JS Town distrinct',
    'Salary expectations', 'Highest Qualification', 'Highest major',
    'Shifts', 'Working environment', 'Skills English', 'Skills MS Word',
    'Skills MS Excel', 'Skills MS Powerpoint', 'Transport available', 'Driving license',
    'Job goal 1', 'Job goal 2', 'Job goal 3', 'Years workexperience',
    'Months workexperience', 'Communication skills', 'Has CV (SWS)'
]

# Default profile: 0 for numeric columns, "Unknown" for everything else.
# Column types are taken from master_df when a usable frame is in memory.
# master_df may be undefined, None (Cell 4 failed), empty, or missing
# columns; every one of those cases falls back to the fixed numeric list
# instead of raising.
_reference_df = globals().get('master_df')
_can_use_reference = (
    isinstance(_reference_df, pd.DataFrame)
    and not _reference_df.empty
    and all(col in _reference_df.columns for col in ALL_FEATURE_COLUMNS)
)

if _can_use_reference:
    # Booleans are excluded so they keep the "Unknown" default they had
    # under the previous np.number check.
    initial_profile_dict = {
        col: 0
        if (pd.api.types.is_numeric_dtype(_reference_df[col])
            and not pd.api.types.is_bool_dtype(_reference_df[col]))
        else "Unknown"
        for col in ALL_FEATURE_COLUMNS
    }
else:
    initial_profile_dict = {
        col: 0 if col in ['Age', 'Years workexperience', 'Months workexperience'] else 'Unknown'
        for col in ALL_FEATURE_COLUMNS
    }

# --- 3. Define Dropdown Choices ---
# Option lists for the form's dropdown inputs.
# NOTE(review): the model one-hot encodes these fields with
# handle_unknown='ignore', so an option spelled differently from the training
# data is silently ignored — confirm the spellings match the source sheet.
GENDER_CHOICES = ['Male', 'Female', 'Other', 'Prefer not to say']
MARITAL_CHOICES = ['Single', 'Married', 'Divorced', 'Widowed', 'Other']
SALARY_CHOICES = ['< 3,000SAR', '3,000 - 5,000SAR', '5,000 - 7,000SAR', '> 7,000SAR']
QUALIFICATION_CHOICES = ['High School', 'Diploma', 'Bachelor', 'Masters', 'Doctorate']
SHIFTS_CHOICES = ['No preference', 'Straight shifts', 'Rotating shifts']
ENVIRONMENT_CHOICES = ['Mixed', 'Flexible', 'On-site', 'Remote']
# Same ordered levels as skill_levels used by the ordinal encoder in Cell 5.
SKILL_LEVEL_CHOICES = ['N_A', 'Beginner', 'Good', 'Excellent']
YES_NO_CHOICES = ['Yes', 'No']
# Unlike SKILL_LEVEL_CHOICES, this list offers no 'N_A' option.
COMMUNICATION_CHOICES = ['Beginner', 'Good', 'Excellent']


# --- 4. Load XGBoost AI Brain ---
def on_app_load():
    """
    Load the saved pipeline, SHAP explainer and global dataset from Drive.

    The three ai_models entries are published together, and only after the
    pipeline, the explainer and the feature names have all loaded. A failure
    part-way through therefore leaves ai_models untouched instead of
    half-populated (pipeline set, explainer still None).

    Returns:
        str: a status message — "System Ready..." on success, or a
        "CRITICAL ERROR..." message containing the exception text.
    """
    global ai_models, global_user_df
    print("Gradio App Loading: Loading *XGBoost* AI models from Google Drive...")
    try:
        pipeline = joblib.load(BRAIN_FILE_PATH)
        explainer = joblib.load(EXPLAINER_FILE_PATH)
        feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out().tolist()

        # Everything loaded: expose the models in one step.
        ai_models.update(pipeline=pipeline, explainer=explainer, feature_names=feature_names)
        print("...XGBoost Models loaded successfully.")

        global_user_df = pd.read_csv(MASTER_DATASET_PATH)
        print(f"...Loaded {len(global_user_df)} users for Global Dashboard.")
        return "System Ready. AI Models and Global Data loaded."
    except Exception as e:
        print(f"---!!! CRITICAL ERROR LOADING MODELS: {e} !!!---")
        return f"CRITICAL ERROR: Could not load AI models or data. {e}"

# --- 5. Helper Functions ---
def format_feature_name_for_llm(feature_name):
    """
    Turn a transformed feature name into a short, human-readable topic.

    Strips the ColumnTransformer prefix ("cat__", "ord__" or "num__"), drops
    the one-hot category suffix ("<column>_<category>" -> "<column>"), then
    shortens skill columns ("Skills English" -> "English") and groups every
    job-goal column under "Job Goal Setting".

    The skill / job-goal mapping is applied whether or not the name carries a
    category suffix. Ordinal skill columns such as "ord__Skills English" have
    no suffix and previously came back unshortened.

    Args:
        feature_name: a name as produced by the preprocessor, e.g.
            "cat__JS Town_Dammam" or "num__Age".

    Returns:
        str: the readable topic name.
    """
    name = feature_name
    # Remove the transformer prefix only where it actually is a prefix.
    for prefix in ("cat__", "ord__", "num__"):
        if name.startswith(prefix):
            name = name[len(prefix):]
            break

    # Everything after the first underscore is the one-hot category value.
    column = name.split('_', 1)[0]

    if "Skills" in column:
        return column.replace("Skills ", "")
    if "Job goal" in column:
        return "Job Goal Setting"
    return column

def run_xgb_prediction_and_get_report(profile_dict):
    """
    Score one candidate profile and summarise the result as text.

    Args:
        profile_dict: mapping of feature column name -> value for one candidate.

    Returns:
        tuple: (report, top_weakness_topic, hire_probability).
        report is a text block with the predicted hire probability and the
        five features with the largest absolute SHAP value.
        top_weakness_topic is the readable name of the feature with the most
        negative SHAP value, or "General Profile Improvement" when none is
        negative.
        On any failure the tuple is (error message, "Unknown", 0.0).
    """
    print("XGBoost Brain: Running prediction...")
    try:
        pipeline = ai_models["pipeline"]
        if pipeline is None:
            return "Error: XGBoost model not loaded.", "Unknown", 0.0
        explainer = ai_models["explainer"]
        feature_names = ai_models["feature_names"]

        candidate_df = pd.DataFrame([profile_dict], columns=ALL_FEATURE_COLUMNS)
        hire_probability = pipeline.predict_proba(candidate_df)[0][1]

        # SHAP works on the classifier's inputs, i.e. the preprocessed row.
        candidate_transformed = pipeline.named_steps['preprocessor'].transform(candidate_df)
        shap_values = explainer.shap_values(candidate_transformed)

        # One row per transformed feature: its name and its SHAP value.
        shap_table = (
            pd.DataFrame(shap_values, columns=feature_names)
            .iloc[0]
            .rename_axis('Feature')
            .reset_index(name='SHAP_Value')
        )
        shap_table['abs_impact'] = shap_table['SHAP_Value'].abs()

        # The biggest weakness is the feature pushing the score down the most.
        negatives = shap_table[shap_table['SHAP_Value'] < 0].sort_values(by='SHAP_Value', ascending=True)
        if negatives.empty:
            top_weakness_topic = "General Profile Improvement"
        else:
            top_weakness_topic = format_feature_name_for_llm(negatives.iloc[0]['Feature'])

        report_parts = [
            "STATISTICAL ANALYSIS REPORT:\n",
            f"Predicted Hire Probability: {hire_probability:.1%}\n",
            "Top 5 Most Important Factors:\n",
        ]
        strongest = shap_table.sort_values(by='abs_impact', ascending=False).head(5)
        for feature, shap_value in zip(strongest['Feature'], strongest['SHAP_Value']):
            direction = "POSITIVE" if shap_value > 0 else "NEGATIVE"
            report_parts.append(f"  - Factor: {feature}, Impact: {direction}\n")
        report = "".join(report_parts)

        print(f"...XGBoost Brain: Report generated. Top weakness: {top_weakness_topic}")
        return report, top_weakness_topic, hire_probability
    except Exception as e:
        print(f"---!!! ERROR during XGBoost prediction: {e} !!!---")
        return f"Error during XGBoost prediction: {e}", "Unknown", 0.0

# --- 6. Generate Individual Charts ---
def generate_individual_charts(profile_dict):
    """
    Draw a radar chart of the candidate's five self-rated skills.

    Each rating is mapped to a level (N_A=0, Beginner=1, Good=2, Excellent=3);
    missing or unrecognised ratings count as 0.

    Args:
        profile_dict: mapping of feature column name -> value for one candidate.

    Returns:
        matplotlib.figure.Figure: the polar chart.
    """
    print("Generating individual skills radar chart...")
    skills = {
        'English': profile_dict.get('Skills English', 'N_A'),
        'MS Word': profile_dict.get('Skills MS Word', 'N_A'),
        'MS Excel': profile_dict.get('Skills MS Excel', 'N_A'),
        'MS Powerpoint': profile_dict.get('Skills MS Powerpoint', 'N_A'),
        'Communication': profile_dict.get('Communication skills', 'N_A')
    }
    level_map = {'N_A': 0, 'Beginner': 1, 'Good': 2, 'Excellent': 3}
    values = [level_map.get(v, 0) for v in skills.values()]
    labels = list(skills.keys())
    N = len(labels)
    angles = np.linspace(0, 2 * np.pi, N, endpoint=False).tolist()
    # Repeat the first point so the polygon closes.
    values += values[:1]
    angles += angles[:1]
    fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    ax.fill(angles, values, color='blue', alpha=0.25)
    ax.plot(angles, values, color='blue', linewidth=2)
    # Pin the radial ticks and range to the four levels before labelling them.
    # Without fixed ticks the labels attach to whatever ticks matplotlib
    # picked for this profile, so "Good" could appear at the wrong radius and
    # the scale changed from candidate to candidate.
    ax.set_yticks([0, 1, 2, 3])
    ax.set_yticklabels(['', 'Beginner', 'Good', 'Excellent'])
    ax.set_ylim(0, 3)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels)
    ax.set_title("Candidate Skills Profile", size=15)
    plt.tight_layout()
    return fig

# --- 7. Generate Global Dashboard ---
def generate_global_dashboard():
    """
    Draw the five-panel overview of all candidates in global_user_df.

    Panels: candidates per stream, experience by stream coloured by work
    status, highest qualification, gender split and top five job goals.

    The function only reads global_user_df. Derived values (total years,
    stream positions) are kept in local variables, so repeated calls no
    longer add helper columns to the shared frame.

    Returns:
        matplotlib.figure.Figure: the dashboard, or a small placeholder
        figure carrying a message when no data is loaded or a chart fails.
    """
    print("Generating global dashboard...")
    users = global_user_df
    if users.empty:
        fig, ax = plt.subplots()
        ax.text(0.5, 0.5, "Global user data not loaded.", ha='center')
        return fig

    fig = plt.figure(figsize=(16, 12))
    gs = fig.add_gridspec(2, 3)  # 2 rows, 3 columns

    try:
        # Chart 1: Domain
        ax1 = fig.add_subplot(gs[0, 0])
        domain_counts = users['Stream'].value_counts()
        ax1.bar(domain_counts.index, domain_counts.values, color='teal')
        ax1.set_title('Candidates by Domain (Stream)')
        ax1.tick_params(axis='x', rotation=25)

        # Chart 2: Scatter Plot
        ax2 = fig.add_subplot(gs[0, 1:])
        # Non-numeric or missing experience values count as 0.
        years = pd.to_numeric(users['Years workexperience'].fillna(0), errors='coerce').fillna(0)
        months = pd.to_numeric(users['Months workexperience'].fillna(0), errors='coerce').fillna(0)
        total_years = years + (months / 12)

        # Each stream gets an integer x position; points are jittered around it.
        streams = users['Stream'].dropna().unique()
        stream_map = {stream: i for i, stream in enumerate(streams)}
        stream_positions = users['Stream'].map(stream_map)

        # NaN never compares equal to itself, so a missing status would only
        # add an empty "nan" legend entry; skip it.
        statuses = users['Work status'].dropna().unique()
        for status in statuses:
            in_status = users['Work status'] == status
            jitter = np.random.normal(0, 0.1, int(in_status.sum()))
            ax2.scatter(
                stream_positions[in_status] + jitter,
                total_years[in_status],
                label=status, alpha=0.6, edgecolors='w'
            )
        ax2.set_xticks(range(len(streams)))
        ax2.set_xticklabels(streams)
        ax2.set_title('Distribution of Experience by Domain (Colored by Work Status)')
        ax2.legend(title="Status")
        ax2.grid(True, linestyle='--', alpha=0.3)

        # Chart 3: Qualification
        ax3 = fig.add_subplot(gs[1, 0])
        qual_counts = users['Highest Qualification'].value_counts().head(7).sort_values()
        ax3.barh(qual_counts.index, qual_counts.values, color='mediumpurple')
        ax3.set_title('Highest Qualification')

        # Chart 4: Gender
        ax4 = fig.add_subplot(gs[1, 1])
        gender_counts = users['gender'].value_counts()
        ax4.pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%',
               colors=['lightblue', 'lightpink', 'lightgrey'], startangle=90)
        ax4.set_title('Gender')

        # Chart 5: Goals
        ax5 = fig.add_subplot(gs[1, 2])
        job_goal_counts = users['Job goal 1'].value_counts().head(5)
        ax5.barh(job_goal_counts.index, job_goal_counts.values, color='gold')
        ax5.set_title('Top 5 Job Goals')

        plt.tight_layout()

    except Exception as e:
        print(f"Error generating global dashboard: {e}")
        # Release the half-drawn dashboard before replacing it; pyplot keeps
        # every open figure alive until it is closed.
        plt.close(fig)
        fig, ax = plt.subplots()
        ax.text(0.5, 0.5, f"Error generating charts: {e}", ha='center')

    return fig

# --- 8. Submit Function ---
# Endpoint shared by the report generator and the follow-up chat. The API key
# is sent in a request header, not in this URL, so it cannot end up in logged
# request lines.
_GEMINI_API_URL = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    "gemini-2.5-flash-preview-09-2025:generateContent"
)


def _to_float(value, default=0.0):
    """Convert a form value to float; None, blank or non-numeric input yields `default`."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


async def _call_gemini(system_prompt, user_text, failure_label):
    """
    Send one generateContent request with Google Search enabled.

    Args:
        system_prompt: text sent as the system instruction.
        user_text: text sent as the user content.
        failure_label: prefix of the error raised on a non-200 response.

    Returns:
        str: the text of the first candidate, with all text parts joined.

    Raises:
        RuntimeError: on a non-200 status or a response without usable text.
        httpx errors propagate unchanged.
    """
    payload = {
        "contents": [{"parts": [{"text": user_text}]}],
        "systemInstruction": {"parts": [{"text": system_prompt}]},
        "generationConfig": {"responseMimeType": "text/plain"},
        "tools": [{"google_search": {}}]
    }
    headers = {
        'Content-Type': 'application/json',
        'x-goog-api-key': MY_GOOGLE_API_KEY,
    }

    transport = httpx.AsyncHTTPTransport(retries=3)
    async with httpx.AsyncClient(transport=transport) as client:
        response = await client.post(_GEMINI_API_URL, json=payload, headers=headers, timeout=90.0)

    if response.status_code != 200:
        raise RuntimeError(f"{failure_label}: {response.text}")

    candidates = response.json().get('candidates')
    if not candidates:
        raise RuntimeError("Invalid RAG response")

    # A reply may be split over several parts; reading only the first one
    # would truncate it.
    parts = (candidates[0].get('content') or {}).get('parts') or []
    text = "".join(part.get('text', '') for part in parts)
    if not text:
        raise RuntimeError("Invalid RAG response")
    return text


async def process_new_user_submission(*args):
    """
    Handle one submitted candidate form end to end.

    Saves the profile, runs the XGBoost prediction, draws the skills chart,
    derives a seniority level, asks Gemini for a career plan and refreshes
    the global dashboard.

    Args:
        *args: form values in the same order as ALL_FEATURE_COLUMNS.

    Returns:
        tuple of six items: hire probability text, categorisation markdown,
        skills figure, action plan text, the action plan text again, and the
        dashboard figure. When the models are not loaded an error tuple of
        the same length is returned.
        NOTE(review): the duplicated plan presumably feeds the chat context
        state — confirm against the UI wiring.
    """
    print("New user submitted. Processing...")
    if ai_models["pipeline"] is None:
        return "ERROR: AI Models not loaded.", "Error", None, "Error", None, None

    new_profile_dict = dict(zip(ALL_FEATURE_COLUMNS, args))

    # Persisting the submission is best effort: a failure is logged and the
    # report is still produced.
    # NOTE(review): NEW_SUBMISSIONS_FILE_PATH is not defined in the visible
    # part of the notebook; if it is missing, the NameError is swallowed here
    # and nothing is saved — confirm it is defined.
    try:
        save_data = pd.DataFrame([new_profile_dict], columns=ALL_FEATURE_COLUMNS)
        save_data['Journey id'] = f"WEB-{int(datetime.datetime.now().timestamp())}"
        save_data['Attachment date'] = datetime.date.today().isoformat()
        save_data['Stream'] = 'Online Submission'
        save_data['Work status'] = 'On programme'
        # Write the header only when the file is being created.
        file_exists = os.path.isfile(NEW_SUBMISSIONS_FILE_PATH)
        save_data.to_csv(NEW_SUBMISSIONS_FILE_PATH, mode='a', header=not file_exists, index=False)
        print("Profile saved to new_submissions.csv")
    except Exception as e:
        print(f"--- ERROR Saving to Drive: {e} ---")

    statistical_report, _top_weakness, hire_probability = run_xgb_prediction_and_get_report(new_profile_dict)
    individual_skill_chart = generate_individual_charts(new_profile_dict)

    # Blank experience fields arrive as None; they count as 0 instead of
    # crashing the whole submission.
    total_years = (
        _to_float(new_profile_dict.get('Years workexperience', 0))
        + _to_float(new_profile_dict.get('Months workexperience', 0)) / 12
    )
    if total_years <= 2: level = 'Junior'
    elif total_years <= 5: level = 'Mid-level'
    else: level = 'Senior'

    categorization_text = f"**Profile Category:**\n"
    categorization_text += f"  - **Level:** {level}\n"
    categorization_text += f"  - **Qualification:** {new_profile_dict.get('Highest Qualification', 'N/A')}\n"
    categorization_text += f"  - **Top Job Goal:** {new_profile_dict.get('Job goal 1', 'N/A')}"

    print("AI Counselor: Calling Gemini for RAG Action Plan...")
    job_goal = new_profile_dict.get('Job goal 1', 'N/A')
    major = new_profile_dict.get('Highest major', 'N/A')

    search_query_1 = f"free online courses for {job_goal} with {major} background"
    search_query_2 = f"best paid certificate programs on Coursera or Udemy for {job_goal}"
    search_query_3 = f"professional networking groups or meetups for {job_goal} in Saudi Arabia"

    system_prompt = f"""
    You are an expert AI Career Counselor. A candidate has just completed their profile.
    Your job is to write a comprehensive, encouraging, and actionable career plan.

    HERE IS THE DATA YOU MUST USE:
    1. THE CANDIDATE'S PROFILE: {json.dumps(new_profile_dict, indent=2)}
    2. THE STATISTICAL ANALYSIS REPORT (from our predictive model): {statistical_report}
    3. A LIST OF *INTERNAL* WORKSHOPS:
       - "Work ethics and an introduction to labor culture"
       - "Initial appointment / Induction", "CV Writing", "Interview Skills"

    YOUR TASK:
    1.  **Analyze:** Read the STATISTICAL REPORT. Identify the candidate's hire probability and their biggest weakness.
    2.  **Search:** **USE THE GOOGLE SEARCH TOOL** to find real-world resources. You will perform 3 searches:
        * Search 1: `{search_query_1}` (for free courses)
        * Search 2: `{search_query_2}` (for paid certificates)
        * Search 3: `{search_query_3}` (for meetups/networking)
    4.  **Synthesize Plan:** Write a comprehensive, multi-part career plan.
        * **Introduction:** Start by being encouraging.
        * **Part 1: Immediate Profile Improvement (The Weakness):** Use the STATISTICAL REPORT to explain their *biggest statistical weakness*.
        * **Part 2: Internal Workshops:** Recommend one *internal workshop* from the list.
        * **Part 3: External Training Plan (Free):** Recommend 1-2 **free courses** you found from Search 1. **You MUST include the source title and the URL.**
        * **Part 4: External Training Plan (Paid/Certificate):** Recommend 1-2 **paid certificate programs** from Search 2. **You MUST include the source title and the URL.**
        * **Part 5: Community & Networking:** Recommend 1-2 **networking groups or meetups** from Search 3. **You MUST include the source title and the URL.**
        * **Conclusion:** End with an encouraging closing statement.
    """

    try:
        final_action_plan = await _call_gemini(system_prompt, "Generate the career plan.", "RAG API call failed")
        print("...RAG Action Plan generated.")
    except Exception as e:
        print(f"---!!! ERROR Calling RAG API: {e} !!!---")
        final_action_plan = f"An error occurred while generating the RAG action plan: {e}"

    updated_dashboard = generate_global_dashboard()

    return (
        f"{hire_probability:.1%}",
        categorization_text,
        individual_skill_chart,
        final_action_plan,
        final_action_plan,
        updated_dashboard
    )

# --- 9. Follow-up Chat Function ---
async def call_gemini_follow_up_chat(user_message: str, chat_history: list, report_context: str):
    """
    Answer a follow-up question about a previously generated report.

    Args:
        user_message: the question typed by the user.
        chat_history: list of [user, bot] pairs; the new exchange is appended
            to this list in place.
        report_context: the report text used as context for the answer.

    Returns:
        tuple: ("", chat_history) — the empty string is returned alongside
        the updated history.
        A blank message is ignored and the history is returned unchanged.
        Without a report, or on an API failure, the bot reply carries the
        explanation or error text instead of an answer.
    """
    # Nothing to send for an empty or whitespace-only message.
    if not user_message or not user_message.strip():
        return "", chat_history

    chat_history.append([user_message, None])
    if not report_context:
        chat_history[-1][1] = "I'm sorry, you must generate a report in the 'New Candidate Report' tab first."
        return "", chat_history

    system_prompt = f"""
    You are an AI Career Counselor. Context:
    {report_context}
    User question: "{user_message}"
    Answer the question using the report context and Google Search if needed.
    """

    try:
        bot_message = await _call_gemini(system_prompt, user_message, "API failed")
    except Exception as e:
        bot_message = f"Error: {e}"

    chat_history[-1][1] = bot_message
    return "", chat_history


# -----------------------------------------------------------------
# --- END OF LOGIC / START OF FULL PAGE UI (MULTI-PAGE TABS) ---
# -----------------------------------------------------------------

print("--- Phase 5: Launching the Gradio MULTI-PAGE App ---")

# Builds the two-tab Gradio app: tab 1 collects a candidate profile and shows
# the prediction, action plan and follow-up chat; tab 2 shows the global dashboard.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# AI-Powered Predictive Hiring & Action Intelligence System")

    # Holds the last generated action plan so the chat can use it as context.
    report_context_state = gr.State(value="")
    # NOTE(review): not wired to any event below; the Chatbot component itself
    # carries the transcript.
    chat_history_state = gr.State(value=[])

    # --- TOP LEVEL TABS ---
    with gr.Tabs():

        # --- PAGE 1: AI COUNSELOR ---
        # Emoji are written as \U escapes so the label survives re-encoding of the file.
        with gr.TabItem("\U0001F916 AI Counselor & Prediction"):
            with gr.Row():
                with gr.Column(scale=3):
                    gr.Markdown("### 1. Candidate Profile Input")
                    # Gradio passes these components' values to the click handler
                    # positionally, so the ORDER of this list is part of the contract
                    # with process_new_user_submission. Do not reorder without
                    # checking that function's signature.
                    all_inputs = []
                    with gr.Accordion("Personal Info", open=True):
                        with gr.Row():
                            gender_input = gr.Dropdown(label="Gender", choices=GENDER_CHOICES, value='Female')
                            age_input = gr.Number(label="Age", value=25)
                        marital_input = gr.Dropdown(label="Marital Status", choices=MARITAL_CHOICES, value='Single')
                        all_inputs.extend([gender_input, age_input, marital_input])

                    with gr.Accordion("Location", open=False):
                        town_input = gr.Textbox(label="Town", value='Riyadh')
                        district_input = gr.Textbox(label="District", value='Central')
                        all_inputs.extend([town_input, district_input])

                    with gr.Accordion("Education & Salary", open=False):
                        salary_input = gr.Dropdown(label="Salary Expectations", choices=SALARY_CHOICES, value='3,000 - 5,000SAR')
                        qual_input = gr.Dropdown(label="Highest Qualification", choices=QUALIFICATION_CHOICES, value='Bachelor')
                        major_input = gr.Textbox(label="Highest Major", value='Computer Science')
                        all_inputs.extend([salary_input, qual_input, major_input])

                    with gr.Accordion("Work Preferences", open=False):
                        shifts_input = gr.Dropdown(label="Shifts", choices=SHIFTS_CHOICES, value='No preference')
                        env_input = gr.Dropdown(label="Environment", choices=ENVIRONMENT_CHOICES, value='Mixed')
                        all_inputs.extend([shifts_input, env_input])

                    with gr.Accordion("Skills", open=False):
                        eng_input = gr.Dropdown(label="English", choices=SKILL_LEVEL_CHOICES, value='Good')
                        word_input = gr.Dropdown(label="Word", choices=SKILL_LEVEL_CHOICES, value='Good')
                        excel_input = gr.Dropdown(label="Excel", choices=SKILL_LEVEL_CHOICES, value='Good')
                        ppt_input = gr.Dropdown(label="PPT", choices=SKILL_LEVEL_CHOICES, value='Good')
                        # comm_input is displayed here but appended to all_inputs at the end.
                        comm_input = gr.Dropdown(label="Communication", choices=COMMUNICATION_CHOICES, value='Good')
                        all_inputs.extend([eng_input, word_input, excel_input, ppt_input])

                    with gr.Accordion("Logistics", open=False):
                        transport_input = gr.Radio(label="Transport?", choices=YES_NO_CHOICES, value='No')
                        license_input = gr.Radio(label="License?", choices=YES_NO_CHOICES, value='No')
                        # cv_input is displayed here but appended to all_inputs at the end.
                        cv_input = gr.Radio(label="CV?", choices=YES_NO_CHOICES, value='No')
                        all_inputs.extend([transport_input, license_input])

                    with gr.Accordion("Career Goals", open=False):
                        goal1_input = gr.Textbox(label="Job Goal 1", value='Data Analyst')
                        goal2_input = gr.Textbox(label="Job Goal 2", value='Developer')
                        goal3_input = gr.Textbox(label="Job Goal 3", value='IT Support')
                        all_inputs.extend([goal1_input, goal2_input, goal3_input])

                    with gr.Accordion("Experience", open=False):
                        with gr.Row():
                            years_exp_input = gr.Number(label="Years", value=1)
                            months_exp_input = gr.Number(label="Months", value=0)
                        all_inputs.extend([years_exp_input, months_exp_input])

                    # Communication and CV go last in the positional argument list
                    # (presumably to match the handler's trailing parameters; verify
                    # against process_new_user_submission before changing).
                    all_inputs.append(comm_input)
                    all_inputs.append(cv_input)

                    submit_btn = gr.Button("Generate AI Report", variant="primary")
                    status_output = gr.Textbox(label="Status", interactive=False, value="App starting...")

                with gr.Column(scale=7):
                    gr.Markdown("### 2. AI Command Center")

                    with gr.Row():
                        prediction_output = gr.Textbox(label="Hire Probability", value="N/A", scale=1)
                        category_output = gr.Markdown(value="*Categorization*") # Removed 'scale' arg

                    with gr.Row():
                        with gr.Column(scale=1):
                            skill_chart_output = gr.Plot(label="Skills Radar")
                        with gr.Column(scale=2):
                            action_plan_output = gr.Markdown(value="*Action Plan will appear here*", label="AI Action Plan")

                    gr.Markdown("### 3. AI Counselor Chat")
                    chat_window = gr.Chatbot(label="Ask follow-up questions about the plan...", height=300)
                    with gr.Row():
                        chat_textbox = gr.Textbox(show_label=False, placeholder="Ask me anything about the report above...", scale=8)
                        chat_btn = gr.Button("Send", scale=1)

        # --- PAGE 2: DASHBOARD ---
        with gr.TabItem("\U0001F4CA Executive Analytics Dashboard"):
            gr.Markdown("### Global User Population Analysis")
            global_dashboard_plot = gr.Plot(label="Global Dashboard")
            refresh_button = gr.Button("Refresh Global Dashboard", variant="secondary")


    # --- CONNECTING FUNCTIONS ---
    # On page load: run on_app_load, then draw the dashboard.
    demo.load(fn=on_app_load, outputs=[status_output]).then(fn=generate_global_dashboard, outputs=[global_dashboard_plot])

    # The handler returns six values; the action plan is written both to the
    # visible Markdown and to report_context_state for the follow-up chat.
    submit_btn.click(
        fn=process_new_user_submission,
        inputs=all_inputs,
        outputs=[prediction_output, category_output, skill_chart_output, action_plan_output, report_context_state, global_dashboard_plot]
    )

    refresh_button.click(fn=generate_global_dashboard, outputs=[global_dashboard_plot])

    # The Send button and pressing Enter in the textbox trigger the same handler.
    chat_event = dict(
        fn=call_gemini_follow_up_chat,
        inputs=[chat_textbox, chat_window, report_context_state],
        outputs=[chat_textbox, chat_window],
    )
    chat_btn.click(**chat_event)
    chat_textbox.submit(**chat_event)

print("Launching...")
# debug=True keeps this cell running so errors and logs stay visible in Colab.
# NOTE(review): share=True serves the app on a public *.gradio.live URL and no
# `auth=` is configured; confirm this is acceptable for an app that collects
# candidate profiles and spends the Gemini API quota.
demo.launch(debug=True, share=True)

--- Phase 4 & 5: Defining App Logic and Launching Multi-Page UI ---
Successfully loaded Google API Key from Colab Secrets.
--- Phase 5: Launching the Gradio MULTI-PAGE App ---
Launching...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://6c4838b7db37cbe914.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Gradio App Loading: Loading *XGBoost* AI models from Google Drive...
...XGBoost Models loaded successfully.
...Loaded 2000 users for Global Dashboard.
Generating global dashboard...
Gradio App Loading: Loading *XGBoost* AI models from Google Drive...
...XGBoost Models loaded successfully.
...Loaded 2000 users for Global Dashboard.
New user submitted. Processing...
Profile saved to new_submissions.csv
XGBoost Brain: Running prediction...
...XGBoost Brain: Report generated. Top weakness: Highest Qualification
Generating individual skills radar chart...
AI Counselor: Calling Gemini for RAG Action Plan...
...RAG Action Plan generated.
Generating global dashboard...
New user submitted. Processing...
Profile saved to new_submissions.csv
XGBoost Brain: Running prediction...
...XGBoost Brain: Report generated. Top weakness: Transport available
Generating individual skills radar chart...
AI Counselor: Calling Gemini for RAG Action Plan...
...RAG Action Plan generated.
Generating global dashboar



Cell 8

Cell 9

Cell 10