In [None]:

# Random Forest

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report
)

# --- 1. Load Data ---
# This cell trains a Random Forest on one stratified 70/30 train/test split of
# the CSV below and prints accuracy, macro/weighted precision-recall-F1,
# weighted one-vs-rest ROC AUC and a per-class classification report.
file_name = 'encoded_jobs_dataset.csv'

try:
    df = pd.read_csv(file_name)

    print(f"Successfully loaded {file_name}")
    print(f"Shape of data: {df.shape}")
    print("-" * 40)

    # --- 2. Define Features (X) and Target (y) ---
    # Positional convention used by this notebook: the last column is the
    # target and every column before it is a feature.
    target_column = df.columns[-1]
    feature_columns = df.columns[:-1]

    X = df[feature_columns]
    y_categorical = df[target_column]

    print(f"Identified Target Column: {target_column}")
    print(f"Identified {len(feature_columns)} Feature Columns.")
    print("-" * 40)

    # --- 3. Preprocessing: Encode the Target Variable ---
    # The model needs the target to be numbers (0, 1, 2...)

    encoder = LabelEncoder()
    y = encoder.fit_transform(y_categorical)

    # Store the class names for later reports
    class_names = encoder.classes_
    n_classes = len(class_names)

    # Report the column that was actually detected rather than a fixed name.
    print(f"Target column ({target_column}) encoded into numbers.")
    print(f"Found {n_classes} unique classes.")
    print("-" * 40)

    # --- 4. Create Training and Test Sets ---
    # Stratify by 'y' so the train and test sets keep the class proportions
    # of the full data set.

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.3,
        random_state=42,
        stratify=y
    )

    print("Data split into:")
    print(f"  Training set: {X_train.shape[0]} samples")
    print(f"  Test set:     {X_test.shape[0]} samples")
    print("-" * 40)

    # --- 5. Train the Random Forest Algorithm ---
    print("Training Random Forest model...")

    # class_weight='balanced' re-weights classes to counter class imbalance;
    # n_jobs=-1 uses all available cores; random_state fixes the forest.
    rf_model = RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        class_weight='balanced',
        n_jobs=-1
    )
    rf_model.fit(X_train, y_train)

    print("Model training complete.")
    print("-" * 40)

    # --- 6. Make Predictions ---
    y_pred = rf_model.predict(X_test)
    # Class probabilities are required for the ROC AUC score
    y_pred_proba = rf_model.predict_proba(X_test)

    # --- 7. Calculate Model Performance Metrics ---
    print("--- Overall Model Performance Metrics ---")

    # Accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print("\n1. Accuracy:")
    print(f"   {accuracy:.4f}  (or {accuracy * 100:.2f}%)")

    # --- Precision, Recall, F1-Score (Averages) ---
    # 'macro' treats all classes equally, good for imbalance
    # 'weighted' accounts for imbalance (favors more common classes)
    # zero_division=0 scores a never-predicted class as 0 without a warning,
    # matching the other model cells in this notebook.

    print("\n2. Average Precision, Recall, and F1-Score:")

    precision_macro = precision_score(y_test, y_pred, average='macro', zero_division=0)
    recall_macro = recall_score(y_test, y_pred, average='macro', zero_division=0)
    f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)

    precision_weighted = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_weighted = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1_weighted = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    print(f"   Macro Average:   Precision={precision_macro:.4f}, Recall={recall_macro:.4f}, F1-Score={f1_macro:.4f}")
    print(f"   Weighted Avg:  Precision={precision_weighted:.4f}, Recall={recall_weighted:.4f}, F1-Score={f1_weighted:.4f}")

    # --- ROC AUC Score ---
    # This must be handled differently for multi-class problems

    print("\n3. ROC AUC Score (One-vs-Rest):")
    # 'ovr' = One-vs-Rest, 'weighted' averages the score for each class
    roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted')
    print(f"   Weighted OVR ROC AUC: {roc_auc:.4f}")

    # --- Classification Report ("Prediction") ---
    # This shows Precision, Recall, and F1-score for *each class*.
    # Passing `labels` explicitly keeps the rows aligned with `target_names`
    # even if some class is absent from the test split; names are cast to str
    # because the report formats them as text.

    print("\n4. Classification Report (Metrics per Class):")
    report = classification_report(
        y_test,
        y_pred,
        labels=np.arange(n_classes),
        target_names=[str(name) for name in class_names],
        zero_division=0
    )
    print(report)


except FileNotFoundError:
    print("\n--- ERROR ---")
    print(f"File not found: '{file_name}'")
    print("Please make sure you have uploaded the file to your Colab session.")
except Exception as e:
    # Best-effort reporting: any other failure is printed, not re-raised.
    print(f"An error occurred: {e}")

Successfully loaded encoded_jobs_dataset.csv
Shape of data: (852, 178)
----------------------------------------
Identified Target Column: Job_Role
Identified 177 Feature Columns.
----------------------------------------
Target column (Job_Role) encoded into numbers.
Found 25 unique classes.
----------------------------------------
Data split into:
  Training set: 596 samples
  Test set:     256 samples
----------------------------------------
Training Random Forest model...
Model training complete.
----------------------------------------
--- Overall Model Performance Metrics ---

1. Accuracy:
   0.9141  (or 91.41%)

2. Average Precision, Recall, and F1-Score:
   Macro Average:   Precision=0.9319, Recall=0.9192, F1-Score=0.9229
   Weighted Avg:  Precision=0.9193, Recall=0.9141, F1-Score=0.9136

3. ROC AUC Score (One-vs-Rest):
   Weighted OVR ROC AUC: 0.9951

4. Classification Report (Metrics per Class):
                           precision    recall  f1-score   support

               

In [None]:
# 10 fold stratified cross validation - logistic regression

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)
from sklearn.preprocessing import LabelEncoder
import warnings

# Suppress warnings for cleaner output (e.g., convergence warnings or zero_division).
# NOTE: this filter is process-wide; it also silences warnings raised by any
# code executed after this point in the same kernel.
warnings.filterwarnings('ignore')

# --- 1. Load and Prepare Data ---
# This cell evaluates a logistic regression with stratified K-fold
# cross-validation and reports per-fold averages plus an aggregate
# confusion matrix.

file_name = 'encoded_jobs_dataset.csv'

try:
    df = pd.read_csv(file_name)
    print(f"Successfully loaded '{file_name}'.")
    print("-" * 30)

    # Assume the last column is the target (y) and all others are features (X)
    X = df.iloc[:, :-1]
    y_categorical = df.iloc[:, -1]

    # Encode the categorical target variable into integers 0..n_classes-1
    le = LabelEncoder()
    y = le.fit_transform(y_categorical)

    # Get class names for reporting
    class_names = le.classes_
    n_classes = len(class_names)

    print(f"Features (X) shape: {X.shape}")
    print(f"Target (y) shape: {y.shape}")
    print(f"Number of classes: {n_classes}")
    print(f"Class names: {list(class_names)}")
    print("-" * 30)

    # --- 2. Set up Cross-Validation ---

    N_SPLITS = 10  # You can change this (e.g., 5 or 10)
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

    # One entry per fold for each metric
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    aucs = []

    # Running sum of the per-fold confusion matrices
    total_cm = np.zeros((n_classes, n_classes), dtype=int)

    print(f"Starting Stratified {N_SPLITS}-Fold Cross-Validation...")
    print("-" * 30)

    # --- 3. Run Cross-Validation Loop ---

    for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
        # Split the data (X is a DataFrame, y is a numpy array)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # `multi_class` is deprecated since scikit-learn 1.5 and is therefore
        # not passed; with the 'lbfgs' solver and a target of 3+ classes the
        # default already fits a multinomial model.
        model = LogisticRegression(
            solver='lbfgs',
            max_iter=1000,
            random_state=42
        )

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # Class probabilities are required for ROC AUC
        y_proba = model.predict_proba(X_test)

        # --- Calculate and store metrics for this fold ---
        # average='weighted' weights each class by its support;
        # zero_division=0 scores a never-predicted class as 0 without warning.
        accuracies.append(accuracy_score(y_test, y_pred))
        precisions.append(precision_score(y_test, y_pred, average='weighted', zero_division=0))
        recalls.append(recall_score(y_test, y_pred, average='weighted', zero_division=0))
        f1s.append(f1_score(y_test, y_pred, average='weighted', zero_division=0))

        # ROC AUC needs probabilities and multi_class='ovr' (One-vs-Rest).
        # Guarded like the single-split cells: a fold for which the score is
        # undefined is recorded as NaN instead of aborting the whole run.
        try:
            aucs.append(roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted'))
        except ValueError as e:
            aucs.append(np.nan)
            print(f"Could not calculate ROC AUC for fold {fold}: {e}")

        # Explicit labels keep every fold's matrix at (n_classes, n_classes)
        cm_fold = confusion_matrix(y_test, y_pred, labels=range(n_classes))
        total_cm += cm_fold

    print("Cross-Validation complete.")
    print("=" * 30)
    print("           METRIC RESULTS             ")
    print("=" * 30)

    # --- 4. Report Average Metrics ---

    print(f"Average Accuracy:  {np.mean(accuracies):.4f} (± {np.std(accuracies):.4f})")
    print(f"Average Precision: {np.mean(precisions):.4f} (± {np.std(precisions):.4f})")
    print(f"Average Recall:    {np.mean(recalls):.4f} (± {np.std(recalls):.4f})")
    print(f"Average F1-Score:  {np.mean(f1s):.4f} (± {np.std(f1s):.4f})")
    # NaN-aware so folds with an undefined AUC do not poison the average
    print(f"Average ROC AUC:   {np.nanmean(aucs):.4f} (± {np.nanstd(aucs):.4f})")
    print("\n")


    # --- 5. Report TP, TN, FP, FN from Aggregate Matrix ---

    print("=" * 30)
    print("  AGGREGATE CONFUSION MATRIX & METRICS  ")
    print("=" * 30)

    print("Aggregate Confusion Matrix (Rows=Actual, Cols=Predicted):")
    # Pretty print the matrix with labels
    cm_df = pd.DataFrame(total_cm, index=class_names, columns=class_names)
    print(cm_df)
    print("\n")

    print("Per-Class TP, TN, FP, FN (Calculated from aggregate matrix):")
    total_samples = total_cm.sum()

    for i, class_name in enumerate(class_names):
        # One-vs-rest counts for class i: diagonal = TP, rest of column i = FP,
        # rest of row i = FN, everything else = TN.
        TP = total_cm[i, i]
        FP = total_cm[:, i].sum() - TP
        FN = total_cm[i, :].sum() - TP
        TN = total_samples - (TP + FP + FN)

        print(f"\n--- Class: {class_name} ---")
        print(f"  True Positives (TP):  {TP}")
        print(f"  True Negatives (TN):  {TN}")
        print(f"  False Positives (FP): {FP}")
        print(f"  False Negatives (FN): {FN}")

except FileNotFoundError:
    print(f"Error: File '{file_name}' not found.")
    print("Please make sure you have uploaded the file to your Colab session.")
except Exception as e:
    # Best-effort reporting: any other failure is printed, not re-raised.
    print(f"An error occurred: {e}")

Successfully loaded 'encoded_jobs_dataset.csv'.
------------------------------
Features (X) shape: (852, 177)
Target (y) shape: (852,)
Number of classes: 25
Class names: ['Architect', 'Business Analyst', 'Civil Engineer', 'Cybersecurity Analyst', 'Data Analyst', 'Doctor', 'Education Consultant', 'Electrical Engineer', 'Financial Analyst', 'Graphic Designer', 'HR Executive', 'Lawyer', 'Legal Advisor', 'Machine Learning Engineer', 'Marketing Manager', 'Mechanical Engineer', 'Medical Researcher', 'Network Engineer', 'Nurse', 'Pharmacist', 'Professor', 'Project Manager', 'Psychologist', 'Software Developer', 'Teacher']
------------------------------
Starting Stratified 10-Fold Cross-Validation...
------------------------------
Cross-Validation complete.
           METRIC RESULTS             
Average Accuracy:  0.9566 (± 0.0183)
Average Precision: 0.9600 (± 0.0184)
Average Recall:    0.9566 (± 0.0183)
Average F1-Score:  0.9552 (± 0.0195)
Average ROC AUC:   0.9996 (± 0.0004)


  AGGREGATE CO

In [None]:
# SVM
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC  # Import Support Vector Classifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)
from sklearn.preprocessing import LabelEncoder
import warnings

# Suppress warnings for cleaner output
# NOTE: this filter is process-wide; it also silences warnings raised by any
# code executed after this point in the same kernel.
warnings.filterwarnings('ignore')


def per_class_confusion_counts(cm):
    """Return one-vs-rest TP, TN, FP and FN counts for every class.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix with rows = actual class and columns = predicted
        class.

    Returns
    -------
    tuple of numpy.ndarray
        ``(tp, tn, fp, fn)``; each array has one entry per class, in the
        row/column order of ``cm``.
    """
    cm = np.asarray(cm)
    tp = np.diag(cm)                # correctly predicted samples of each class
    fp = cm.sum(axis=0) - tp        # rest of the predicted-class column
    fn = cm.sum(axis=1) - tp        # rest of the actual-class row
    tn = cm.sum() - (tp + fp + fn)  # everything not involving the class
    return tp, tn, fp, fn


# --- 1. Load and Prepare Data ---
# This cell trains a linear-kernel SVM on a single stratified 80/20 split and
# prints weighted metrics plus the class-averaged confusion counts.

file_name = 'encoded_jobs_dataset.csv'

try:
    df = pd.read_csv(file_name)

    # Assume the last column is the target (y) and all others are features (X)
    X = df.iloc[:, :-1]
    y_categorical = df.iloc[:, -1]

    # Encode the categorical target variable into integers 0..n_classes-1
    le = LabelEncoder()
    y = le.fit_transform(y_categorical)

    # Get class names for reporting
    class_names = le.classes_
    n_classes = len(class_names)

    # --- 2. Split Data (No Cross-Validation) ---

    # We will do a single 80/20 train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.20,  # 20% for testing
        random_state=42,
        stratify=y
    )

    # --- 3. Initialize and Train the SVM Model ---

    print("Training the SVM model...")
    # probability=True is needed so predict_proba is available for ROC AUC
    model = SVC(kernel='linear', probability=True, random_state=42)
    model.fit(X_train, y_train)
    print("Model training complete.")

    # --- 4. Make Predictions ---

    print("Evaluating model on the test set...")
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    # --- 5. Calculate Requested Metrics ---

    # Calculate overall average metrics
    # 'weighted' average accounts for class imbalance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    try:
        auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted')
    except ValueError as e:
        # Keep the remaining metrics usable when AUC is undefined
        auc = np.nan
        print(f"Could not calculate ROC AUC: {e}")

    # Per-class TP, TN, FP, FN. The original loop appended FN to the FP list;
    # the helper keeps the two apart.
    cm = confusion_matrix(y_test, y_pred, labels=range(n_classes))
    tps, tns, fps, fns = per_class_confusion_counts(cm)

    # Calculate the average of TP, TN, FP, FN across all classes.
    # In single-label multiclass data every misclassified sample is one FP
    # (for the predicted class) and one FN (for the actual class), so the FP
    # and FN averages coincide even though the per-class values differ.
    avg_tp = np.mean(tps)
    avg_tn = np.mean(tns)
    avg_fp = np.mean(fps)
    avg_fn = np.mean(fns)

    print("\n" + "=" * 30)
    print("      CONCISE METRIC RESULTS (SVM)      ")
    print("=" * 30)

    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    print(f"ROC AUC:   {auc:.4f}")
    print("-" * 30)
    print(f"Average True Positives (TP):  {avg_tp:.4f}")
    print(f"Average True Negatives (TN):  {avg_tn:.4f}")
    print(f"Average False Positives (FP): {avg_fp:.4f}")
    print(f"Average False Negatives (FN): {avg_fn:.4f}")

except FileNotFoundError:
    print(f"Error: File '{file_name}' not found.")
    print("Please make sure you have uploaded the file to your Colab session.")
except Exception as e:
    # Best-effort reporting: any other failure is printed, not re-raised.
    print(f"An error occurred: {e}")

Training the SVM model...
Model training complete.
Evaluating model on the test set...

      CONCISE METRIC RESULTS (SVM)      
Accuracy:  0.9298
Precision: 0.9385
Recall:    0.9298
F1-Score:  0.9276
ROC AUC:   0.9991
------------------------------
Average True Positives (TP):  6.3600
Average True Negatives (TN):  163.6800
Average False Positives (FP): 0.4800
Average False Negatives (FN): 0.4800


In [None]:
# ADA BOOST

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)
from sklearn.preprocessing import LabelEncoder
import warnings

# Silence library warnings so the cell output only shows the metric summary.
warnings.filterwarnings('ignore')

# --- 1. Load and Prepare Data ---
# This cell fits AdaBoost (200 estimators) on a single stratified 80/20 split
# and prints weighted metrics plus class-averaged confusion counts.

file_name = 'encoded_jobs_dataset.csv'

try:
    df = pd.read_csv(file_name)

    # Positional convention: final column = target, earlier columns = features.
    X, y_categorical = df.iloc[:, :-1], df.iloc[:, -1]

    # Map the target values to integer codes 0..n_classes-1.
    le = LabelEncoder()
    y = le.fit_transform(y_categorical)

    class_names = le.classes_
    n_classes = len(class_names)

    # --- 2. Split Data (No Cross-Validation) ---
    # One stratified hold-out split: 80% train, 20% test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=42, stratify=y
    )

    # --- 3. Initialize and Train the AdaBoost Model ---
    print("Training the AdaBoost model with n_estimators=200...")

    # NOTE(review): no base estimator is passed, so the library default weak
    # learner is used; the recorded run of this cell scored ~13% accuracy.
    model = AdaBoostClassifier(n_estimators=200, random_state=42)
    model.fit(X_train, y_train)
    print("Model training complete.")

    # --- 4. Make Predictions ---
    print("Evaluating model on the test set...")
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    # --- 5. Calculate Requested Metrics ---
    # Support-weighted averages; zero_division=0 scores never-predicted
    # classes as 0.
    weighted = {'average': 'weighted', 'zero_division': 0}
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, **weighted)
    recall = recall_score(y_test, y_pred, **weighted)
    f1 = f1_score(y_test, y_pred, **weighted)

    try:
        auc = roc_auc_score(y_test, y_proba, multi_class='ovr', average='weighted')
    except ValueError as e:
        auc = np.nan
        print(f"Could not calculate ROC AUC: {e}")

    # One-vs-rest confusion counts for all classes at once:
    # diagonal = TP, column remainder = FP, row remainder = FN, rest = TN.
    cm = confusion_matrix(y_test, y_pred, labels=range(n_classes))
    total_samples = cm.sum()

    tps = np.diag(cm)
    fps = cm.sum(axis=0) - tps
    fns = cm.sum(axis=1) - tps
    tns = total_samples - (tps + fps + fns)

    # Class-averaged counts
    avg_tp, avg_tn = np.mean(tps), np.mean(tns)
    avg_fp, avg_fn = np.mean(fps), np.mean(fns)

    banner = "=" * 30
    print("\n" + banner)
    print(" CONCISE METRIC RESULTS (AdaBoost, 200) ")
    print(banner)

    for label, value in (
        ("Accuracy:  ", accuracy),
        ("Precision: ", precision),
        ("Recall:    ", recall),
        ("F1-Score:  ", f1),
        ("ROC AUC:   ", auc),
    ):
        print(f"{label}{value:.4f}")

    print("-" * 30)

    for label, value in (
        ("Average True Positives (TP):  ", avg_tp),
        ("Average True Negatives (TN):  ", avg_tn),
        ("Average False Positives (FP): ", avg_fp),
        ("Average False Negatives (FN): ", avg_fn),
    ):
        print(f"{label}{value:.4f}")

except FileNotFoundError:
    print(f"Error: File '{file_name}' not found.")
    print("Please make sure you have uploaded the file to your Colab session.")
except Exception as e:
    print(f"An error occurred: {e}")

Training the AdaBoost model with n_estimators=200...
Model training complete.
Evaluating model on the test set...

 CONCISE METRIC RESULTS (AdaBoost, 200) 
Accuracy:  0.1345
Precision: 0.0701
Recall:    0.1345
F1-Score:  0.0612
ROC AUC:   0.7691
------------------------------
Average True Positives (TP):  0.9200
Average True Negatives (TN):  158.2400
Average False Positives (FP): 5.9200
Average False Negatives (FN): 5.9200


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import plotly.express as px
import warnings

# NOTE: this filter is process-wide; it also silences warnings raised by any
# code executed after this point in the same kernel.
warnings.filterwarnings('ignore')

# Heuristic cut-off (fraction correct) above which the 3D projection is
# described as separating the classes well. Tune as needed.
GOOD_SEPARATION_ACCURACY = 0.80

# --- 1. Load and Prepare Data ---
# This cell projects the features onto 3 principal components, fits a linear
# SVM in that space as a rough separability check, and draws an interactive
# 3D scatter plot coloured by job role.

file_name = 'encoded_jobs_dataset.csv'

try:
    df = pd.read_csv(file_name)

    # Assume the last column is the target (y) and all others are features (X)
    X = df.iloc[:, :-1]
    y_categorical = df.iloc[:, -1] # This is the 'Job_Role' string

    # Encode the categorical target variable into integers 0..n_classes-1
    le = LabelEncoder()
    y_encoded = le.fit_transform(y_categorical)
    n_classes = len(le.classes_)

    print(f"Original data shape: {X.shape}")
    print(f"Number of classes: {n_classes}")
    print("-" * 30)

    # --- 2. Reduce Dimensions with PCA ---

    # Report the real feature count instead of a hard-coded number.
    print(f"Reducing {X.shape[1]} dimensions to 3 using PCA...")
    # random_state keeps the projection reproducible if PCA picks a
    # randomized solver; it matches the seed used by the other models.
    pca = PCA(n_components=3, random_state=42)
    X_3d = pca.fit_transform(X)

    # Create a new DataFrame for plotting
    df_3d = pd.DataFrame(X_3d, columns=['PC1', 'PC2', 'PC3'])
    df_3d['Job_Role'] = y_categorical # Add the string names for coloring
    df_3d['target'] = y_encoded       # Add the encoded numbers

    print("Data reduction complete.")
    print(f"New 3D data shape: {X_3d.shape}")
    print("-" * 30)

    # --- 3. Train an SVM on the 3D data ---

    print("Training SVM on 3D data (using all data for visualization)...")
    # The model is fitted and scored on the same full data set, so the figure
    # below is a training accuracy, not a held-out estimate.
    model_3d = SVC(kernel='linear', random_state=42)
    model_3d.fit(X_3d, y_encoded)

    y_pred_3d = model_3d.predict(X_3d)
    accuracy_3d = accuracy_score(y_encoded, y_pred_3d)

    print(f"Accuracy of SVM in 3D space: {accuracy_3d * 100:.2f}%")
    # Only claim good separation when the measured accuracy supports it.
    if accuracy_3d >= GOOD_SEPARATION_ACCURACY:
        print("This shows the 3D space still separates the classes well!")
    else:
        print("The 3D projection loses much of the class separation; interpret the plot with caution.")
    print("-" * 30)

    # --- 4. Generate the 3D Plot ---

    print("Generating interactive 3D plot...")

    fig = px.scatter_3d(
        df_3d,
        x='PC1',
        y='PC2',
        z='PC3',
        color='Job_Role',  # Color the points by their job name
        title=f'3D View of Job Roles (SVM Accuracy in this space: {accuracy_3d * 100:.2f}%)'
    )

    # Make markers smaller for a cleaner look
    fig.update_traces(marker=dict(size=4))

    # Show the plot
    fig.show()

except FileNotFoundError:
    print(f"Error: File '{file_name}' not found.")
    print("Please make sure you have uploaded the file to your Colab session.")
except Exception as e:
    # Best-effort reporting: any other failure is printed, not re-raised.
    print(f"An error occurred: {e}")

Original data shape: (852, 177)
Number of classes: 25
------------------------------
Reducing 177 dimensions to 3 using PCA...
Data reduction complete.
New 3D data shape: (852, 3)
------------------------------
Training SVM on 3D data (using all data for visualization)...
Accuracy of SVM in 3D space: 21.48%
This shows the 3D space still separates the classes well!
------------------------------
Generating interactive 3D plot...
