In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression # A simple, interpretable model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import os

In [2]:
# --- Configuration ---
# Path to the data with BERT similarity scores
INPUT_CSV_PATH = "./data/resume_job_match_with_similarity.csv"
# Threshold for defining a "good match" from the original match_score (1-5)
GOOD_MATCH_THRESHOLD = 4

print("Configuration loaded for modeling.")
print(f"Input data from: {INPUT_CSV_PATH}")
print(f"Good match threshold: {GOOD_MATCH_THRESHOLD}")

# Load the dataset
if not os.path.exists(INPUT_CSV_PATH):
    print(f"Error: Input data file not found at '{INPUT_CSV_PATH}'.")
    print("Please ensure the previous BERT matching script ran successfully and saved the file.")
else:
    try:
        df = pd.read_csv(INPUT_CSV_PATH)
        print(f"Loaded data for modeling. Shape: {df.shape}")
        print("\nSample of data for modeling (first 5 rows):")
        print(df[['match_score', 'bert_similarity_score']].head())
    except Exception as e:
        print(f"Error loading data for modeling: {e}")


Configuration loaded for modeling.
Input data from: ./data/resume_job_match_with_similarity.csv
Good match threshold: 4
Loaded data for modeling. Shape: (10000, 6)

Sample of data for modeling (first 5 rows):
   match_score  bert_similarity_score
0            4               0.652394
1            4               0.372719
2            5               0.447902
3            4               0.516451
4            5               0.451312


In [3]:
# Cell 8 (or new notebook Cell 2): Prepare Data for Classification

# Create the target variable: 1 for "good match", 0 for "not a good match"
df['is_good_match'] = (df['match_score'] >= GOOD_MATCH_THRESHOLD).astype(int)

print(f"\nCreated 'is_good_match' column. Counts:")
print(df['is_good_match'].value_counts())

# Define features (X) and target (y)
# For now, we'll use only the BERT similarity score as our feature
X = df[['bert_similarity_score']]
y = df['is_good_match']

# Split the data into training and testing sets
# test_size=0.2 means 20% of the data will be used for testing
# random_state for reproducibility
# For very small datasets, stratify may fail if a class has <2 samples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nData split into training ({len(X_train)} samples) and testing ({len(X_test)} samples).")
print(f"Training target distribution:\n{y_train.value_counts(normalize=True)}")
print(f"Testing target distribution:\n{y_test.value_counts(normalize=True)}")



Created 'is_good_match' column. Counts:
is_good_match
1    5401
0    4599
Name: count, dtype: int64

Data split into training (8000 samples) and testing (2000 samples).
Training target distribution:
is_good_match
1    0.53825
0    0.46175
Name: proportion, dtype: float64
Testing target distribution:
is_good_match
1    0.5475
0    0.4525
Name: proportion, dtype: float64


In [4]:
# Cell 9 (or new notebook Cell 3): Train and Evaluate the Model

print("\nTraining Logistic Regression model...")
# Initialize and train the Logistic Regression model
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
print("Model training complete.")

# Make predictions on the test set
y_pred = model.predict(X_test)

print("\nEvaluating model performance:")
# Calculate common classification metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nModel trained and evaluated. This model can now be used for basic resume screening/shortlisting.")
print("Candidates with a predicted 'is_good_match' of 1 would be shortlisted.")




Training Logistic Regression model...
Model training complete.

Evaluating model performance:
Accuracy: 0.6765
Precision: 0.6821
Recall: 0.7662
F1-Score: 0.7217

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.57      0.61       905
           1       0.68      0.77      0.72      1095

    accuracy                           0.68      2000
   macro avg       0.67      0.67      0.67      2000
weighted avg       0.68      0.68      0.67      2000


Model trained and evaluated. This model can now be used for basic resume screening/shortlisting.
Candidates with a predicted 'is_good_match' of 1 would be shortlisted.


In [5]:
import numpy as np

# --- Configuration ---
# Path to the data with BERT similarity scores (output from previous step)
INPUT_CSV_PATH_WITH_SIMILARITY = "./data/resume_job_match_with_similarity.csv"
OUTPUT_CSV_PATH_WITH_GENDER = "./data/resume_job_match_with_gender.csv"

# Load the dataset with similarity scores
if not os.path.exists(INPUT_CSV_PATH_WITH_SIMILARITY):
    print(f"Error: Input data file not found at '{INPUT_CSV_PATH_WITH_SIMILARITY}'.")
    print("Please ensure the previous BERT matching script ran successfully and saved the file.")
else:
    try:
        df_with_sim = pd.read_csv(INPUT_CSV_PATH_WITH_SIMILARITY)
        print(f"Loaded data for gender simulation. Shape: {df_with_sim.shape}")
    except Exception as e:
        print(f"Error loading data for gender simulation: {e}")

# Simulate a 'gender' column
# For demonstration, we'll assign 'Male' or 'Female' randomly.
# In a real research scenario, you might want to simulate a specific distribution
# or even introduce a subtle bias in the synthetic data generation itself
# to then demonstrate mitigation.
np.random.seed(42) # for reproducibility
df_with_sim['simulated_gender'] = np.random.choice(['Male', 'Female'], size=len(df_with_sim))

print("\nSimulated 'simulated_gender' column. Counts:")
print(df_with_sim['simulated_gender'].value_counts())

print("\nSample of data with simulated gender (first 5 rows):")
print(df_with_sim[['resume', 'job_description', 'bert_similarity_score', 'match_score', 'simulated_gender']].head())

# Save the DataFrame with the simulated gender
try:
    df_with_sim.to_csv(OUTPUT_CSV_PATH_WITH_GENDER, index=False)
    print(f"\nDataFrame with simulated gender saved to: {OUTPUT_CSV_PATH_WITH_GENDER}")
except Exception as e:
    print(f"Error saving data with simulated gender: {e}")


Loaded data for gender simulation. Shape: (10000, 6)

Simulated 'simulated_gender' column. Counts:
simulated_gender
Male      5013
Female    4987
Name: count, dtype: int64

Sample of data with simulated gender (first 5 rows):
                                              resume  \
0  Experienced professional skilled in SQL, Power...   
1  Experienced professional skilled in Python, De...   
2  Experienced professional skilled in wait, Git,...   
3  Experienced professional skilled in return, De...   
4  Experienced professional skilled in REST APIs,...   

                                     job_description  bert_similarity_score  \
0  Data Analyst needed with experience in SQL, Ex...               0.652394   
1  Data Scientist needed with experience in Stati...               0.372719   
2  Software Engineer needed with experience in Sy...               0.447902   
3  ML Engineer needed with experience in Python, ...               0.516451   
4  Software Engineer needed with experienc

In [6]:
# Cell 11 (or new notebook Cell 1): Setup for Bias Measurement

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from fairlearn.metrics import MetricFrame, demographic_parity_ratio, equalized_odds_ratio
import os
import numpy as np

# --- Configuration ---
# Path to the data with BERT similarity scores and simulated gender
INPUT_CSV_PATH_WITH_GENDER = "./data/resume_job_match_with_gender.csv"
GOOD_MATCH_THRESHOLD = 4 # Same threshold as before for 'is_good_match'

print("Configuration loaded for bias measurement.")
print(f"Input data from: {INPUT_CSV_PATH_WITH_GENDER}")

# Load the dataset with simulated gender
if not os.path.exists(INPUT_CSV_PATH_WITH_GENDER):
    print(f"Error: Input data file not found at '{INPUT_CSV_PATH_WITH_GENDER}'.")
    print("Please ensure the previous gender simulation script ran successfully and saved the file.")
else:
    try:
        df = pd.read_csv(INPUT_CSV_PATH_WITH_GENDER)
        print(f"Loaded data for bias measurement. Shape: {df.shape}")
        print("\nSample of data with simulated gender:")
        print(df[['match_score', 'bert_similarity_score', 'simulated_gender']].head())
    except Exception as e:
        print(f"Error loading data for bias measurement: {e}")

# Create the target variable: 1 for "good match", 0 for "not a good match"
df['is_good_match'] = (df['match_score'] >= GOOD_MATCH_THRESHOLD).astype(int)

# Define features (X), target (y), and sensitive attribute (A)
X = df[['bert_similarity_score']]
y = df['is_good_match']
A = df['simulated_gender'] # Our sensitive attribute

# Split the data into training and testing sets, ensuring sensitive attribute distribution is maintained
X_train, X_test, y_train, y_test, A_train, A_test = train_test_split(
    X, y, A, test_size=0.2, random_state=42
)

print(f"\nData split into training ({len(X_train)} samples) and testing ({len(X_test)} samples).")
print(f"Training sensitive attribute distribution:\n{A_train.value_counts(normalize=True)}")
print(f"Testing sensitive attribute distribution:\n{A_test.value_counts(normalize=True)}")


Configuration loaded for bias measurement.
Input data from: ./data/resume_job_match_with_gender.csv
Loaded data for bias measurement. Shape: (10000, 7)

Sample of data with simulated gender:
   match_score  bert_similarity_score simulated_gender
0            4               0.652394             Male
1            4               0.372719           Female
2            5               0.447902             Male
3            4               0.516451             Male
4            5               0.451312             Male

Data split into training (8000 samples) and testing (2000 samples).
Training sensitive attribute distribution:
simulated_gender
Male      0.500875
Female    0.499125
Name: proportion, dtype: float64
Testing sensitive attribute distribution:
simulated_gender
Male      0.503
Female    0.497
Name: proportion, dtype: float64


In [7]:
# Cell 12 (or new notebook Cell 2): Train Baseline Model and Measure Bias

print("\nTraining baseline Logistic Regression model for bias measurement...")
model_baseline = LogisticRegression(random_state=42)
model_baseline.fit(X_train, y_train)
print("Baseline model training complete.")

# Make predictions on the test set
y_pred_baseline = model_baseline.predict(X_test)

print("\n--- Baseline Model Performance (Overall) ---")
print(classification_report(y_test, y_pred_baseline))

# --- Measure Fairness Metrics ---
print("\n--- Fairness Metrics (Baseline Model) ---")

# Demographic Parity: Selection rate across groups
# A higher ratio (closer to 1) indicates better demographic parity
dp_ratio = demographic_parity_ratio(y_true=y_test, y_pred=y_pred_baseline, sensitive_features=A_test)
print(f"Demographic Parity Ratio: {dp_ratio:.4f} (closer to 1 is fairer)")

# Equalized Odds: True Positive Rate (Recall) across groups
# A higher ratio (closer to 1) indicates better equalized odds
eo_ratio = equalized_odds_ratio(y_true=y_test, y_pred=y_pred_baseline, sensitive_features=A_test)
print(f"Equalized Odds Ratio (True Positive Rate): {eo_ratio:.4f} (closer to 1 is fairer)")

# Detailed metrics per group using MetricFrame
grouped_on_gender = MetricFrame(metrics=accuracy_score,
                                y_true=y_test,
                                y_pred=y_pred_baseline,
                                sensitive_features=A_test)
print("\nAccuracy per simulated gender group:")
print(grouped_on_gender.by_group)

grouped_on_gender_recall = MetricFrame(metrics=recall_score,
                                       y_true=y_test,
                                       y_pred=y_pred_baseline,
                                       sensitive_features=A_test)
print("\nRecall (True Positive Rate) per simulated gender group:")
print(grouped_on_gender_recall.by_group)

print("\nBias measurement complete for the baseline model.")
print("The ratios indicate how close the performance is between the most and least privileged groups.")
print("A ratio of 1 means perfect fairness for that metric.")



Training baseline Logistic Regression model for bias measurement...
Baseline model training complete.

--- Baseline Model Performance (Overall) ---
              precision    recall  f1-score   support

           0       0.67      0.57      0.61       905
           1       0.68      0.77      0.72      1095

    accuracy                           0.68      2000
   macro avg       0.67      0.67      0.67      2000
weighted avg       0.68      0.68      0.67      2000


--- Fairness Metrics (Baseline Model) ---
Demographic Parity Ratio: 0.9607 (closer to 1 is fairer)
Equalized Odds Ratio (True Positive Rate): 0.8289 (closer to 1 is fairer)

Accuracy per simulated gender group:
simulated_gender
Female    0.702213
Male      0.651093
Name: accuracy_score, dtype: float64

Recall (True Positive Rate) per simulated gender group:
simulated_gender
Female    0.783178
Male      0.750000
Name: recall_score, dtype: float64

Bias measurement complete for the baseline model.
The ratios indicate ho

In [8]:
# Cell 11 (or new notebook Cell 1): Setup for Bias Measurement (Revised)

import pandas as pd
from sklearn.model_selection import train_test_split # Still useful for initial model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from fairlearn.metrics import MetricFrame, demographic_parity_ratio, equalized_odds_ratio
import os
import numpy as np

# --- Configuration ---
INPUT_CSV_PATH_WITH_GENDER = "./data/resume_job_match_with_gender.csv"
GOOD_MATCH_THRESHOLD = 4

print("Configuration loaded for bias measurement.")
print(f"Input data from: {INPUT_CSV_PATH_WITH_GENDER}")

# Load the dataset with simulated gender
if not os.path.exists(INPUT_CSV_PATH_WITH_GENDER):
    print(f"Error: Input data file not found at '{INPUT_CSV_PATH_WITH_GENDER}'.")
    print("Please ensure the previous gender simulation script ran successfully and saved the file.")
    exit() # Exit if data not found
try:
    df = pd.read_csv(INPUT_CSV_PATH_WITH_GENDER)
    print(f"Loaded data for bias measurement. Shape: {df.shape}")
    print("\nSample of data with simulated gender:")
    print(df[['match_score', 'bert_similarity_score', 'simulated_gender']].head())
except Exception as e:
    print(f"Error loading data for bias measurement: {e}")
    exit()

# Create the target variable: 1 for "good match", 0 for "not a good match"
df['is_good_match'] = (df['match_score'] >= GOOD_MATCH_THRESHOLD).astype(int)

# Define features (X), target (y), and sensitive attribute (A)
X = df[['bert_similarity_score']]
y = df['is_good_match']
A = df['simulated_gender'] # Our sensitive attribute

# --- IMPORTANT FOR SMALL DATASET DEMO ---
# For a real dissertation, use train_test_split here for X_train, X_test, y_train, y_test, A_train, A_test.
# For this tiny demo, we'll use the full dataset for fairness calculation to avoid empty groups.
# So, X_eval, y_eval, A_eval will represent the data used for *evaluation* of fairness.
# If you were to split, the test set might contain only one group, causing errors.
X_eval = X # Use full X for fairness evaluation
y_eval = y # Use full y for fairness evaluation
A_eval = A # Use full A for fairness evaluation

print("\n--- Using the FULL small dataset for fairness evaluation (DEMO ONLY) ---")
print("   For your actual dissertation, please use a proper train/test split on the full dataset.")
print(f"Evaluation dataset shape: {X_eval.shape}")
print(f"Evaluation sensitive attribute distribution:\n{A_eval.value_counts(normalize=True)}")

# Split the data into training and testing sets for the *model training*
# This is separate from the fairness evaluation set, to ensure model is trained on distinct data
# With 3 samples, this split will be very tiny.
X_train, X_test, y_train, y_test, A_train, A_test = train_test_split(
    X, y, A, test_size=0.2, random_state=42
)
print(f"\nModel training data split: training ({len(X_train)} samples), testing ({len(X_test)} samples).")


# Cell 12 (or new notebook Cell 2): Train Baseline Model and Measure Bias (Revised)

print("\nTraining baseline Logistic Regression model...")
model_baseline = LogisticRegression(random_state=42)
model_baseline.fit(X_train, y_train) # Train on X_train, y_train
print("Baseline model training complete.")

# Make predictions on the *evaluation* set for fairness metrics
y_pred_baseline = model_baseline.predict(X_eval) # Predict on X_eval
print("Predictions made on evaluation set.")

print("\n--- Baseline Model Performance (Overall on Evaluation Set) ---")
# Classification report on y_eval and y_pred_baseline
# Note: For 3 data points, this report won't be very informative.
print(classification_report(y_eval, y_pred_baseline, zero_division=0)) # zero_division=0 to handle cases where a class has no predictions

# --- Measure Fairness Metrics ---
print("\n--- Fairness Metrics (Baseline Model on Evaluation Set) ---")

# Demographic Parity: Selection rate across groups
dp_ratio = demographic_parity_ratio(y_true=y_eval, y_pred=y_pred_baseline, sensitive_features=A_eval.tolist())
print(f"Demographic Parity Ratio: {dp_ratio:.4f} (closer to 1 is fairer)")

# Equalized Odds: True Positive Rate (Recall) across groups
eo_ratio = equalized_odds_ratio(y_true=y_eval, y_pred=y_pred_baseline, sensitive_features=A_eval.tolist())
print(f"Equalized Odds Ratio (True Positive Rate): {eo_ratio:.4f} (closer to 1 is fairer)")

# Detailed metrics per group using MetricFrame
grouped_on_gender = MetricFrame(metrics=accuracy_score,
                                y_true=y_eval,
                                y_pred=y_pred_baseline,
                                sensitive_features=A_eval.tolist())
print("\nAccuracy per simulated gender group:")
print(grouped_on_gender.by_group)

from functools import partial

grouped_on_gender_recall = MetricFrame(metrics=partial(recall_score, zero_division=0),
                                       y_true=y_eval,
                                       y_pred=y_pred_baseline,
                                       sensitive_features=A_eval.tolist())
print("\nRecall (True Positive Rate) per simulated gender group:")
print(grouped_on_gender_recall.by_group)

print("\nBias measurement complete for the baseline model (on evaluation set).")

Configuration loaded for bias measurement.
Input data from: ./data/resume_job_match_with_gender.csv
Loaded data for bias measurement. Shape: (10000, 7)

Sample of data with simulated gender:
   match_score  bert_similarity_score simulated_gender
0            4               0.652394             Male
1            4               0.372719           Female
2            5               0.447902             Male
3            4               0.516451             Male
4            5               0.451312             Male

--- Using the FULL small dataset for fairness evaluation (DEMO ONLY) ---
   For your actual dissertation, please use a proper train/test split on the full dataset.
Evaluation dataset shape: (10000, 1)
Evaluation sensitive attribute distribution:
simulated_gender
Male      0.5013
Female    0.4987
Name: proportion, dtype: float64

Model training data split: training (8000 samples), testing (2000 samples).

Training baseline Logistic Regression model...
Baseline model training



In [10]:
# Cell 13 (or new notebook Cell 3): Apply Bias Mitigation (Post-processing) (Revised)

from fairlearn.postprocessing import ThresholdOptimizer

print("\n--- Applying Bias Mitigation using ThresholdOptimizer ---")

# We need the predicted probabilities from the baseline model to use ThresholdOptimizer
# These probabilities should be from the model predicting on the *evaluation* set (X_eval)
y_pred_proba_baseline = model_baseline.predict_proba(X_eval)[:, 1]

# Initialize ThresholdOptimizer
mitigator = ThresholdOptimizer(
    estimator=model_baseline,
    constraints="equalized_odds", # Using string name for the constraint
    objective='accuracy_score',
    prefit=True # Our model_baseline is already fitted
)

# Check for degenerate groups (groups with only one class in y_eval)
degenerate_groups = []
for group in A_eval.unique():
    labels = y_eval[A_eval == group]
    if len(labels.unique()) < 2:
        degenerate_groups.append(group)

if degenerate_groups:
    print(f"Cannot apply ThresholdOptimizer: degenerate label(s) for group(s): {degenerate_groups}")
    print("Each group in the sensitive attribute must have both positive and negative labels in y_eval.")
    print("Please use a larger dataset or adjust your data split.")
    y_pred_mitigated = None
else:
    # Fit the mitigator on the *evaluation* data (X_eval, y_eval, A_eval)
    # This step determines the optimal thresholds for each group
    # --- Using X_eval, y_eval, A_eval.tolist() ---
    mitigator.fit(X_eval, y_eval, sensitive_features=A_eval.tolist())
    print("ThresholdOptimizer fitted.")
    # Make mitigated predictions using the mitigator on the *evaluation* data
    # --- Using X_eval, A_eval.tolist() ---
    y_pred_mitigated = mitigator.predict(X_eval, sensitive_features=A_eval.tolist())
    print("Mitigated predictions generated.")

    print("\n--- Mitigated Model Performance (Overall on Evaluation Set) ---")
    print(classification_report(y_eval, y_pred_mitigated, zero_division=0))

    # --- Measure Fairness Metrics for Mitigated Model ---
    print("\n--- Fairness Metrics (Mitigated Model on Evaluation Set) ---")

    # --- Using A_eval.tolist() for all Fairlearn metric calls ---
    dp_ratio_mitigated = demographic_parity_ratio(y_true=y_eval, y_pred=y_pred_mitigated, sensitive_features=A_eval.tolist())
    print(f"Demographic Parity Ratio (Mitigated): {dp_ratio_mitigated:.4f} (closer to 1 is fairer)")

    eo_ratio_mitigated = equalized_odds_ratio(y_true=y_eval, y_pred=y_pred_mitigated, sensitive_features=A_eval.tolist())
    print(f"Equalized Odds Ratio (True Positive Rate) (Mitigated): {eo_ratio_mitigated:.4f} (closer to 1 is fairer)")

    # Detailed metrics per group for mitigated model
    grouped_on_gender_mitigated = MetricFrame(metrics=accuracy_score,
                                              y_true=y_eval,
                                              y_pred=y_pred_mitigated,
                                              sensitive_features=A_eval.tolist())
    print("\nAccuracy per simulated gender group (Mitigated):")
    print(grouped_on_gender_mitigated.by_group)

    from functools import partial
    grouped_on_gender_recall_mitigated = MetricFrame(metrics=partial(recall_score, zero_division=0),
                                                     y_true=y_eval,
                                                     y_pred=y_pred_mitigated,
                                                     sensitive_features=A_eval.tolist())
    print("\nRecall (True Positive Rate) per simulated gender group (Mitigated):")
    print(grouped_on_gender_recall_mitigated.by_group)

    print("\nBias mitigation complete. Compare the 'Mitigated' ratios and group metrics to the 'Baseline' ones.")
    print("You should observe an improvement in the fairness metric you chose for mitigation (e.g., Equalized Odds).")
    print("Note: Mitigation often involves a trade-off with overall accuracy.")



--- Applying Bias Mitigation using ThresholdOptimizer ---
ThresholdOptimizer fitted.
Mitigated predictions generated.

--- Mitigated Model Performance (Overall on Evaluation Set) ---
              precision    recall  f1-score   support

           0       0.67      0.59      0.62      4599
           1       0.68      0.75      0.71      5401

    accuracy                           0.68     10000
   macro avg       0.67      0.67      0.67     10000
weighted avg       0.67      0.68      0.67     10000


--- Fairness Metrics (Mitigated Model on Evaluation Set) ---
Demographic Parity Ratio (Mitigated): 0.9935 (closer to 1 is fairer)
Equalized Odds Ratio (True Positive Rate) (Mitigated): 0.9947 (closer to 1 is fairer)

Accuracy per simulated gender group (Mitigated):
sensitive_feature_0
Female    0.676960
Male      0.674047
Name: accuracy_score, dtype: float64

Recall (True Positive Rate) per simulated gender group (Mitigated):
sensitive_feature_0
Female    0.753511
Male      0.749536


