In [8]:
# from sklearn.metrics import roc_curve, auc
# import matplotlib.pyplot as plt

# # Extract probabilities and labels for male and female applicants
# male_probs = y_probs[X_test['gender'] == 1]
# female_probs = y_probs[X_test['gender'] == 0]

# # True labels for each gender group
# male_labels = y_test[X_test['gender'] == 1]
# female_labels = y_test[X_test['gender'] == 0]

# # Compute ROC curves for both groups
# fpr_male, tpr_male, _ = roc_curve(male_labels, male_probs)
# fpr_female, tpr_female, _ = roc_curve(female_labels, female_probs)

# # Compute AUC for both groups
# auc_male = auc(fpr_male, tpr_male)
# auc_female = auc(fpr_female, tpr_female)

# # Plot ROC curves for both groups
# plt.figure(figsize=(8, 6))
# plt.plot(fpr_male, tpr_male, color='blue', label=f'Male (AUC = {auc_male:.2f})')
# plt.plot(fpr_female, tpr_female, color='red', label=f'Female (AUC = {auc_female:.2f})')
# plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Random classifier line
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('ROC Curves for Male and Female')
# plt.legend(loc='best')
# plt.show()

In [9]:
# -------------------------------------------------------------------------------
# import packages
import import_ipynb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# -------------------------------------------------------------------------------
# import scripts
from data_processing import hiring_data

In [10]:
# data to X and y
X = hiring_data.drop(columns=['decision', 'Id', 'company', 'ind-exact_study'])
y = hiring_data['decision']

# test train set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Remove gender from training features
X_train_no_gender = X_train.drop(columns=['gender'])
X_test_no_gender = X_test.drop(columns=['gender'])

# Train the logistic regression model WITHOUT gender
model = LogisticRegression(max_iter=4000)
model_standard = model.fit(X_train_no_gender, y_train)


In [12]:
# Get prediction probabilities for the positive class (decision = 1)
y_probs = model_standard.predict_proba(X_test_no_gender)[:, 1]
y_pred = model_standard.predict(X_test_no_gender)


In [13]:
# Create a DataFrame with predictions and actual labels
results = pd.DataFrame({'gender': X_test['gender'], 'prediction': y_pred, 'actual': y_test})

# Count selection rates by gender
selection_rate_by_gender = results.groupby('gender')['prediction'].mean()
print("\nSelection rate by gender:")
print(selection_rate_by_gender)

# Display counts of predictions per gender
print("\nCount of selections by gender:")
print(results.groupby('gender')['prediction'].value_counts())

# Confusion Matrix
from sklearn.metrics import confusion_matrix, classification_report
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Scores (F1, accuracy, precision, recall)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Selection rate by gender:
gender
0    0.114613
1    0.230068
2    0.416667
Name: prediction, dtype: float64

Count of selections by gender:
gender  prediction
0       0             309
        1              40
1       0             338
        1             101
2       0               7
        1               5
Name: count, dtype: int64

Confusion Matrix:
[[505  62]
 [149  84]]

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.89      0.83       567
           1       0.58      0.36      0.44       233

    accuracy                           0.74       800
   macro avg       0.67      0.63      0.64       800
weighted avg       0.71      0.74      0.72       800



In [14]:
import numpy as np
import pandas as pd

# Get feature importance (absolute values of coefficients)
feature_importance = pd.DataFrame({
    'Feature': X_train_no_gender.columns,
    'Coefficient': model_standard.coef_[0]
})

# Sort by absolute coefficient value (importance)
feature_importance['Abs_Coefficient'] = np.abs(feature_importance['Coefficient'])
feature_importance = feature_importance.sort_values(by='Abs_Coefficient', ascending=False)

print("\nMost important features:")
print(feature_importance[['Feature', 'Coefficient']])



Most important features:
                 Feature  Coefficient
8          ind-languages     1.210821
9             ind-degree     0.757908
6  ind-international_exp     0.446773
5    ind-programming_exp    -0.332501
7    ind-entrepeneur_exp    -0.268937
4         ind-debateclub    -0.157348
3   ind-university_grade     0.075180
2                  sport    -0.040483
1            nationality    -0.034473
0                    age     0.010572
