In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve
import numpy as np

In [60]:
# Load and clean data
df = pd.read_csv("final_data.csv").dropna()
X = df.drop(columns=["match","intelligence_o","sinsere_o",])
y = df["match"]

In [61]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Get feature names
feature_names = ['const'] + list(X_train.columns)

# Print mapping from x# to actual feature name
for i, name in enumerate(feature_names):
    print(f"x{i} -> {name}")

x0 -> const
x1 -> like
x2 -> funny_partner
x3 -> funny_o
x4 -> shared_interests_o
x5 -> shared_interests_partner
x6 -> attractive_o
x7 -> attractive_partner
x8 -> guess_prob_liked
x9 -> sincere_partner
x10 -> ambitous_o
x11 -> ambition_partner
x12 -> expected_num_matches


In [62]:
# Helper for optimal threshold
def best_threshold(y_true, y_probs):
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_probs)
    f1s = 2 * (precisions * recalls) / (precisions + recalls + 1e-8)
    best_idx = np.argmax(f1s)
    return {
        'threshold': thresholds[best_idx],
        'precision': precisions[best_idx],
        'recall': recalls[best_idx],
        'f1': f1s[best_idx]
    }

def evaluate_model(name, model, X_eval, y_true):
    y_probs = model.predict_proba(X_eval)[:, 1]
    metrics = best_threshold(y_true, y_probs)
    y_pred = (y_probs > metrics['threshold']).astype(int)
    report = classification_report(y_true, y_pred)
    auc = roc_auc_score(y_true, y_probs)

    print(f"\\n{name}")
    print(f"Threshold: {metrics['threshold']:.3f}")
    print(f"Precision: {metrics['precision']:.3f}")
    print(f"Recall: {metrics['recall']:.3f}")
    print(f"F1 Score: {metrics['f1']:.3f}")
    print(f"ROC AUC: {auc:.3f}")
    print("Classification Report:")
    print(report)

In [63]:
print("\\n========== BEFORE HYPERPARAMETER TUNING ==========")

logreg = LogisticRegression(class_weight='balanced', max_iter=500)
rf = RandomForestClassifier(class_weight='balanced')
svm = SVC(probability=True, class_weight='balanced')

logreg.fit(X_train_scaled, y_train)
rf.fit(X_train, y_train)
svm.fit(X_train_scaled, y_train)

evaluate_model("Logistic Regression (Base)", logreg, X_test_scaled, y_test)
evaluate_model("Random Forest (Base)", rf, X_test, y_test)
evaluate_model("SVM (Base)", svm, X_test_scaled, y_test)


\nLogistic Regression (Base)
Threshold: 0.693
Precision: 0.536
Recall: 0.558
F1 Score: 0.547
ROC AUC: 0.820
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.90      0.91       850
           1       0.53      0.55      0.54       172

    accuracy                           0.84      1022
   macro avg       0.72      0.73      0.72      1022
weighted avg       0.85      0.84      0.84      1022

\nRandom Forest (Base)
Threshold: 0.360
Precision: 0.583
Recall: 0.471
F1 Score: 0.521
ROC AUC: 0.821
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       850
           1       0.59      0.46      0.51       172

    accuracy                           0.85      1022
   macro avg       0.74      0.70      0.71      1022
weighted avg       0.84      0.85      0.85      1022

\nSVM (Base)
Threshold: 0.314
Precision: 0.475
Recall: 0.605
F1 Score: 0.532
ROC AUC: 0.81

In [64]:
# lets get some statisitcal features
import statsmodels.api as sm
import pandas as pd

# Add constant (intercept)
X_train_sm = sm.add_constant(X_train_scaled)

# Fit logistic regression using statsmodels
logit_model = sm.Logit(y_train, X_train_sm)
result = logit_model.fit()

# Show summary with p-values, confidence intervals, etc.
print(result.summary())


Optimization terminated successfully.
         Current function value: 0.338165
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  match   No. Observations:                 4086
Model:                          Logit   Df Residuals:                     4073
Method:                           MLE   Df Model:                           12
Date:                Wed, 11 Jun 2025   Pseudo R-squ.:                  0.2553
Time:                        15:19:57   Log-Likelihood:                -1381.7
converged:                       True   LL-Null:                       -1855.4
Covariance Type:            nonrobust   LLR p-value:                3.913e-195
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -2.2671      0.066    -34.334      0.000      -2.397      -2.138
x1             0.5668      0.

In [65]:
# random forest 
importances = rf.feature_importances_
feature_names = X_train.columns

rf_importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(rf_importance_df)


                     Feature  Importance
0                       like    0.114100
2                    funny_o    0.103220
3         shared_interests_o    0.103157
5               attractive_o    0.102766
6         attractive_partner    0.096776
1              funny_partner    0.085828
11      expected_num_matches    0.079920
4   shared_interests_partner    0.074595
7           guess_prob_liked    0.072169
9                 ambitous_o    0.060623
8            sincere_partner    0.054721
10          ambition_partner    0.052125


In [66]:
print("\\n========== AFTER HYPERPARAMETER TUNING ==========")

# Define parameter grids
param_grid_logreg = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs']
}

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf'],
    'gamma': ['scale', 'auto']
}



In [70]:
# Grid search setup
logreg_grid = GridSearchCV(
    LogisticRegression(class_weight='balanced', max_iter=500),
    param_grid_logreg, cv=5, scoring='recall'
)

rf_grid = GridSearchCV(
    RandomForestClassifier(class_weight='balanced'),
    param_grid_rf, cv=5, scoring='recall'
)

svm_grid = GridSearchCV(
    SVC(probability=True, class_weight='balanced'),
    param_grid_svm, cv=5, scoring='recall'
)


In [71]:
# Fit models
logreg_grid.fit(X_train_scaled, y_train)
rf_grid.fit(X_train, y_train)
svm_grid.fit(X_train_scaled, y_train)


In [72]:
# Evaluate best models
evaluate_model("Logistic Regression (Tuned)", logreg_grid.best_estimator_, X_test_scaled, y_test)
evaluate_model("Random Forest (Tuned)", rf_grid.best_estimator_, X_test, y_test)
evaluate_model("SVM (Tuned)", svm_grid.best_estimator_, X_test_scaled, y_test)

\nLogistic Regression (Tuned)
Threshold: 0.663
Precision: 0.519
Recall: 0.564
F1 Score: 0.540
ROC AUC: 0.821
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       850
           1       0.52      0.56      0.54       172

    accuracy                           0.84      1022
   macro avg       0.71      0.73      0.72      1022
weighted avg       0.84      0.84      0.84      1022

\nRandom Forest (Tuned)
Threshold: 0.501
Precision: 0.484
Recall: 0.599
F1 Score: 0.535
ROC AUC: 0.825
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       850
           1       0.48      0.59      0.53       172

    accuracy                           0.82      1022
   macro avg       0.70      0.73      0.71      1022
weighted avg       0.84      0.82      0.83      1022

\nSVM (Tuned)
Threshold: 0.314
Precision: 0.475
Recall: 0.605
F1 Score: 0.532
ROC AUC: 0

In [73]:
# lets add a threshhold tuning step
def tune_threshold(model, X_val, y_val, model_name):
    y_probs = model.predict_proba(X_val)[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_val, y_probs)

    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
    best_idx = f1_scores.argmax()
    best_threshold = thresholds[best_idx]

    print(f"\n{model_name} (Tuned Threshold)")
    print(f"Best Threshold: {best_threshold:.3f}")
    print(f"Precision: {precisions[best_idx]:.3f}")
    print(f"Recall: {recalls[best_idx]:.3f}")
    print(f"F1 Score: {f1_scores[best_idx]:.3f}")
    
    # Now evaluate using the best threshold
    y_pred = (y_probs >= best_threshold).astype(int)
    print(classification_report(y_val, y_pred))
    return best_threshold

In [74]:
tune_threshold(logreg_grid.best_estimator_, X_test_scaled, y_test, "Logistic Regression")
tune_threshold(rf_grid.best_estimator_, X_test, y_test, "Random Forest")
tune_threshold(svm_grid.best_estimator_, X_test_scaled, y_test, "SVM")



Logistic Regression (Tuned Threshold)
Best Threshold: 0.663
Precision: 0.519
Recall: 0.564
F1 Score: 0.540
              precision    recall  f1-score   support

           0       0.91      0.89      0.90       850
           1       0.52      0.56      0.54       172

    accuracy                           0.84      1022
   macro avg       0.71      0.73      0.72      1022
weighted avg       0.84      0.84      0.84      1022


Random Forest (Tuned Threshold)
Best Threshold: 0.501
Precision: 0.484
Recall: 0.599
F1 Score: 0.535
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       850
           1       0.48      0.60      0.54       172

    accuracy                           0.82      1022
   macro avg       0.70      0.73      0.71      1022
weighted avg       0.84      0.82      0.83      1022


SVM (Tuned Threshold)
Best Threshold: 0.314
Precision: 0.475
Recall: 0.605
F1 Score: 0.532
              precision    recall  f1-score 

0.3138770020652713

In [77]:
from xgboost import XGBClassifier

# Calculate scale_pos_weight = (number of 0s) / (number of 1s)
neg, pos = np.bincount(y_train)
scale = neg / pos

# Train XGBoost model with class imbalance adjustment
xgb = XGBClassifier(
    scale_pos_weight=scale,
    use_label_encoder=False,
    eval_metric='logloss'
)
xgb.fit(X_train_scaled, y_train)

# Tune threshold and evaluate
tune_threshold(xgb, X_test_scaled, y_test, "XGBoost")



XGBoost (Tuned Threshold)
Best Threshold: 0.371
Precision: 0.454
Recall: 0.628
F1 Score: 0.527
              precision    recall  f1-score   support

           0       0.92      0.85      0.88       850
           1       0.45      0.63      0.53       172

    accuracy                           0.81      1022
   macro avg       0.69      0.74      0.70      1022
weighted avg       0.84      0.81      0.82      1022



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0.37077203

In [79]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score

# Step 1: Set up the model
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum()  # Optional
)

# Step 2: Hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1],
}

# Step 3: Use recall_macro as scoring
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='recall_macro',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Step 4: Train the model
grid.fit(X_train_scaled, y_train)

# Step 5: Evaluate with tuned threshold
best_model = grid.best_estimator_
tune_threshold(best_model, X_test_scaled, y_test, "XGBoost (Macro Recall)")


Fitting 5 folds for each of 36 candidates, totalling 180 fits

XGBoost (Macro Recall) (Tuned Threshold)
Best Threshold: 0.674
Precision: 0.524
Recall: 0.506
F1 Score: 0.515
              precision    recall  f1-score   support

           0       0.90      0.91      0.90       850
           1       0.52      0.51      0.51       172

    accuracy                           0.84      1022
   macro avg       0.71      0.71      0.71      1022
weighted avg       0.84      0.84      0.84      1022



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0.6735915