In [40]:
# Modeling Pipeline for Adjusted Speed Dating Dataset

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, brier_score_loss, RocCurveDisplay

In [41]:
# Read the adjusted speed-dating feature table; the path is relative to the
# notebook's working directory.
file_path = 'adjusted_features_speeddating.csv'
df = pd.read_csv(filepath_or_buffer=file_path)


In [42]:
# Separate features and targets: 'like' and 'match' are the two prediction
# targets, everything else in the frame is treated as a feature.
X = df.drop(columns=['like', 'match'])
y_like = df['like']
y_match = df['match']

# Identify column types. This is done before one-hot encoding, so
# `numeric_cols` names only the original numeric features and never the
# dummy columns generated below.
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Reusable preprocessing pipelines. They are defined here but not applied in
# this cell; the transformations below are done step by step instead.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Every Pipeline step has to be an estimator exposing fit/transform. A bare
# function such as pd.get_dummies is rejected as soon as the pipeline is
# fitted, so the scikit-learn encoder is used for the one-hot step.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply preprocessing: one-hot encode with pandas up front to keep things
# simple. drop_first=True removes one level per categorical column.
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Impute & scale numeric columns (median fill, then zero-mean/unit-variance).
# NOTE(review): both transformers are fit on every row, i.e. before the
# train/test split done in a later cell, so test-set statistics influence the
# training features. Fitting them on the training rows only (for example
# inside a per-model Pipeline) would remove that leak.
imputer = SimpleImputer(strategy='median')
X[numeric_cols] = imputer.fit_transform(X[numeric_cols])
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [43]:
# Split data: 80/20 hold-out with a fixed seed so the partition is repeatable.
# 'like' is left unstratified here.
X_train_like, X_test_like, y_train_like, y_test_like = train_test_split(
    X, y_like, test_size=0.2, random_state=42)

# 'match' is a binary label, so stratify on it to keep the class ratio the
# same in the train and test partitions. These are the same settings the
# classification cell uses, so both cells yield an identical split.
X_train_match, X_test_match, y_train_match, y_test_match = train_test_split(
    X, y_match, test_size=0.2, random_state=42, stratify=y_match)

In [48]:
# --- Classification: Predicting 'match' (Probability of Matching) ---
#
# Fits three class-weighted classifiers on the training split, scores each on
# the held-out split (threshold metrics at 0.5, plus ROC AUC and Brier score on
# the raw probabilities), and prints a small sample of predicted probabilities.
#
# NOTE(review): the output recorded for this cell shows near-perfect scores for
# every model. That usually means some feature in X encodes the outcome, so
# audit X's columns for post-decision fields before trusting these numbers
# (TODO confirm).

# Re-split with stratification to preserve class ratio. The split is derived
# again here so this cell always evaluates on a stratified hold-out.
X_train_match, X_test_match, y_train_match, y_test_match = train_test_split(
    X, y_match, test_size=0.2, random_state=42, stratify=y_match)

classifiers = {
    'LogisticRegression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'RandomForestClassifier': RandomForestClassifier(random_state=42, class_weight='balanced'),
    # probability=True derives probabilities from an internal, shuffled
    # cross-validation; seeding it makes the reported probabilities repeatable.
    'SVC (prob)': SVC(probability=True, class_weight='balanced', random_state=42)
}

# Positive-class probabilities on the test split, keyed by model name, so later
# reporting does not depend on whichever model happened to run last.
match_probas = {}

for name, model in classifiers.items():
    model.fit(X_train_match, y_train_match)
    # Column 1 of predict_proba is the second (larger) class label, which is
    # the "match" class provided the target is coded 0/1.
    probas = model.predict_proba(X_test_match)[:, 1]
    match_probas[name] = probas
    preds = (probas >= 0.5).astype(int)  # default 0.5 threshold

    acc = accuracy_score(y_test_match, preds)
    # zero_division=0 reports 0 (instead of warning) when a metric is undefined,
    # e.g. a model that predicts no positives at this threshold.
    prec = precision_score(y_test_match, preds, zero_division=0)
    rec = recall_score(y_test_match, preds, zero_division=0)
    f1 = f1_score(y_test_match, preds, zero_division=0)
    roc_auc = roc_auc_score(y_test_match, probas)
    brier = brier_score_loss(y_test_match, probas)

    print(f"{name}: Acc={acc:.3f}, Prec={prec:.3f}, Rec={rec:.3f}, F1={f1:.3f}, AUC={roc_auc:.3f}, Brier={brier:.3f}")

# The sample table shows one explicitly named model. The values are predicted
# probabilities of a match, not odds.
sample_model = 'SVC (prob)'
print("Sample predicted odds (first 10 rows):")
print(pd.DataFrame({'Actual': y_test_match.iloc[:10].values,
                    'Match_Probability': match_probas[sample_model][:10]}))

print("Modeling complete.")


LogisticRegression: Acc=1.000, Prec=1.000, Rec=1.000, F1=1.000, AUC=1.000, Brier=0.000
RandomForestClassifier: Acc=0.944, Prec=1.000, Rec=0.659, F1=0.795, AUC=0.999, Brier=0.042
SVC (prob): Acc=0.999, Prec=1.000, Rec=0.996, F1=0.998, AUC=1.000, Brier=0.001
Sample predicted odds (first 10 rows):
   Actual  Match_Probability
0       0       5.636707e-04
1       0       1.000000e-07
2       0       1.055985e-04
3       0       4.739949e-04
4       0       1.482866e-04
5       0       1.000000e-07
6       1       9.999987e-01
7       0       2.990214e-03
8       0       3.810091e-03
9       1       1.000000e+00
Modeling complete.
