In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Compare in-sample McFadden pseudo-R2 of two logistic models of PR rejection:
# one using only PR-level predictors, one that additionally one-hot encodes
# repo_id (repo "fixed effects").

# --- Load ---
prs = pd.read_csv("/content/aidev_pop_ge500_agent_prs.csv")
feat_path = "/content/aidev_pop_ge500_pr_features.csv"
feat = pd.read_csv(feat_path)

# --- Keep only closed PRs (MERGED/REJECTED) ---
prs = prs[prs["pr_outcome"].isin(["MERGED", "REJECTED"])].copy()

# --- Merge features ---
# Inner join: rows present in only one of the two tables are dropped.
df = prs.merge(feat, on="id", how="inner")

# --- Outcome: 1 = rejected, 0 = merged ---
df["y"] = (df["pr_outcome"] == "REJECTED").astype(int)

# --- Month index from created_at ---
# Unparseable timestamps become NaT (errors="coerce") and are then dropped.
df["created_at"] = pd.to_datetime(df["created_at"], errors="coerce", utc=True)
df = df.dropna(subset=["created_at"])
# year * 12 + month gives a monotone month counter; subtracting the minimum
# turns it into "months since the earliest month in the data".
df["month_key"] = df["created_at"].dt.year * 12 + df["created_at"].dt.month
df["month_index"] = df["month_key"] - df["month_key"].min()

# --- Predictors ---
# Negative diff sizes are clipped to 0 before log1p.
df["log_diff_size"] = np.log1p(df["diff_size"].clip(lower=0))
# NOTE(review): astype(int) raises if these columns contain NaN -- confirm the
# feature file has no missing values here.
df["touches_tests"] = df["touches_tests"].astype(int)
df["touches_docs"] = df["touches_docs"].astype(int)

y = df["y"].values

# Single source of truth for the predictor columns used by both models.
numeric_features = ["month_index", "log_diff_size", "touches_tests", "touches_docs"]
categorical_features = ["repo_id"]


# --- Fit metric helper ---
def _bernoulli_log_likelihood(y_true, p_hat, clip_eps):
    """Return the summed Bernoulli log-likelihood of *y_true* under *p_hat*.

    Both ``p_hat`` and ``1 - p_hat`` are clipped to ``[clip_eps, 1 - clip_eps]``
    before taking logs so that probabilities of exactly 0 or 1 do not yield
    ``-inf``.
    """
    log_p = np.log(np.clip(p_hat, clip_eps, 1 - clip_eps))
    log_q = np.log(np.clip(1 - p_hat, clip_eps, 1 - clip_eps))
    return np.sum(y_true * log_p + (1 - y_true) * log_q)


eps = 1e-15

# Null model: every observation gets the overall rejection rate.
p_null = np.full_like(y, y.mean(), dtype=float)
ll_null = _bernoulli_log_likelihood(y, p_null, eps)

# --------
# Model WITHOUT repo effects (for comparison)
# --------
X_base = df[numeric_features]

# NOTE(review): LogisticRegression applies an L2 penalty by default, so this is
# a penalised fit rather than a plain maximum-likelihood one.
base_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=5000, solver="lbfgs"))
])
base_pipe.fit(X_base, y)
# In-sample predicted probability of rejection (class 1).
p_base = base_pipe.predict_proba(X_base)[:, 1]

ll_base = _bernoulli_log_likelihood(y, p_base, eps)
mcfadden_base = 1 - (ll_base / ll_null)

print("Pseudo-R2 (no repo effects):", round(mcfadden_base, 3))

# --------
# Model WITH repo fixed effects
# --------
X = df[numeric_features + categorical_features].copy()

# NOTE(review): the numeric scaler here uses with_mean=False while the baseline
# model centres its features -- confirm the difference is intentional.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

repo_pipe = Pipeline([
    ("prep", preprocess),
    ("clf", LogisticRegression(
        max_iter=5000,
        solver="saga",
        penalty="l2",
        C=1.0,
        n_jobs=-1
    ))
])

repo_pipe.fit(X, y)
# In-sample predicted probability of rejection (class 1).
p_repo = repo_pipe.predict_proba(X)[:, 1]

ll_repo = _bernoulli_log_likelihood(y, p_repo, eps)
mcfadden_repo = 1 - (ll_repo / ll_null)

print("Pseudo-R2 (with repo effects):", round(mcfadden_repo, 3))


Pseudo-R2 (no repo effects): 0.023
Pseudo-R2 (with repo effects): 0.217
