In [None]:
import csv
import numpy as np
import pandas as pd

from pathlib import Path

from core import ROOT, RunConfig, DataLoader

from model_evaluation import OrderedModelWrapper

from estimator_models import read_predictions

In [None]:
from IPython.utils import io

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
from sklearn.metrics import mean_absolute_error

from statsmodels.miscmodels.ordinal_model import OrderedModel
from mord import LogisticAT, LogisticIT

from surprise import Dataset, Reader
from surprise.model_selection import GridSearchCV as SpGridSearchCV
from surprise.model_selection.split import RepeatedKFold as SpRepeatedKFold
from surprise import SVDpp, KNNWithMeans, KNNWithZScore

In [None]:
# Outcome variables to model. Each name is passed to DataLoader.get_outcome_matrix()
# and is used as the prefix of the predictions CSV written for every model.
Y_vars = ["expertise", "confidence"]

## Classification (clf) & ordinal-regression (ord) models

In [None]:
def clf_ord_fit_predict(Y_var, model_name, model, bids_one_hot, standardize):
    """Fit a classification / ordinal model and export predictions for every cell.

    The features of a (paper, reviewer) cell are its bid and its affinity.
    The model is fitted on the cells where the assignment matrix equals 1 and
    is then used to predict the outcome of *all* cells. The in-sample MAE on
    the fitted cells is printed, and the predictions are written to
    ``{ROOT}/model_imputation/preds/{Y_var}_{model_name}.csv`` with columns
    ``paper_idx`` (row index), ``reviewer_idx`` (column index) and ``pred``.

    Parameters
    ----------
    Y_var : str
        Name of the outcome, forwarded to ``DataLoader.get_outcome_matrix``.
    model_name : str
        Label used in the log line and in the output file name.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. If it is a
        scikit-learn ``GridSearchCV``, its ``best_params_`` are reported.
    bids_one_hot : bool
        One-hot encode the bid values and drop columns that are constant on
        the fitted cells.
    standardize : bool
        Standardize the features with statistics estimated on the fitted cells.

    Returns
    -------
    None
    """
    #
    # Load data
    #
    dl = DataLoader()

    A = dl.get_assignment_matrix()
    Bids = dl.get_bids_matrix(RunConfig())  # default bid values
    Aff = dl.get_affinity_matrix()
    Y = dl.get_outcome_matrix(Y_var).astype(int)  # required by mord, ok with all other

    # Row / column index of every cell, in the same row-major order that
    # flatten() uses below, so index k of each flat array is the same cell.
    paper_idx, reviewer_idx = np.indices(Y.shape)
    paper_idx = paper_idx.ravel()
    reviewer_idx = reviewer_idx.ravel()

    # flatten
    obs_mask = A.flatten() == 1
    Bids = Bids.flatten()
    Aff = Aff.flatten()
    Y = Y.flatten()

    # bids: 1-hot encoding (categories are learned from all cells)
    if bids_one_hot:
        try:
            enc_1hot = OneHotEncoder(sparse_output=False)
        except TypeError:
            # older scikit-learn releases only know the `sparse` keyword
            enc_1hot = OneHotEncoder(sparse=False)
        Bids = enc_1hot.fit_transform(Bids.reshape(-1, 1))

    X = np.column_stack((Bids, Aff))
    X_obs = X[obs_mask]
    Y_obs = Y[obs_mask]

    # standardize (statistics come from the fitted cells only)
    if standardize:
        enc_scaler = StandardScaler()
        enc_scaler.fit(X_obs)
        X_obs = enc_scaler.transform(X_obs)
        X = enc_scaler.transform(X)

    if bids_one_hot:
        # remove constant features: happens when using 1-hot encoding, because
        # a bid category may never occur among the fitted cells
        enc_var = VarianceThreshold(threshold=0)
        enc_var.fit(X_obs)
        X_obs = enc_var.transform(X_obs)
        X = enc_var.transform(X)

    #
    # fit & predict
    #
    model.fit(X_obs, Y_obs)

    Y_obs_hat = model.predict(X_obs)
    Y_hat = model.predict(X)

    best_params = model.best_params_ if isinstance(model, GridSearchCV) else None
    mae_obs = mean_absolute_error(Y_obs, Y_obs_hat)

    print(f">> {Y_var}\t {model_name}\t mae_obs: {mae_obs:.4f}\t best_params: {best_params}")

    #
    # output
    #
    df_y_hat = pd.DataFrame({
        "paper_idx": paper_idx,
        "reviewer_idx": reviewer_idx,
        "pred": np.ravel(Y_hat),
    })

    out_path = Path(f"{ROOT}/model_imputation/preds/{Y_var}_{model_name}.csv")
    # make sure the target directory exists so a fresh checkout does not fail
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df_y_hat.to_csv(out_path, index=False)

In [None]:
# One entry per classification / ordinal model. The keys match the keyword
# parameters of clf_ord_fit_predict (model_name, model, bids_one_hot, standardize).
clf_ord_configs = [
    # -- classifiers
    dict(
        model_name="logistic-reg",
        model=LogisticRegression(),
        bids_one_hot=False,
        standardize=False,
    ),
    dict(
        model_name="logistic-reg-stded",
        model=LogisticRegression(),
        bids_one_hot=False,
        standardize=True,
    ),
    dict(
        model_name="knn-clf",
        # number of neighbours is tuned by repeated stratified 10-fold CV on MAE
        model=GridSearchCV(
            estimator=KNeighborsClassifier(),
            param_grid={"n_neighbors": [1, 3, 5, 7, 10, 15, 20, 25, 30, 40, 50]},
            cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=0),
            scoring="neg_mean_absolute_error",
        ),
        bids_one_hot=False,
        standardize=True,
    ),
    dict(
        # NB: the spelling "rigdge" ends up in the predictions file name and is
        # therefore kept unchanged.
        model_name="rigdge-clf",
        model=RidgeClassifierCV(
            alphas=np.arange(0.1, 10, 0.25),
            store_cv_values=True,
            scoring="neg_mean_absolute_error",
        ),
        bids_one_hot=True,
        standardize=True,
    ),
    # -- ordinal models
    dict(
        model_name="ord-logit",
        model=OrderedModelWrapper(distr="logit"),
        bids_one_hot=True,
        standardize=True,
    ),
    dict(
        model_name="ord-probit",
        model=OrderedModelWrapper(distr="probit"),
        bids_one_hot=True,
        standardize=True,
    ),
    dict(
        model_name="ord-logistic-at",
        model=LogisticAT(),
        bids_one_hot=False,
        standardize=True,
    ),
    dict(
        model_name="ord-logistic-it",
        model=LogisticIT(),
        bids_one_hot=False,
        standardize=True,
    ),
]

In [None]:
#
# ALL CLF & ORD MODELS
#
# Fit every configured model once per outcome variable; each call prints the
# in-sample MAE and writes one predictions CSV.
for cfg in clf_ord_configs:
    for Y_var in Y_vars:
        clf_ord_fit_predict(
            Y_var,
            model_name=cfg["model_name"],
            model=cfg["model"],
            bids_one_hot=cfg["bids_one_hot"],
            standardize=cfg["standardize"],
        )

## Recommender-system (rec) models

In [None]:
def rec_fit_predict(Y_var, model_name, model):
    """Fit a Surprise recommender and export predictions for every cell.

    The cells where the assignment matrix equals 1 are turned into a Surprise
    dataset of (reviewer, paper, outcome) ratings, with the reviewer as the
    Surprise "user" and the paper as the "item". After fitting, the model
    predicts the outcome of *all* (paper, reviewer) cells. The in-sample MAE
    on the fitted cells is printed, and the predictions are written to
    ``{ROOT}/model_imputation/preds/{Y_var}_{model_name}.csv`` with columns
    ``paper_idx`` (row index), ``reviewer_idx`` (column index) and ``pred``.

    Parameters
    ----------
    Y_var : str
        Name of the outcome, forwarded to ``DataLoader.get_outcome_matrix``
        and ``DataLoader.get_outcome_bounds``.
    model_name : str
        Label used in the log line and in the output file name.
    model : Surprise GridSearchCV or Surprise algorithm
        A grid search is fitted on the dataset itself and its
        ``best_params['mae']`` are reported; any other object is fitted on
        the full trainset built from the dataset.

    Returns
    -------
    None
    """
    #
    # Load data
    #
    dl = DataLoader()

    A = dl.get_assignment_matrix()
    Y = dl.get_outcome_matrix(Y_var).astype(int)
    Y_min, Y_max = dl.get_outcome_bounds(Y_var)

    # Row (paper) / column (reviewer) index of every cell, in the same
    # row-major order that flatten() uses below.
    paper_idx, reviewer_idx = np.indices(Y.shape)
    paper_idx = paper_idx.ravel()
    reviewer_idx = reviewer_idx.ravel()

    # Raw ids handed to Surprise are the decimal strings of the indices; the
    # same strings are used for training and for prediction.
    paper_ids = paper_idx.astype(str)
    reviewer_ids = reviewer_idx.astype(str)

    # flatten & fetch observed
    obs_mask = A.flatten() == 1
    Y_obs = Y.flatten()[obs_mask]

    #
    # Make dataset
    #
    # originally: (paper, reviewer)
    # here: (reviewer, paper)
    df_obs = pd.DataFrame({
        "reviewer": reviewer_ids[obs_mask].tolist(),
        "paper": paper_ids[obs_mask].tolist(),
        "Y": Y_obs,
    })

    reader = Reader(rating_scale=(Y_min, Y_max))

    ds_obs = Dataset.load_from_df(df_obs[["reviewer", "paper", "Y"]], reader)

    #
    # fit & predict
    #
    with io.capture_output():  # silence the progress output printed while fitting
        if isinstance(model, SpGridSearchCV):
            model.fit(ds_obs)
        else:
            model.fit(ds_obs.build_full_trainset())

    # Predict every cell once; the observed predictions are a slice of these,
    # so the observed pairs are not predicted a second time.
    Y_hat = np.array([
        model.predict(reviewer, paper).est
        for reviewer, paper in zip(reviewer_ids.tolist(), paper_ids.tolist())
    ])
    Y_obs_hat = Y_hat[obs_mask]

    best_params = model.best_params['mae'] if isinstance(model, SpGridSearchCV) else None

    mae_obs = mean_absolute_error(Y_obs, Y_obs_hat)

    print(f">> {Y_var}\t {model_name}\t mae_obs: {mae_obs:.4f}\t best_params: {best_params}")

    #
    # Output predictions
    #
    # NB: ids were given to Surprise as (reviewer, paper); the file is written
    # back in (paper, reviewer) order.
    df_y_hat = pd.DataFrame({
        "paper_idx": paper_idx,
        "reviewer_idx": reviewer_idx,
        "pred": Y_hat,
    })

    out_path = Path(f"{ROOT}/model_imputation/preds/{Y_var}_{model_name}.csv")
    # make sure the target directory exists so a fresh checkout does not fail
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df_y_hat.to_csv(out_path, index=False)

In [None]:
# One entry per recommender model. Every model is a Surprise grid search that
# is tuned on MAE with repeated 10-fold CV and refitted with the best setting.
rec_configs = [
    dict(
        model_name="rec-svdpp",
        model=SpGridSearchCV(
            SVDpp,
            {"n_factors": [2, 3, 4, 5, 7, 10, 15, 20], "random_state": [0]},
            measures=["mae"],
            cv=SpRepeatedKFold(n_splits=10, n_repeats=10, random_state=0),
            refit=True,
        ),
    ),
    dict(
        model_name="rec-knn-with-means",
        model=SpGridSearchCV(
            KNNWithMeans,
            {"k": [2, 3, 4, 5, 7]},
            measures=["mae"],
            cv=SpRepeatedKFold(n_splits=10, n_repeats=10, random_state=0),
            refit=True,
        ),
    ),
    dict(
        model_name="rec-knn-with-z-score",
        model=SpGridSearchCV(
            KNNWithZScore,
            {"k": [2, 3, 4, 5, 7]},
            measures=["mae"],
            cv=SpRepeatedKFold(n_splits=10, n_repeats=10, random_state=0),
            refit=True,
        ),
    ),
]

In [None]:
#
# ALL REC MODELS
#
# Fit every configured recommender once per outcome variable; each call prints
# the in-sample MAE and writes one predictions CSV.
for cfg in rec_configs:
    for Y_var in Y_vars:
        rec_fit_predict(
            Y_var,
            model_name=cfg["model_name"],
            model=cfg["model"],
        )

## Sanity check predictions

In [None]:
# Names of all fitted models: classification / ordinal models first, then recommenders.
model_names = [cfg["model_name"] for cfg in clf_ord_configs]
model_names += [cfg["model_name"] for cfg in rec_configs]

In [None]:
# Show the collected model names as the cell output.
model_names

In [None]:
# Re-read every predictions file and recompute the MAE on the observed cells.
#
# The observed mask and the true outcomes depend only on the data, not on the
# model, so they are loaded once instead of once per (model, outcome) pair.
dl = DataLoader()
X_prop = dl.get_proposed_assignment_matrix()
X_att, X_new = dl.get_manual_reassignments()
# NOTE(review): presumably "proposed cells that were not flagged in X_att, plus
# cells added in X_new" -- confirm the meaning of X_att / X_new in DataLoader.
X_obs = ((X_prop == 1) & (X_att != 1)) | (X_new == 1)

# true Ys, one matrix per outcome variable
Y_true = {Y_var: dl.get_outcome_matrix(Y_var) for Y_var in Y_vars}

for model_name in model_names:
    for Y_var in Y_vars:
        Y = Y_true[Y_var]

        # read pred Ys
        preds_fpath = f"{ROOT}/model_imputation/preds/{Y_var}_{model_name}.csv"
        Y_preds = read_predictions(preds_fpath)

        # compute MAE
        mae = mean_absolute_error(Y[X_obs], Y_preds[X_obs])

        print(f">> {Y_var}\t {model_name}\t mae: {mae:.4f}")