In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModel
import os
from types import SimpleNamespace
import yaml
import multiprocessing as mp
from glob import glob
from torch import nn
from torch.utils.data import Dataset, DataLoader
import collections
import lightgbm


# Set TOKENIZERS_PARALLELISM to "false" up front, before any tokenizer is created.
# NOTE(review): presumably this avoids fork-related warnings/deadlocks, since later
# cells tokenize inside multiprocessing pools and DataLoader workers — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Folder name under ../input that cfg.yaml, the backbone directory and the
# model checkpoint are read from.
EXP_NAME = "efficiency-prize-v2"

# Worker count shared by the multiprocessing pools and the DataLoaders.
N_CORES = mp.cpu_count()

# NOTE(review): KAGGLE_IS_COMPETITION_RERUN is presumably only set by Kaggle
# during the hidden-test rerun — confirm. Without it we score a fixed sample of
# 3000 training essays so that a log loss can be printed at the end.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    data_folder = "train"
    CALC_SCORE = True
    df = pd.read_csv("../input/feedback-prize-effectiveness/train.csv")
    ids = df["essay_id"].unique()
    np.random.seed(1337)  # fixed seed: the same essays are sampled on every run
    val_ids = np.random.choice(ids, size=3000, replace=False)
    df = df[df["essay_id"].isin(val_ids)].reset_index(drop=True)
else:
    data_folder = "test"
    CALC_SCORE = False
    df = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")

In [None]:
def _read_data(essay_id):
    """Return the full text of one essay file.

    The file is ``<essay_id>.txt`` inside the module-level ``data_folder``
    sub-directory of the competition input folder.
    """
    essay_path = f"../input/feedback-prize-effectiveness/{data_folder}/{essay_id}.txt"
    with open(essay_path) as handle:
        return handle.read()

essay_ids = df.essay_id.unique()

# Read all essay files in parallel. The context manager shuts the worker
# processes down once the map has returned, instead of leaving them alive for
# the rest of the notebook.
with mp.Pool(N_CORES) as pool_obj:
    results = pool_obj.map(_read_data, essay_ids)

# Attach the full essay text to every discourse row of that essay.
essay_texts = dict(zip(essay_ids, results))
df["essay_text"] = df.essay_id.map(essay_texts)

In [None]:
# Load the experiment config. Reading inside a `with` block closes the file
# handle as soon as the content has been read.
with open(f"../input/{EXP_NAME}/cfg.yaml") as cfg_file:
    cfg = yaml.safe_load(cfg_file.read())

# Expose the config through attribute access (e.g. cfg.architecture.cache_dir):
# every dict-valued top-level entry becomes a SimpleNamespace, and so does the
# config itself. Only the first nesting level is converted.
for k, v in cfg.items():
    if isinstance(v, dict):
        cfg[k] = SimpleNamespace(**v)
cfg = SimpleNamespace(**cfg)

# Override the backbone location with the directory inside the experiment folder.
cfg.architecture.cache_dir = f"../input/{EXP_NAME}/deberta-v3-large/"

In [None]:
# Tokenizer files are read from the backbone directory set on the config.
tokenizer = AutoTokenizer.from_pretrained(cfg.architecture.cache_dir)

cfg._tokenizer_sep_token = tokenizer.sep_token

# One [START_<type>] and one [END_<type>] marker token is registered per
# discourse type found in df. Registration order is significant: every START
# marker is added (in sorted type order) before any END marker, and
# NLPAllclsTokenPooling later matches markers by the min..max range of each
# id list.
d_types = sorted(df.discourse_type.unique())

start_ids = []
for d_type in d_types:
    start_marker = f"[START_{d_type}]"
    tokenizer.add_tokens([start_marker], special_tokens=True)
    # Index 1 of the encoded marker is used as its id.
    # NOTE(review): assumes encode() puts one leading special token at index 0 — confirm.
    start_ids.append(tokenizer.encode(start_marker)[1])
cfg._tokenizer_start_token_id = start_ids

end_ids = []
for d_type in d_types:
    end_marker = f"[END_{d_type}]"
    tokenizer.add_tokens([end_marker], special_tokens=True)
    end_ids.append(tokenizer.encode(end_marker)[1])
cfg._tokenizer_end_token_id = end_ids

# The newline character is registered as a token of its own as well.
tokenizer.add_tokens(["\n"], special_tokens=True)

# Final vocabulary size; FeedbackModel resizes its embeddings to this value.
cfg._tokenizer_size = len(tokenizer)

In [None]:
# Build one marked-up string per essay: every discourse element is wrapped in
# its [START_<type>] / [END_<type>] markers, and the essay is prefixed with the
# space-joined list of its discourse types followed by the separator token.
grps = df.groupby("essay_id", sort=False)
grp_texts = []

for essay_key in grps.groups:
    g = grps.get_group(essay_key)
    t = g.essay_text.values[0]

    # `end` is where the search for the next discourse element starts.
    end = 0
    for d_text, d_type in zip(g.discourse_text.values, g.discourse_type.values):
        needle = d_text.strip()
        # NOTE(review): str.find returns -1 on a miss, in which case `start`
        # silently becomes end - 1 and the markers land at that position.
        start = end + t[end:].find(needle)
        end = start + len(needle)
        t = "".join(
            [
                t[:start],
                f" [START_{d_type}]  ",
                t[start:end],
                f" [END_{d_type}] ",
                t[end:],
            ]
        )

    type_header = " ".join(g.discourse_type.values)
    grp_texts.append(f"{type_header} {cfg._tokenizer_sep_token} {t}")

In [None]:
def encode(text):
    """Tokenize one marked-up essay string.

    The module-level ``tokenizer`` is called with padding and truncation to
    ``cfg.tokenizer.max_length``. Returns a dict holding row 0 of the
    resulting ``input_ids`` and ``attention_mask`` tensors.
    """
    encodings = tokenizer(
        text,
        max_length=cfg.tokenizer.max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    return {
        "input_ids": encodings["input_ids"][0],
        "attention_mask": encodings["attention_mask"][0],
    }

In [None]:
# Tokenize every essay in parallel; the context manager shuts the worker
# processes down as soon as the map has returned.
with mp.Pool(N_CORES) as pool_obj:
    grp_texts = pool_obj.map(encode, grp_texts)

# Sum of the attention mask = number of attended tokens per essay.
lens = [torch.sum(x["attention_mask"]).item() for x in grp_texts]

# One row per essay, in the same first-appearance order grp_texts was built in.
lens_map = df[["essay_id"]].drop_duplicates()
lens_map["count"] = lens
lens_map["orig_essay_order"] = range(len(lens_map))

# Sort the discourse rows by essay token count so that batches contain essays
# of similar length; orig_order keeps the rows of one essay in their original
# sequence.
df["orig_order"] = range(len(df))
df = df.merge(lens_map, on="essay_id")
df = df.sort_values(["count", "essay_id", "orig_order"], ascending=True).reset_index(drop=True)

# Reorder the encoded essays to follow the new essay order of df.
grp_texts = [grp_texts[i] for i in df["orig_essay_order"].unique()]

In [None]:
class FeedbackDataset(Dataset):
    """Map-style dataset over already-encoded essays.

    Each item is the corresponding element of ``grp_texts``, returned
    unchanged (in this notebook: the dict produced by ``encode``).
    """

    def __init__(self, grp_texts):
        self.grp_texts = grp_texts

    def __len__(self):
        return len(self.grp_texts)

    @staticmethod
    def batch_to_device(batch, device):
        """Recursively move a tensor, or a mapping of tensors, to ``device``.

        Declared as a static method so that it can be called on the class
        (as ``run_predictions`` does) as well as on an instance.

        Returns the moved tensor, or a new dict with every value moved.
        Any other input type yields ``None``.
        """
        if isinstance(batch, torch.Tensor):
            return batch.to(device)
        if isinstance(batch, collections.abc.Mapping):
            return {
                key: FeedbackDataset.batch_to_device(value, device)
                for key, value in batch.items()
            }
        return None

    def __getitem__(self, idx):
        return self.grp_texts[idx]

In [None]:
class NLPAllclsTokenPooling(nn.Module):
    """Pool token embeddings into one feature vector per marked span.

    For every start-marker / end-marker pair in a sequence, the feature is the
    concatenation of the embedding at the start marker, the embedding at the
    end marker, and the mean embedding of the tokens strictly between them —
    three parts, hence ``feat_mult = 3``.
    """

    def __init__(self, dim):
        super().__init__()

        self.dim = dim  # stored only; forward() does not read it
        self.feat_mult = 3

    def forward(self, x, attention_mask, input_ids, cfg):
        """Return a list with one ``(n_spans, 3 * hidden)`` tensor per sequence.

        Args:
            x: token embeddings, indexed as ``x[sequence, position]``.
            attention_mask: accepted for interface compatibility; not read.
            input_ids: token ids, indexed as ``input_ids[sequence]``.
            cfg: provides ``_tokenizer_start_token_id`` and
                ``_tokenizer_end_token_id``; a token counts as a marker when
                its id lies within the min..max range of the respective list.

        The k-th start marker is paired with the k-th end marker.
        """
        start_lo = min(cfg._tokenizer_start_token_id)
        start_hi = max(cfg._tokenizer_start_token_id)
        end_lo = min(cfg._tokenizer_end_token_id)
        end_hi = max(cfg._tokenizer_end_token_id)

        pooled = []
        for j, hidden in enumerate(x):
            ids = input_ids[j]
            starts = torch.where((ids >= start_lo) & (ids <= start_hi))[0]
            ends = torch.where((ids >= end_lo) & (ids <= end_hi))[0]

            spans = []
            for k, s in enumerate(starts):
                e = ends[k]
                inner_mean = hidden[s + 1 : e].mean(dim=0)
                spans.append(torch.cat([hidden[s], hidden[e], inner_mean]))
            pooled.append(torch.stack(spans))

        return pooled


class FeedbackModel(nn.Module):
    """Transformer backbone + span pooling + linear head (3 classes).

    ``forward`` returns one row of logits per marked discourse span, with the
    spans of all sequences in the batch concatenated in order.
    """

    def __init__(self, cfg):
        super().__init__()

        self.cfg = cfg
        self.n_classes = 3
        config = AutoConfig.from_pretrained(cfg.architecture.cache_dir)
        # The backbone is built from the config alone; trained weights are
        # loaded afterwards by the caller through load_state_dict.
        self.backbone = AutoModel.from_config(config)
        self.backbone.pooler = None
        # Make room for the marker tokens that were added to the tokenizer.
        self.backbone.resize_token_embeddings(cfg._tokenizer_size)

        self.pooling = NLPAllclsTokenPooling(
            dim=1
        )  # init pooling and pool over token dimension
        self.head = nn.Linear(
            self.backbone.config.hidden_size * self.pooling.feat_mult, self.n_classes
        )

    def get_features(self, batch):
        """Run the backbone and pool its last hidden state per marked span.

        Reads ``batch["input_ids"]`` and ``batch["attention_mask"]``; returns
        the per-sequence pooled tensors concatenated along dim 0.
        """
        attention_mask = batch["attention_mask"]
        input_ids = batch["input_ids"]

        x = self.backbone(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state

        x = self.pooling(x, attention_mask, input_ids, cfg=self.cfg)
        x = torch.cat(x)

        return x

    def forward(self, batch):
        """Return ``{"logits": ...}`` for a batch dict.

        Columns beyond the last attended position of the whole batch are cut
        off before the backbone runs. The trimming is done on a shallow copy,
        so the caller's ``batch`` dict is left unmodified.
        """
        attention_mask = batch["attention_mask"]
        # Largest column index that is attended in any row, plus one.
        keep = int(torch.where(attention_mask == 1)[1].max()) + 1

        trimmed = dict(batch)
        trimmed["attention_mask"] = attention_mask[:, :keep]
        trimmed["input_ids"] = batch["input_ids"][:, :keep]

        x = self.get_features(trimmed)
        logits = self.head(x)

        return {"logits": logits}

In [None]:
def run_predictions(cfg, grp_texts, bs=1):
    """Run the fine-tuned model over all encoded essays on the GPU.

    Args:
        cfg: config namespace passed on to ``FeedbackModel``.
        grp_texts: encoded essays, wrapped in a ``FeedbackDataset``.
        bs: DataLoader batch size.

    Returns:
        numpy array of softmax probabilities, one row per logit row produced
        by the model, concatenated in DataLoader order (no shuffling).
    """
    device = "cuda"

    model = FeedbackModel(cfg).to(device).eval()
    # NOTE: torch.load unpickles the file — only use with trusted checkpoints.
    checkpoint = torch.load(f"../input/{EXP_NAME}/checkpoint.pth", map_location=device)
    model.load_state_dict(collections.OrderedDict(checkpoint["model"]), strict=True)

    loader = DataLoader(
        FeedbackDataset(grp_texts), shuffle=False, batch_size=bs, num_workers=N_CORES
    )

    batch_probs = []
    with torch.inference_mode():
        for batch in loader:
            batch = FeedbackDataset.batch_to_device(batch, device)

            with torch.cuda.amp.autocast():
                logits = model(batch)["logits"]
                # Softmax is taken in float32 regardless of the autocast dtype.
                probs = logits.float().softmax(dim=1)
                batch_probs.append(probs.detach().cpu().numpy())

    return np.concatenate(batch_probs, axis=0)

In [None]:
# Class probabilities from the transformer model, computed with batch size 16.
# Later cells assign the columns of `pp` straight onto `df`, i.e. its rows are
# assumed to line up with the current row order of `df`.
pp = run_predictions(cfg, grp_texts=grp_texts, bs=16)

In [None]:
# Columns 0, 1, 2 of `pp` are stored as Adequate, Effective, Ineffective.
for col_idx, col_name in enumerate(("Adequate", "Effective", "Ineffective")):
    df[col_name] = pp[:, col_idx]

In [None]:
# Independent copy of the transformer probabilities; used as one member of the
# final weighted blend.
orig_preds = pp.copy()

In [None]:
# Mirror the three probability columns under an "oof_" prefix; the NN stacker
# dataset builds its features from these columns.
label_cols = ["Adequate", "Effective", "Ineffective"]
oof_cols = [f"oof_{label}" for label in label_cols]

for col_idx, oof_col in enumerate(oof_cols):
    df[oof_col] = pp[:, col_idx]

In [None]:
class FeedbackStackerModel(nn.Module):
    """Small MLP that maps stacking features to 3 class logits.

    Layers: n_features -> 256 -> 128 -> 64 (PReLU after each) -> 3. The first
    linear layer is wrapped in weight normalization.
    """

    def __init__(self, n_features):
        super().__init__()

        self.sizes = [256, 128, 64]

        # Layer order and wrapping determine the state-dict keys; checkpoints
        # are loaded with strict=True, so this layout must not change.
        self.features = nn.Sequential(
            nn.utils.weight_norm(nn.Linear(n_features, self.sizes[0])),
            nn.PReLU(),
            nn.Linear(self.sizes[0], self.sizes[1]),
            nn.PReLU(),
            nn.Linear(self.sizes[1], self.sizes[2]),
            nn.PReLU(),
        )
        self.head = nn.Linear(self.sizes[-1], 3)

        # Kept as an attribute; forward() does not compute a loss.
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x, y=None):
        """Return ``{"logits": ...}`` for a feature batch ``x``.

        ``y`` is accepted for call-site compatibility but never read, so it is
        optional: inference callers do not need to supply targets.
        """
        hidden = self.features(x)
        return {"logits": self.head(hidden)}


class FeedbackStackerDataset(Dataset):
    """Feature/target dataset for the NN stacker.

    Works on a private, re-indexed copy of ``df``. Features, in order:
    the module-level ``oof_cols``; ``len`` (number of discourse rows of the
    essay divided by 10); and, per label, the essay-level mean and the
    (essay, discourse_type)-level mean of that label's ``oof_`` column.

    Targets are read from the module-level ``label_cols`` columns.
    NOTE(review): in this notebook those columns hold model predictions rather
    than ground truth, and ``mode`` is stored but not read here — confirm that
    both are intended.
    """

    def __init__(self, df, mode):
        self.mode = mode
        self.label_cols = label_cols.copy()

        self.df = df.copy().reset_index(drop=True)
        frame = self.df

        feature_cols = oof_cols.copy()

        frame["len"] = frame.groupby("essay_id")["discourse_id"].transform("count") / 10
        feature_cols.append("len")

        for label in label_cols:
            oof_name = f"oof_{label}"

            essay_mean_col = f"{oof_name}_mean"
            frame[essay_mean_col] = frame.groupby("essay_id")[oof_name].transform("mean")
            feature_cols.append(essay_mean_col)

            type_mean_col = f"{oof_name}_t_mean"
            by_essay_and_type = frame.groupby(["essay_id", "discourse_type"])
            frame[type_mean_col] = by_essay_and_type[oof_name].transform("mean")
            feature_cols.append(type_mean_col)

        self.feature_cols = feature_cols
        self.num_features = len(feature_cols)

        self.X = frame[feature_cols].values
        self.y = frame[self.label_cols].values

    def __getitem__(self, idx):
        features = torch.FloatTensor(self.X[idx])
        target = torch.FloatTensor(self.y[idx])
        return features, target

    def __len__(self):
        return len(self.df)


# Build the stacker dataset once over a copy of the full frame.
# NOTE(review): `ds` is not referenced again in the visible cells
# (run_nn_stacker builds its own dataset) — possibly a leftover; confirm.
ds = FeedbackStackerDataset(df.copy(), mode="val")


def run_nn_stacker(exp_name, df, BS=64):
    """Average the softmax outputs of all NN stacker checkpoints on the GPU.

    Args:
        exp_name: directory that is searched for ``*.pth`` checkpoints.
        df: frame with the columns ``FeedbackStackerDataset`` expects.
        BS: DataLoader batch size.

    Returns:
        numpy array with one row of averaged class probabilities per row of
        ``df``, in ``df`` row order.

    Raises:
        FileNotFoundError: if ``exp_name`` contains no ``*.pth`` file.
    """
    ds = FeedbackStackerDataset(df.copy(), mode="test")

    # glob returns files in arbitrary order; sorting makes the ensemble order
    # (and therefore the floating-point averaging order) reproducible.
    checkpoints = sorted(glob(f"{exp_name}/*.pth"))
    if not checkpoints:
        raise FileNotFoundError(f"no stacker checkpoints (*.pth) found in {exp_name!r}")

    models = []
    for checkpoint in checkpoints:
        print(f"running model {checkpoint}")

        model = FeedbackStackerModel(n_features=ds.num_features).to("cuda").eval()
        # NOTE: torch.load unpickles the file — only use with trusted checkpoints.
        model_weights = torch.load(checkpoint, map_location="cuda")

        model.load_state_dict(collections.OrderedDict(model_weights), strict=True)
        models.append(model)

    dl = DataLoader(ds, shuffle=False, batch_size=BS, num_workers=N_CORES)

    preds = []
    with torch.no_grad():
        for batch in dl:
            inputs, target = [x.to("cuda") for x in batch]
            # One softmax per model, then the plain mean across models.
            per_model = [
                model(inputs, target)["logits"].float().softmax(dim=1)
                for model in models
            ]
            preds.append(torch.mean(torch.stack(per_model), dim=0).detach().cpu().numpy())

    return np.concatenate(preds, axis=0)


# Averaged class probabilities from the NN stacker checkpoints in the "nn"
# sub-folder of the experiment directory; one member of the final blend.
nn_stacker_preds = run_nn_stacker("../input/efficiency-prize-v2/nn", df, BS=256)

In [None]:
def gen_x(values):
    """Return the 3-bin density histogram of ``values`` over [0, 1].

    Values are clipped to [0.001, 0.999] first. The result is the array of
    three bin densities (the bin edges are discarded).
    """
    clipped = np.clip(values, 0.001, 0.999)
    densities, _edges = np.histogram(clipped, bins=3, range=(0, 1), density=True)
    return densities

In [None]:
# Essay-level features for the LightGBM stacker.
all_groups = []

gb = df.groupby("essay_id", sort=False)

df["n_types"] = gb["discourse_type"].transform(lambda x: x.nunique())
df["mean_Ineffective"] = gb["Ineffective"].transform("mean")

# Group again now that the two columns above exist, so the per-essay frames
# are guaranteed to contain them.
gb = df.groupby("essay_id", sort=False)

# Per essay: 3-bin density histogram of the "Ineffective" probabilities,
# broadcast to every row of the essay. assign() returns a new frame, so the
# frames handed out by the groupby iteration are never written to.
class_name = "Ineffective"
for name, group in gb:
    bin_values = gen_x(group[class_name].values)
    bin_columns = {
        f"{class_name}_bin_{idx}": val for idx, val in enumerate(bin_values)
    }
    all_groups.append(group.assign(**bin_columns))

# Groups are concatenated in first-appearance order (sort=False).
df = pd.concat(all_groups).reset_index(drop=True)

disc_types_mapping = {
    "Lead": 0,
    "Position": 1,
    "Claim": 2,
    "Evidence": 3,
    "Counterclaim": 4,
    "Rebuttal": 5,
    "Concluding Statement": 6,
}

df["len_disc"] = df.discourse_text.str.len()
# From here on discourse_type holds the integer code instead of the name.
df["discourse_type"] = df["discourse_type"].map(disc_types_mapping)
# Number of blank-line separated chunks in the essay text.
df["paragraph_cnt"] = df.essay_text.map(lambda x: len(x.split("\n\n")))

In [None]:
# Feature columns passed to every LightGBM fold model, in this order.
lgb_feature_cols = [
    "discourse_type",
    "Adequate",
    "Effective",
    "Ineffective",
    "n_types",
    "Ineffective_bin_0",
    "Ineffective_bin_1",
    "Ineffective_bin_2",
    "mean_Ineffective",
    "len_disc",
    "paragraph_cnt",
]

# Predict with each of the 5 fold models and average the results.
fold_preds = []
for fold in range(5):
    booster = lightgbm.Booster(model_file=f"../input/efficiency-prize-v2/lightgbm/model_fold_{fold}.txt")
    fold_preds.append(booster.predict(df[lgb_feature_cols]))

lgb_stacker_preds = np.mean(np.stack(fold_preds), axis=0)

In [None]:
# Weighted blend: transformer probabilities count twice, each stacker once.
blend_members = np.stack([orig_preds, lgb_stacker_preds, nn_stacker_preds])
blend_weights = [2, 1, 1]

all_preds = np.average(blend_members, axis=0, weights=blend_weights)

In [None]:
# Overwrite the probability columns with the blended values.
for col_idx, col_name in enumerate(("Adequate", "Effective", "Ineffective")):
    df[col_name] = all_preds[:, col_idx]

# The submission file lists the columns in a different order than label_cols.
submission_cols = ["discourse_id", "Ineffective", "Adequate", "Effective"]
df[submission_cols].to_csv("submission.csv", index=False)

In [None]:
if CALC_SCORE:
    from sklearn.metrics import log_loss

    label_cols = ["Adequate", "Effective", "Ineffective"]

    # One-hot encode the ground-truth labels, columns in label_cols order.
    true_idx = np.asarray(
        [label_cols.index(x) for x in df["discourse_effectiveness"].values],
        dtype=int,
    )
    y = np.zeros_like(all_preds)
    y[np.arange(len(true_idx)), true_idx] = 1

    print(log_loss(y, df[label_cols]))