In [1]:
import logging
import os
import re
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from hydra import compose, initialize
from hydra.core.hydra_config import HydraConfig
from hydra.utils import instantiate
from lightning import seed_everything
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm

from src.utils.torch_utils import collate

In [2]:
# Experiment number, read from the EXPERIMENT environment variable ("000" when it is unset).
EXPERIMENT = os.environ.get("EXPERIMENT", "000")

In [3]:
# Guard against a missing experiment id before composing the config.
if EXPERIMENT is None:
    raise ValueError("EXPERIMENT is not set")

# Compose the Hydra config for the selected experiment and register it with
# HydraConfig so code that reads the hydra job config also works in a notebook.
with initialize(version_base=None, config_path="../../configs"):
    CFG = compose(
        config_name="config.yaml",
        overrides=[f"experiment={EXPERIMENT}"],
        return_hydra_config=True,
    )
    HydraConfig.instance().set_config(CFG)

# Root logger at DEBUG level; attach a stream handler only when none is
# registered yet, so re-running the cell does not add a second handler.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
if len(logger.handlers) == 0:
    handler = logging.StreamHandler()
    logger.addHandler(handler)

INPUT_DIR = Path(CFG.paths.input_dir)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

logger.info(f"start experiment={EXPERIMENT} 🚀")
seed_everything(CFG.seed)

start experiment=000 🚀
Seed set to 42


42

### Load data


In [5]:
def remove_brackets_sentence(text: str) -> str:
    """Blank out every line that contains a ``[...]`` bracket pair.

    A line is matched when a ``[`` is followed later on the same line by a
    ``]``. Matched lines are replaced with the empty string; their trailing
    newline is left in place.
    """
    return re.sub(r"^.*\[.*\].*$", "", text, flags=re.MULTILINE)


def remove_title_sentence(text: str) -> str:
    """Drop lines that start with ``title:`` and tidy the remaining newlines.

    A line is removed when, after optional leading spaces or tabs, it begins
    with ``title:`` (case-insensitive). Runs of two or more newlines are then
    collapsed into a single newline, and leading/trailing whitespace is
    stripped from the result.

    Args:
        text: Raw text, possibly spanning several lines.

    Returns:
        The text without title lines.
    """
    # The pattern must be anchored directly at the line start: any literal
    # character placed before `^` makes the expression unmatchable.
    pattern = re.compile(r"^[ \t]*title:.*$", re.IGNORECASE | re.MULTILINE)
    result = re.sub(pattern, "", text)
    result = re.sub(r"\n{2,}", "\n", result)
    return result.strip()


def remove_quotes(text: str) -> str:
    """Return ``text`` with every single and double quote character deleted."""
    quote_chars = r'[\'"]'
    return re.sub(quote_chars, "", text)


def clean_text(text: str) -> str:
    """Run the text-cleaning steps in order and lower-case the result.

    The steps are applied in sequence: ``remove_brackets_sentence``,
    ``remove_title_sentence``, then ``remove_quotes``. Each step receives the
    output of the previous one.

    Args:
        text: Raw text of a single essay.

    Returns:
        The cleaned, lower-cased text.
    """
    cleanse_fns = (
        remove_brackets_sentence,
        remove_title_sentence,
        remove_quotes,
    )
    for fn in cleanse_fns:
        text = fn(text)
    return text.lower()

In [6]:
# Read the test essays and add a cleaned copy of the raw "text" column.
test_csv_path = INPUT_DIR / "test_essays.csv"
test_df = pd.read_csv(test_csv_path)
test_df["cleaned_text"] = test_df["text"].apply(clean_text)

### Inference


In [None]:
def inference_fn(
    net: nn.Module,
    dataloader: DataLoader,
    device: "torch.device | str",
) -> np.ndarray:
    """Run ``net`` over every batch of ``dataloader`` and collect sigmoid outputs.

    The model is moved to ``device`` and switched to eval mode. Each batch is
    passed through ``collate``, its values are moved to ``device``, and the
    model is called on the batch with gradient tracking disabled.

    Args:
        net: Model to evaluate; it is called as ``net(batch)``.
        dataloader: Source of batches. After ``collate`` each batch must
            support ``.items()`` with values that have a ``.to(device)`` method.
        device: Device on which the forward pass is executed.

    Returns:
        The sigmoid of the model outputs for all batches, concatenated along
        the first axis as a NumPy array.

    Raises:
        ValueError: If ``dataloader`` yields no batches (``np.concatenate``
            rejects an empty list).
    """
    net.to(device)
    net.eval()

    outputs = []
    # Gradients are never needed here, so the whole loop runs under no_grad.
    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader)):
            batch = collate(batch)
            # Update in place so the batch keeps whatever type `collate` returned.
            for k, v in batch.items():
                batch[k] = v.to(device)

            batch_outputs = net(batch)
            outputs.append(batch_outputs.cpu().sigmoid().numpy())

    return np.concatenate(outputs)

In [None]:
# Average the test predictions of every fold listed in CFG.train_folds.
base_output_dir = Path(CFG.paths.output_dir)  # store output_dir for later use

test_predictions = 0
try:
    for i_fold in range(CFG.n_splits):
        if i_fold not in CFG.train_folds:
            continue

        # Point output_dir at this fold's directory while the fold is processed.
        CFG.paths.output_dir = str(base_output_dir / f"fold{i_fold}")
        best_weights_path = Path(CFG.paths.output_dir) / "weights" / "best.pth"

        net = instantiate(CFG.lightning.model.net, pretrained=False)
        test_dataloader = instantiate(CFG.lightning.data.test_dataloader)
        # Load onto the CPU first so checkpoints saved from a GPU run can also
        # be read on a CPU-only machine; inference_fn moves the model to DEVICE.
        net.load_state_dict(torch.load(best_weights_path, map_location="cpu"))

        outputs = inference_fn(net=net, dataloader=test_dataloader, device=DEVICE)
        # Each fold contributes an equal share of the final average.
        test_predictions += outputs / len(CFG.train_folds)
finally:
    # Always restore output_dir, even when a fold fails; otherwise re-running
    # this cell would nest "fold{i}" under the previous fold's directory.
    CFG.paths.output_dir = str(base_output_dir)

### Make Submission


In [None]:
# Fill the sample submission with the averaged predictions and write it out.
submission_path = Path(CFG.paths.submission_dir) / "submission.csv"

submission_df = pd.read_csv(INPUT_DIR / "sample_submission.csv")
submission_df["generated"] = test_predictions
submission_df.to_csv(submission_path, index=False)

submission_df