In [1]:
import wandb
import numpy as np
import pandas as pd
from scipy.special import expit
from sklearn.model_selection import StratifiedGroupKFold
import transformers
import warnings
import logging
import torch
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import datasets

from coral_pytorch.losses import coral_loss
from coral_pytorch.dataset import levels_from_labelbatch
from coral_pytorch.dataset import proba_to_label

## Init and data load

In [None]:
# Experiment settings, written as a plain mapping so every value can be tuned in one place.
config = {
    # Weights & Biases bookkeeping.
    "project_name": "usppm",
    "entity": "parambharat",
    "group": "usppm-hf",
    # Checkpoint to fine-tune; other candidates: "distilbert-base-uncased", "microsoft/deberta-v3-small".
    "model_name_or_path": "anferico/bert-for-patents",
    # Task framing: "regression", "single_label_classification" or "multi_label_classification".
    "problem_type": "regression",
    # Fraction of the training data held out for validation.
    "val_size": 0.25,
    # Optimisation hyper-parameters.
    "learning_rate": 3e-6,
    "batch_size": 64,
    "weight_decay": 0.01,
    "epochs": 5,
    # Filesystem locations, relative to the notebook.
    "data_dir": "../data",
    "log_dir": "../logs",
    "output_dir": "../outputs",
    # Learning-rate schedule.
    "warmup_ratio": 0.1,
    "lr_scheduler_type": "cosine",
}


In [None]:
# Authenticate first, then open the run that records the experiment settings.
wandb.login()
run = wandb.init(
    config=config,
    entity=config["entity"],
    project=config["project_name"],
)

In [None]:
# Read both CSV splits from the configured data directory and shuffle the rows
# with a fixed seed so the order is reproducible.
train_data = (
    pd.read_csv(f"{run.config.data_dir}/train.csv")
    .sample(frac=1, random_state=42)
)
test_data = (
    pd.read_csv(f"{run.config.data_dir}/test.csv")
    .sample(frac=1, random_state=42)
)

## EDA

In [None]:
def enrich_metadata(data:pd.DataFrame)->pd.DataFrame:
    """Return a copy of ``data`` with a readable context label and word counts.

    The input frame is left untouched, so re-running the calling cell always
    starts from the same data.

    Args:
        data: Frame with string columns ``context``, ``anchor`` and ``target``.

    Returns:
        A new frame with the original columns plus ``context_desc`` (the title
        looked up from the first character of ``context``), ``anchor_length``
        and ``target_length`` (whitespace-separated word counts).

    Raises:
        KeyError: If the first character of a ``context`` value is not one of
            the keys in the mapping below.
    """
    context_mapping = {
            "A": "Human Necessities",
            "B": "Operations and Transport",
            "C": "Chemistry and Metallurgy",
            "D": "Textiles",
            "E": "Fixed Constructions",
            "F": "Mechanical Engineering",
            "G": "Physics",
            "H": "Electricity",
            "Y": "Emerging Cross-Sectional Technologies",
        }
    # ``assign`` builds a new frame instead of writing into the caller's object.
    return data.assign(
        context_desc=data["context"].apply(lambda code: context_mapping[code[0]]),
        anchor_length=data["anchor"].str.split().map(len),
        target_length=data["target"].str.split().map(len),
    )

In [None]:
# Add the derived metadata columns to both splits.
train_data, test_data = (
    enrich_metadata(frame) for frame in (train_data, test_data)
)

# Wrap the enriched frames as W&B tables.
train_table, test_table = (
    wandb.Table(dataframe=frame) for frame in (train_data, test_data)
)

In [None]:
# Log both dataset tables to the active run in one call.
run.log(dict(train_dataset=train_table, test_dataset=test_table))

In [None]:
def log_col_stats(df:pd.DataFrame, col:str):
    """Log the value counts of one column to W&B as a table.

    Args:
        df: Frame holding the column.
        col: Name of the column to summarise.

    Returns:
        Whatever ``wandb.log`` returns for the logged ``"<col>_counts"`` table.
    """
    # Name the index and the counts explicitly instead of renaming whatever
    # ``value_counts`` produced: its default result/index names differ between
    # pandas versions, and renaming by guess can yield two "count" columns.
    stats_df = (
        df[col]
        .value_counts()
        .rename_axis(col)
        .reset_index(name="count")
        .sort_values("count", ascending=False)
    )
    table = wandb.Table(dataframe=stats_df)
    return wandb.log({f"{col}_counts": table})
    
# Log value counts for every training column except the row identifier.
for col in train_data.columns:
    if col == "id":
        continue
    log_col_stats(train_data, col)
        


## Training and Validation

In [None]:
# Distinct anchors that occur in both the training and the test split.
intersection = np.intersect1d(
    train_data["anchor"].unique(),
    test_data["anchor"].unique(),
)
len(intersection)

In [None]:
# The number of folds is the reciprocal of the validation fraction; only the
# first fold is used. Rows are stratified on the scaled score and grouped by
# anchor.
sgkf = StratifiedGroupKFold(n_splits=int(1 / run.config.val_size))
train_index, val_index = next(
    sgkf.split(
        X=train_data,
        y=train_data["score"] * 100,
        groups=train_data.anchor,
    )
)

In [None]:
def convert_df_to_dataset(data_df, stage="train", problem_type="regression"):
    """Build a ``datasets.Dataset`` of prompt strings (and labels) from a frame.

    The caller's frame is never modified; all work happens on a copy.

    Args:
        data_df: Frame with ``anchor``, ``target`` and ``context_desc`` columns,
            plus ``score`` when ``stage == "train"``.
        stage: ``"train"`` keeps the score as a ``label`` column; any other
            value produces an unlabelled dataset.
        problem_type: Controls the label encoding for the train stage.
            ``"regression"`` keeps the raw score,
            ``"single_label_classification"`` stores an integer rank, and
            ``"multi_label_classification"`` stores the level vector returned
            by ``levels_from_labelbatch`` for that rank.

    Returns:
        Dataset with the selected columns and an ``inputs`` prompt column, in
        the same row order as ``data_df`` (the frame index is discarded).
    """
    feature_cols = ["anchor", "target", "context_desc"]
    if stage == "train":
        frame = data_df[feature_cols + ["score"]].copy()
        if problem_type in ("single_label_classification", "multi_label_classification"):
            num_classes = frame["score"].nunique()
            # Assumes scores are evenly spaced in [0, 1]. Round rather than
            # truncate so floating-point error cannot shift a rank down by one.
            ranks = np.rint(frame["score"].to_numpy() * (num_classes - 1)).astype(int)
            if problem_type == "single_label_classification":
                # Plain column replacement so the dtype becomes integer.
                frame["score"] = ranks
            else:
                levels = levels_from_labelbatch(ranks.tolist(), num_classes).numpy().tolist()
                # Reuse the frame's own index: a fresh 0..n-1 index would be
                # aligned by label and pair levels with the wrong rows whenever
                # the frame has been shuffled.
                frame["score"] = pd.Series(levels, index=frame.index)
        frame = frame.rename({"score": "label"}, axis=1)
    else:
        frame = data_df[feature_cols].copy()

    frame["inputs"] = (
        "in the context of " +
        frame["context_desc"]  +
        " how similar is " +
        frame["anchor"] +
        " to " +
        frame["target"]
    )

    dataset = datasets.Dataset.from_pandas(frame, preserve_index=False)
    return dataset

# Encode a copy of the training frame so the frame itself keeps its raw scores.
train_dataset = convert_df_to_dataset(
    train_data.copy(),
    problem_type=run.config.problem_type,
)

# Carve the encoded dataset into the two splits using the fold indices.
experiment_datasets = datasets.DatasetDict({
    split_name: train_dataset.select(row_ids)
    for split_name, row_ids in (("train", train_index), ("validation", val_index))
})

In [None]:
# Load the tokenizer that belongs to the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    run.config.model_name_or_path,
)

def tokenizer_fn(examples):
    """Tokenize the ``inputs`` field of a batch with the notebook-level tokenizer."""
    prompt_texts = examples["inputs"]
    return tokenizer(prompt_texts)


In [None]:
# Text columns that are dropped after tokenisation; "label" is kept.
dropped_cols = ["inputs", "anchor", "target", "context_desc"]

# Tokenize both splits in batches across worker processes.
preprocessed_datasets = experiment_datasets.map(
    tokenizer_fn,
    remove_columns=dropped_cols,
    batched=True,
    num_proc=10,
)

In [None]:
def pearson_corr_reg(eval_pred):
    """Pearson correlation between flattened regression outputs and labels.

    Args:
        eval_pred: Object exposing ``predictions`` and ``label_ids`` arrays.

    Returns:
        ``{"pearson": r}`` where ``r`` is the off-diagonal entry of the
        correlation matrix.
    """
    flat_scores = eval_pred.predictions.flatten()
    corr_matrix = np.corrcoef(eval_pred.label_ids, flat_scores)
    return {'pearson': corr_matrix[0][1]}

def pearson_corr_clf(eval_pred):
    """Pearson correlation between arg-max class indices and integer labels.

    Args:
        eval_pred: Object exposing ``predictions`` (class logits) and
            ``label_ids`` arrays.

    Returns:
        ``{"pearson": r}`` computed on float casts of the predicted and true
        class indices.
    """
    class_ids = eval_pred.predictions.argmax(axis=1).ravel().astype(float)
    true_ids = eval_pred.label_ids.astype(float)
    corr_matrix = np.corrcoef(class_ids, true_ids)
    return {'pearson': corr_matrix[0][1]}

def pearson_corr_ord(eval_pred):
    """Pearson correlation between predicted and true ordinal ranks.

    Args:
        eval_pred: Object exposing ``predictions`` (one logit per rank
            threshold) and ``label_ids`` (expected to be the 0/1 level vectors
            used as training targets).

    Returns:
        ``{"pearson": r}`` computed on the decoded ranks.
    """
    predictions = proba_to_label(torch.Tensor(expit(eval_pred.predictions))).numpy()
    # A level vector is a run of ones followed by zeros, so the row sum is the
    # rank. Locating the first zero with ``argmax`` breaks on an all-ones row
    # (the top rank), where it falls back to index 0.
    labels = np.asarray(eval_pred.label_ids).sum(axis=1)
    return {'pearson': np.corrcoef(predictions, labels)[0][1]}
    

In [None]:
# Training setup; every tunable value comes from the run config.
training_args = TrainingArguments(
    output_dir=run.config.output_dir,
    # Optimiser and schedule.
    learning_rate=run.config.learning_rate,
    weight_decay=run.config.weight_decay,
    warmup_ratio=run.config.warmup_ratio,
    lr_scheduler_type=run.config.lr_scheduler_type,
    # Same batch size for training and evaluation.
    per_device_train_batch_size=run.config.batch_size,
    per_device_eval_batch_size=run.config.batch_size,
    num_train_epochs=run.config.epochs,
    # Mixed precision, evaluation after every epoch, metrics sent to W&B.
    fp16=True,
    evaluation_strategy="epoch",
    report_to="wandb",
)

# Choose the output width of the model head and the matching metric for the
# configured task.
if run.config.problem_type == "regression":
    # A single continuous output.
    run.config.num_labels = 1
    metric_fn = pearson_corr_reg

elif run.config.problem_type == "single_label_classification":
    # One logit per distinct score value.
    run.config.num_labels = train_data.score.nunique()
    metric_fn = pearson_corr_clf

elif run.config.problem_type == "multi_label_classification":
    # One logit per rank threshold, i.e. one fewer than the number of distinct scores.
    run.config.num_labels = train_data.score.nunique()-1
    metric_fn = pearson_corr_ord

else:
    # Fail here rather than later with an undefined or stale ``metric_fn``.
    raise ValueError(f"Unsupported problem_type: {run.config.problem_type!r}")
    
def model_init():
    """Create a fresh sequence-classification model from the run config.

    Returns:
        The model loaded from ``run.config.model_name_or_path`` with the
        configured number of labels and problem type.
    """
    cfg = run.config
    return AutoModelForSequenceClassification.from_pretrained(
        cfg.model_name_or_path,
        problem_type=cfg.problem_type,
        num_labels=cfg.num_labels,
    )


class CoralTrainer(Trainer):
    """Trainer variant that replaces the model's loss with ``coral_loss``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the CORAL loss for one batch.

        Args:
            model: Model to run the forward pass on.
            inputs: Batch mapping; must contain ``"labels"``.
            return_outputs: When true, also return the model outputs.
            num_items_in_batch: Accepted so the override stays callable by
                Trainer versions that pass this keyword; not used here.

        Returns:
            ``loss`` alone, or ``(loss, outputs)`` when ``return_outputs`` is set.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = coral_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

# ``coral_loss`` compares the logits against per-threshold level targets, which
# only the ordinal ("multi_label_classification") label encoding provides.
# Every other problem type trains with the stock Trainer and the model's own loss.
trainer_cls = (
    CoralTrainer
    if run.config.problem_type == "multi_label_classification"
    else Trainer
)

trainer = trainer_cls(
    model_init=model_init,
    args=training_args,
    train_dataset=preprocessed_datasets['train'],
    eval_dataset=preprocessed_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=metric_fn,
)

In [None]:
# Run a W&B-backed search that maximises the "pearson" metric over ten trials.
trainer.hyperparameter_search(
    backend="wandb",
    direction="maximize",
    metric="pearson",
    n_trials=10,
    entity=run.config.entity,
    project=run.config.project_name,
)

## Inference

In [None]:
# Build the unlabelled prompt dataset for the test split.
test_dataset = convert_df_to_dataset(
    test_data,
    "test",
    run.config.problem_type,
)

In [None]:
# Tokenize the test prompts in batches of 100 and drop the raw text columns.
inference_dataset = test_dataset.map(
    tokenizer_fn,
    remove_columns=dropped_cols,
    batched=True,
    batch_size=100,
    num_proc=10,
)

In [None]:
# Show the problem type currently stored in the run config.
run.config.problem_type

In [None]:
# Keep the run's problem type when it is set; otherwise fall back to the local config.
run.config.problem_type = run.config.problem_type or config["problem_type"]

In [None]:
def predict(dataset, problem_type="regression"):
    """Run the trainer on ``dataset`` and decode the outputs into scores.

    Args:
        dataset: Tokenized dataset passed to ``trainer.predict``.
        problem_type: Decoding rule. ``"regression"`` clips the raw output to
            [0, 1]; ``"single_label_classification"`` maps the arg-max class to
            ``class / (num_labels - 1)``; ``"multi_label_classification"``
            counts the thresholds whose sigmoid exceeds 0.5 and divides by the
            number of thresholds.

    Returns:
        One-dimensional array with one score per row.

    Raises:
        ValueError: If ``problem_type`` is not one of the three values above.
    """
    raw_outputs = trainer.predict(dataset).predictions
    if problem_type == "regression":
        predictions = raw_outputs.flatten().clip(0,1)
    elif problem_type == "single_label_classification":
        predictions = np.argmax(raw_outputs, axis=1)
        predictions = predictions / (run.config.num_labels -1)
    elif problem_type == "multi_label_classification":
        # Decode to a rank, then rescale like the other branches so the result
        # is a single score per row instead of a probability matrix.
        ranks = (expit(raw_outputs) > 0.5).sum(axis=1)
        predictions = ranks / raw_outputs.shape[1]
    else:
        raise ValueError(f"Unsupported problem_type: {problem_type!r}")
    return predictions

In [None]:
# Run prediction on the tokenized test set; the result is only displayed as
# the cell output and is not stored in a variable.
predict(inference_dataset, problem_type=run.config.problem_type)

In [None]:
# Compute and keep the predictions: the following cell reads both names, so
# they must be defined when the notebook is run top to bottom.
test_predictions = predict(inference_dataset, problem_type=run.config.problem_type)
val_predictions = predict(preprocessed_datasets["validation"], problem_type=run.config.problem_type)
test_predictions

In [None]:
# Validation rows with their predictions, logged as a W&B table.
val_data = train_data.iloc[val_index].assign(prediction=val_predictions)
val_table = wandb.Table(dataframe=val_data)
run.log({"val_predictions": val_table})

# Attach the test predictions to the test frame and log them too.
test_data["prediction"] = test_predictions
test_table = wandb.Table(dataframe=test_data)
run.log({"test_predictions": test_table})

# Submission file: the id plus the prediction renamed to "score", written
# under the output directory with a generated suffix.
submissions_df = test_data[["id", "prediction"]].rename(columns={"prediction": "score"})
submission_fname = f"{run.config.output_dir}/submission_{wandb.util.generate_id()}.csv"
submissions_df.to_csv(submission_fname, index=False, index_label=False)

In [None]:
# NOTE(review): IPython line magic supplied by wandb; presumably renders the active run inline — confirm.
%wandb