In [1]:
!pip install wandb kaggle datasets transformers coral_pytorch sentencepiece -qqq

In [2]:
import wandb
# Authenticate with Weights & Biases; later cells create a sweep and runs through this session.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mparambharat[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

Authenticating with Kaggle using kaggle.json
Navigate to https://www.kaggle.com. Then go to the Account tab of your user profile and select Create API Token. This will trigger the download of kaggle.json, a file containing your API credentials.

Then run the cell below to upload kaggle.json to your Colab runtime.

In [3]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 64 bytes


In [5]:
# Download and setup the data for use with this colab
!kaggle competitions download -c us-patent-phrase-to-phrase-matching
!mkdir inputs
!mv us-patent-phrase-to-phrase-matching.zip inputs/
!unzip inputs/us-patent-phrase-to-phrase-matching.zip -d inputs/


Downloading us-patent-phrase-to-phrase-matching.zip to /content
  0% 0.00/682k [00:00<?, ?B/s]
100% 682k/682k [00:00<00:00, 60.6MB/s]
mkdir: cannot create directory ‘inputs’: File exists
Archive:  inputs/us-patent-phrase-to-phrase-matching.zip
replace inputs/sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [6]:
import numpy as np
import pandas as pd
from scipy.special import expit
from sklearn.model_selection import StratifiedGroupKFold
from datasets.utils.logging import set_verbosity_error
set_verbosity_error()
import transformers
import warnings
import logging
import torch
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import datasets

from coral_pytorch.losses import coral_loss
from coral_pytorch.dataset import levels_from_labelbatch
from coral_pytorch.dataset import proba_to_label

## Init and data load

In [7]:
def enrich_metadata(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` with derived context and phrase-length columns.

    Three columns are added:

    * ``context_desc``  - title of the section letter that starts each
      ``context`` code (e.g. ``"A47"`` -> ``"Human Necessities"``).
    * ``anchor_length`` - number of whitespace-separated tokens in ``anchor``.
    * ``target_length`` - number of whitespace-separated tokens in ``target``.

    The input frame is not modified; all new columns live on the returned copy,
    so calling this twice on the same frame is harmless.

    Args:
        data: Frame with string columns ``context``, ``anchor`` and ``target``.

    Returns:
        A new DataFrame holding the original columns plus the three above.

    Raises:
        KeyError: If a ``context`` value starts with a letter that is not in
            the section mapping.
    """
    context_mapping = {
        "A": "Human Necessities",
        "B": "Operations and Transport",
        "C": "Chemistry and Metallurgy",
        "D": "Textiles",
        "E": "Fixed Constructions",
        "F": "Mechanical Engineering",
        "G": "Physics",
        "H": "Electricity",
        "Y": "Emerging Cross-Sectional Technologies",
    }
    # Work on a copy so the caller's frame is never mutated as a side effect.
    enriched = data.copy()
    # Only the first character of the context code selects the section title.
    enriched["context_desc"] = enriched["context"].map(
        lambda code: context_mapping[code[0]]
    )
    enriched["anchor_length"] = enriched["anchor"].str.split().str.len()
    enriched["target_length"] = enriched["target"].str.split().str.len()
    return enriched



def log_col_stats(df: pd.DataFrame, col: str, stage: str):
    """Log the value-frequency table of one column to Weights & Biases.

    Args:
        df: Frame that contains ``col``.
        col: Name of the column whose distinct values are counted.
        stage: Prefix for the logged key (the callers in this notebook pass
            ``"train"`` or ``"test"``).

    Returns:
        Whatever ``wandb.log`` returns for the key ``"{stage}_{col}_counts"``.
    """
    # pandas releases differ in how `value_counts()` names its index and values,
    # so the column names are set explicitly: naming the index `col` and the
    # values "count" always yields exactly the two columns [col, "count"].
    stats_df = (
        df[col]
        .value_counts()
        .rename_axis(col)
        .reset_index(name="count")
        .sort_values("count", ascending=False)
    )
    table = wandb.Table(dataframe=stats_df)
    return wandb.log({f"{stage}_{col}_counts": table})
    

    
def convert_df_to_dataset(data_df, stage="train", problem_type="regression"):
    """Build a ``datasets.Dataset`` with a text prompt (and label) per row.

    For ``stage == "train"`` the ``score`` column becomes ``label``:

    * ``"regression"`` (or any other value): the score is kept as is.
    * ``"single_label_classification"``: the score is scaled by
      ``(number of distinct scores - 1)`` and rounded to an integer class id.
    * ``"multi_label_classification"``: the integer class id is expanded into
      a level vector with ``levels_from_labelbatch``.

    For any other ``stage`` no label column is produced.

    In both cases an ``inputs`` column is added that phrases the row as
    "in the context of <context_desc> how similar is <anchor> to <target>".

    Args:
        data_df: Frame with ``anchor``, ``target`` and ``context_desc`` columns,
            plus ``score`` when ``stage == "train"``. It is not modified.
        stage: ``"train"`` to include labels; anything else omits them.
        problem_type: Selects the label encoding described above.

    Returns:
        A ``datasets.Dataset`` built from the prepared frame without its index.
    """
    if stage == "train":
        # Copy the selected columns so neither the caller's frame nor a pandas
        # view is written to below.
        frame = data_df[["anchor", "target", "context_desc", "score"]].copy()

        if problem_type in ("single_label_classification", "multi_label_classification"):
            num_classes = frame["score"].nunique()
            # Round before casting: a bare astype(int) truncates, which would
            # turn e.g. 2.9999999 into class 2.
            class_ids = (frame["score"] * (num_classes - 1)).round().astype(int)

            if problem_type == "single_label_classification":
                # Plain column assignment replaces the column, so the integer
                # dtype is kept (an in-place `.loc[:, ...]` write may cast the
                # values back to the old float dtype).
                frame["score"] = class_ids
            else:
                levels = levels_from_labelbatch(class_ids, num_classes).numpy().tolist()
                # Reuse the frame's own index: a Series with a fresh 0..n-1
                # index would be aligned by label, and on a shuffled frame that
                # hands each row the levels of a different row.
                frame["score"] = pd.Series(levels, index=frame.index)

        frame = frame.rename({"score": "label"}, axis=1)
    else:
        frame = data_df[["anchor", "target", "context_desc"]].copy()

    frame["inputs"] = (
        "in the context of " +
        frame["context_desc"] +
        " how similar is " +
        frame["anchor"] +
        " to " +
        frame["target"]
    )

    dataset = datasets.Dataset.from_pandas(frame, preserve_index=False)
    return dataset


def load_tokenizer_fn(config):
    """Load the tokenizer for ``config.model_name_or_path`` plus a mapping helper.

    Args:
        config: Object exposing ``model_name_or_path``.

    Returns:
        A ``(tokenizer, tokenizer_fn)`` pair. ``tokenizer_fn(examples)`` applies
        the tokenizer to ``examples["inputs"]`` and returns its output.
    """
    checkpoint = config.model_name_or_path
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def tokenizer_fn(examples):
        # Only the assembled prompt text is tokenized.
        prompts = examples["inputs"]
        return tokenizer(prompts)

    return tokenizer, tokenizer_fn

def load_and_log_datasets(config):
    """Load the CSVs, log them to Weights & Biases and return tokenized splits.

    Steps, in order:

    1. Read ``train.csv`` and ``test.csv`` from ``config.data_dir``, shuffle each
       with ``config.random_state`` and add metadata via ``enrich_metadata``.
    2. Log both frames as tables, then one value-count table per column
       (except ``id``) via ``log_col_stats``.
    3. Take the first ``StratifiedGroupKFold`` fold (``int(1 / config.val_size)``
       splits, stratified on ``score * 100``, grouped by ``anchor``) as the
       train/validation split.
    4. Convert the frames with ``convert_df_to_dataset`` and tokenize all splits.

    Args:
        config: Object exposing ``data_dir``, ``random_state``, ``val_size``,
            ``problem_type`` and ``model_name_or_path``.

    Returns:
        ``(processed_datasets, tokenizer)`` where ``processed_datasets`` is a
        ``DatasetDict`` with ``train``, ``validation`` and ``test`` splits.
    """
    train_data = enrich_metadata(
        pd.read_csv(f"{config.data_dir}/train.csv").sample(
            frac=1, random_state=config.random_state
        )
    )
    test_data = enrich_metadata(
        pd.read_csv(f"{config.data_dir}/test.csv").sample(
            frac=1, random_state=config.random_state
        )
    )

    # Full tables first, then per-column value counts for each stage.
    wandb.log(
        {
            "train_dataset": wandb.Table(dataframe=train_data),
            "test_dataset": wandb.Table(dataframe=test_data),
        }
    )
    for stage_name, frame in (("train", train_data), ("test", test_data)):
        for column in frame.columns:
            if column != "id":
                log_col_stats(frame, column, stage=stage_name)

    # Only the first fold is used; indices are positional within train_data.
    n_folds = int(1 / config.val_size)
    splitter = StratifiedGroupKFold(n_splits=n_folds)
    fold_iter = splitter.split(train_data, train_data["score"] * 100, train_data.anchor)
    train_index, val_index = next(fold_iter)

    labelled = convert_df_to_dataset(
        train_data.copy(), stage="train", problem_type=config.problem_type
    )
    unlabelled = convert_df_to_dataset(
        test_data.copy(), stage="test", problem_type=config.problem_type
    )

    experiment_datasets = datasets.DatasetDict({
        "train": labelled.select(train_index),
        "validation": labelled.select(val_index),
        "test": unlabelled,
    })

    # Every raw text column is dropped after tokenization; "label" is kept.
    dropped_cols = ["inputs", "anchor", "target", "context_desc"]
    tokenizer, tokenizer_fn = load_tokenizer_fn(config)
    processed_datasets = experiment_datasets.map(
        tokenizer_fn,
        batched=True,
        remove_columns=dropped_cols,
        num_proc=2,
    )
    return processed_datasets, tokenizer
    
    
    

In [8]:
def pearson_corr_reg(eval_pred):
    """Pearson correlation between regression outputs and reference labels.

    Args:
        eval_pred: Object exposing ``predictions`` (flattened before use) and
            ``label_ids``.

    Returns:
        ``{"pearson": r}`` with ``r`` taken from ``np.corrcoef``.
    """
    flat_scores = eval_pred.predictions.flatten()
    reference = eval_pred.label_ids
    correlation_matrix = np.corrcoef(reference, flat_scores)
    # The off-diagonal entry is the labels-vs-predictions coefficient.
    return {'pearson': correlation_matrix[0][1]}

def pearson_corr_clf(eval_pred):
    """Pearson correlation between arg-max class predictions and class labels.

    Args:
        eval_pred: Object exposing ``predictions`` (one row of class scores per
            example) and ``label_ids``.

    Returns:
        ``{"pearson": r}`` computed on the predicted and reference class ids,
        both cast to float.
    """
    class_scores = eval_pred.predictions
    predicted_ids = np.argmax(class_scores, axis=1).ravel().astype(float)
    reference_ids = eval_pred.label_ids.astype(float)
    correlation_matrix = np.corrcoef(predicted_ids, reference_ids)
    return {'pearson': correlation_matrix[0][1]}

def pearson_corr_ord(eval_pred):
    """Pearson correlation for the ordinal (CORAL) setup.

    Predictions are logits: they go through the logistic function and
    ``proba_to_label`` to obtain one rank per example. The reference labels
    arrive as level vectors (one row per example) and are decoded back to a
    rank before correlating.

    Args:
        eval_pred: Object exposing ``predictions`` (logits, one row per
            example) and ``label_ids`` (level vectors, one row per example).

    Returns:
        ``{"pearson": r}`` between predicted and reference ranks.
    """
    predictions = proba_to_label(torch.Tensor(expit(eval_pred.predictions))).numpy()
    # A level vector for rank k holds k leading ones followed by zeros, so the
    # rank is the row sum. Looking for the position of the first zero is wrong
    # for the top rank: an all-ones row has no zero and argmax then returns 0,
    # silently turning the highest label into the lowest.
    labels = eval_pred.label_ids.sum(axis=1)
    return {'pearson': np.corrcoef(predictions, labels)[0][1]}
    

In [9]:
class CoralTrainer(Trainer):
    """``Trainer`` variant that optimizes ``coral_loss`` on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the CORAL loss for one batch.

        Args:
            model: Model called with ``**inputs``; its output must expose
                ``"logits"``.
            inputs: Batch mapping that includes ``"labels"`` (level vectors).
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted so that ``Trainer`` versions which pass
                this keyword to ``compute_loss`` keep working; it is not used.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # `get` (not `pop`) keeps the labels in `inputs`, so the model still
        # receives them exactly as before.
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = coral_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss


def _config_value(config, key, default):
    """Return ``config.<key>``, or ``default`` when the config lacks that entry."""
    try:
        return getattr(config, key)
    except (AttributeError, KeyError):
        # Config objects differ in which error a missing entry raises.
        return default


def get_trainer(config, tokenizer, dataset):
    """Build the trainer that matches ``config.problem_type``.

    Supported problem types:

    * ``"regression"``: 1 output, ``pearson_corr_reg``, plain ``Trainer``.
    * ``"single_label_classification"``: 5 outputs, ``pearson_corr_clf``,
      plain ``Trainer``.
    * ``"multi_label_classification"``: 4 outputs, ``pearson_corr_ord``,
      ``CoralTrainer``.

    Side effect: ``config.num_labels`` is set to the number of outputs.

    Args:
        config: Object exposing ``output_dir``, ``learning_rate``,
            ``batch_size``, ``epochs``, ``weight_decay``, ``problem_type`` and
            ``model_name_or_path``. ``warmup_ratio`` and ``lr_scheduler_type``
            are honored when present and default to ``0.1`` / ``"cosine"``.
        tokenizer: Tokenizer handed to the trainer.
        dataset: Mapping with ``"train"`` and ``"validation"`` splits.

    Returns:
        A ready-to-train ``Trainer`` (or ``CoralTrainer``).

    Raises:
        ValueError: If ``config.problem_type`` is not one of the three above.
    """
    task_setup = {
        "regression": (1, pearson_corr_reg, Trainer),
        "single_label_classification": (5, pearson_corr_clf, Trainer),
        "multi_label_classification": (4, pearson_corr_ord, CoralTrainer),
    }
    if config.problem_type not in task_setup:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError(
            f"Unsupported problem_type {config.problem_type!r}; "
            f"expected one of {sorted(task_setup)}"
        )
    num_labels, metric_fn, trainer_cls = task_setup[config.problem_type]
    config.num_labels = num_labels

    # NOTE(review): `evaluation_strategy` is the argument name used by the
    # transformers release this notebook ran against; newer releases may expect
    # `eval_strategy` - confirm against the installed version.
    training_args = TrainingArguments(
        config.output_dir,
        learning_rate=config.learning_rate,
        # Read the schedule settings from the config (they used to be
        # hard-coded to these same fallback values).
        warmup_ratio=_config_value(config, "warmup_ratio", 0.1),
        lr_scheduler_type=_config_value(config, "lr_scheduler_type", "cosine"),
        per_device_train_batch_size=config.batch_size,
        per_device_eval_batch_size=config.batch_size,
        num_train_epochs=config.epochs,
        weight_decay=config.weight_decay,
        report_to="wandb",
        # Mixed precision needs a CUDA device; fall back to fp32 without one.
        fp16=torch.cuda.is_available(),
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=100,
    )

    # NOTE(review): a `dropout` value may be present in the config (the sweep
    # varies it) but nothing here passes it to the model - confirm whether it
    # should be wired into the model configuration.
    model = AutoModelForSequenceClassification.from_pretrained(
        config.model_name_or_path,
        num_labels=config.num_labels,
        problem_type=config.problem_type
    )

    trainer = trainer_cls(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['validation'],
        tokenizer=tokenizer,
        compute_metrics=metric_fn,
    )
    return trainer


In [10]:
# Baseline configuration for a single run. `train_fn` uses it as its default
# config, and the sweep cell reads `project_name` from it.
defaults = dict(
    batch_size=64,
    data_dir="inputs",
    entity="parambharat",
    epochs=5,
    learning_rate=3e-6,
    log_dir="logs",
    lr_scheduler_type="cosine",
    model_name_or_path="anferico/bert-for-patents",
    output_dir="outputs",
    problem_type="regression",
    project_name="usppm",
    random_state=42,
    val_size=0.25,
    warmup_ratio=0.1,
    weight_decay=0.01,
)

# Random-search sweep that maximizes the validation Pearson correlation.
# Entries with a single "value" are held fixed; "values" are sampled
# uniformly from the list; "distribution" entries are sampled from a range.
sweep_config = {
    "method": "random",
    "metric": {
        "goal": "maximize",
        "name": "eval/pearson",
    },
    "parameters": {
        "batch_size": {
            "distribution": "q_log_uniform_values",
            "max": 96,
            "min": 32,
            "q": 8,
        },
        "data_dir": {"value": "inputs"},
        # NOTE(review): `dropout` is swept, but no visible cell of this
        # notebook reads `config.dropout` - confirm it reaches the model.
        "dropout": {"values": [0.3, 0.4, 0.5]},
        "epochs": {"values": [4, 6, 8]},
        # The range spans almost two orders of magnitude, so it is sampled on
        # a log scale. A linear "uniform" draw puts nearly every trial close
        # to the upper bound and hardly ever explores the small learning rates.
        "learning_rate": {
            "distribution": "log_uniform_values",
            "max": 5e-4,
            "min": 8e-6,
        },
        "log_dir": {"value": "logs"},
        "model_name_or_path": {
            "values": [
                "microsoft/deberta-v3-small",
                "AI-Growth-Lab/PatentSBERTa",
                "anferico/bert-for-patents",
            ]
        },
        "output_dir": {"value": "outputs"},
        # "multi_label_classification" is currently excluded from the sweep.
        "problem_type": {
            "values": [
                "regression",
                "single_label_classification",
            ]
        },
        "random_state": {"value": 42},
        "val_size": {"value": 0.25},
        "warmup_ratio": {"value": 0.1},
        "weight_decay": {"values": [0.01, 0.03, 0.001, 0.003]},
    },
}

In [11]:
# Register the sweep in the project named by `defaults`; the returned id is
# what the agent cell uses to fetch configurations.
sweep_id = wandb.sweep(sweep=sweep_config, project=defaults["project_name"])

Create sweep with ID: srdgo1s9
Sweep URL: https://wandb.ai/parambharat/usppm/sweeps/srdgo1s9


In [12]:
def train_fn(config=defaults):
    """Run one training job inside a Weights & Biases run.

    The run is opened in the project named by ``defaults["project_name"]`` with
    ``config`` as its configuration. Everything downstream reads the run's own
    config object rather than the ``config`` argument.

    Args:
        config: Configuration passed to ``wandb.init``; defaults to ``defaults``.
    """
    with wandb.init(config=config, project=defaults["project_name"]) as run:
        run_config = run.config
        tokenized_splits, tokenizer = load_and_log_datasets(run_config)
        get_trainer(run_config, tokenizer, tokenized_splits).train()

In [None]:
%%wandb
wandb.agent(sweep_id, train_fn, count=15)

[34m[1mwandb[0m: Agent Starting Run: bf38p0ef with config:
[34m[1mwandb[0m: 	batch_size: 56
[34m[1mwandb[0m: 	data_dir: inputs
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 0.00048613502251464256
[34m[1mwandb[0m: 	log_dir: logs
[34m[1mwandb[0m: 	model_name_or_path: microsoft/deberta-v3-small
[34m[1mwandb[0m: 	output_dir: outputs
[34m[1mwandb[0m: 	problem_type: regression
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	val_size: 0.25
[34m[1mwandb[0m: 	warmup_ratio: 0.1
[34m[1mwandb[0m: 	weight_decay: 0.001


            

#0:   0%|          | 0/3 [00:00<?, ?ba/s]

#1:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/3 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/3 [00:00<?, ?ba/s]

  

#6:   0%|          | 0/3 [00:00<?, ?ba/s]

#7:   0%|          | 0/3 [00:00<?, ?ba/s]

  

#8:   0%|          | 0/3 [00:00<?, ?ba/s]

#9:   0%|          | 0/3 [00:00<?, ?ba/s]

           

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

           

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

  

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

   

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#8:   0%|          | 0/1 [00:00<?, ?ba/s]

 

#9:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/273M [00:00<?, ?B/s]



Step,Training Loss,Validation Loss,Pearson
100,0.0642,0.035712,0.711455
200,0.0517,0.036938,0.695295
300,0.0539,0.041289,0.630158
400,0.0487,0.043573,0.603834
500,0.0488,0.051678,0.564088
600,0.0426,0.046523,0.562244
700,0.0445,0.047391,0.539904
800,0.0416,0.048871,0.597625
