In [None]:
#!pip install transformers

In [1]:
import pandas as pd
import numpy as np
import torch

## Import and clean dataset

In [2]:
df = pd.read_csv("hanlon_matched_titles.csv", index_col=0) 

In [3]:
#!pip install -q transformers datasets

from datasets import Dataset, DatasetDict

# Convert to Huggingface dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.remove_columns(['__index_level_0__'])

# Split into train, validation and test dataset
train_temp_split = dataset.train_test_split(test_size=0.30, seed=42)
train_dataset = train_temp_split['train']
temp_dataset = train_temp_split['test']

validation_test_split = temp_dataset.train_test_split(test_size=0.50, seed=42)  # 15% test, 15% validation
validation_dataset = validation_test_split['train']
test_dataset = validation_test_split['test']

# Create the final DatasetDict
final_dataset = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})

print(final_dataset)

DatasetDict({
    train: Dataset({
        features: ['patent_id', 'Acids and salts, etc', 'Acids, alkalis, etc', 'Advertising', 'Aeronautics', 'Agricultural appliances, farmyard', 'Agricultural appliances, for treatment of land, etc.', 'Air and gas engines', 'Air and gases, compressing, etc', 'Ammunition', 'Animal powered engines', "Artists' instruments", 'Bearings, etc.', 'Bells, etc', 'Beverages', 'Bleaching, etc.', 'Books', 'Boots, etc', 'Boxes, etc', 'Brushing, etc', 'Buildings', 'Casks', 'Cements', 'Centrifugal drying', 'Chains', 'Chimneys', 'Closets', 'Coin-feed apparatus', 'Cooking, etc', 'Cooling', 'Cutlery', 'Cutting', 'Distilling', 'Drains', 'Drying', 'Dynamo electric generators', 'Electric lamps', 'Electric telegraphs', 'Electricity conducting', 'Electricity measuring', 'Electricity regulating', 'Electrolysis', 'Fabrics, dressing', 'Fastenings, dress', 'Fastenings, lock', 'Fencing', 'Filtering', 'Fire extinction', 'Fish', 'Food', 'Fuel, manufacture', 'Furnaces', 'Furniture'

In [7]:
final_dataset['test'].to_csv("hanlon_data_test.csv") 

Creating CSV from Arrow format:   0%|          | 0/18 [00:00<?, ?ba/s]

6811972

In [4]:
# Identify label columns
label_columns = [
    col for col in df.columns
    if col not in ["patent_id", "patent_title"]
]

print("Label columns:", label_columns)


Label columns: ['Acids and salts, etc', 'Acids, alkalis, etc', 'Advertising', 'Aeronautics', 'Agricultural appliances, farmyard', 'Agricultural appliances, for treatment of land, etc.', 'Air and gas engines', 'Air and gases, compressing, etc', 'Ammunition', 'Animal powered engines', "Artists' instruments", 'Bearings, etc.', 'Bells, etc', 'Beverages', 'Bleaching, etc.', 'Books', 'Boots, etc', 'Boxes, etc', 'Brushing, etc', 'Buildings', 'Casks', 'Cements', 'Centrifugal drying', 'Chains', 'Chimneys', 'Closets', 'Coin-feed apparatus', 'Cooking, etc', 'Cooling', 'Cutlery', 'Cutting', 'Distilling', 'Drains', 'Drying', 'Dynamo electric generators', 'Electric lamps', 'Electric telegraphs', 'Electricity conducting', 'Electricity measuring', 'Electricity regulating', 'Electrolysis', 'Fabrics, dressing', 'Fastenings, dress', 'Fastenings, lock', 'Fencing', 'Filtering', 'Fire extinction', 'Fish', 'Food', 'Fuel, manufacture', 'Furnaces', 'Furniture', 'Galvanic batteries', 'Gas distribution', 'Gas ma

## Preprocessing data

In [5]:
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")

# Calculate the token lengths for the Context column
max_token_length = max(df["patent_title"].apply(lambda x: len(tokenizer.tokenize(x))))
print(f"Maximum token length in the Context column: {max_token_length}")


Maximum token length in the Context column: 338


In [6]:
def preprocess_data(examples):
    # Tokenize the texts
    encoding = tokenizer(
        examples["patent_title"],
        padding="max_length",
        truncation=True,
        max_length = 350
    )

    # Extract label data
    labels_batch = {label: examples[label] for label in label_columns}

    # Create a labels matrix
    labels_matrix = np.zeros((len(examples["patent_title"]), len(label_columns)))
    for idx, label in enumerate(label_columns):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()

    return encoding

# Apply the preprocessing to the dataset
encoded_dataset = final_dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=final_dataset["train"].column_names
)

# Inspect a sample from the encoded dataset
example = encoded_dataset['train'][0]
print(example.keys())
print(tokenizer.decode(example['input_ids']))
print([label_columns[idx] for idx, label in enumerate(example['labels']) if label == 1.0])


Map:   0%|          | 0/82651 [00:00<?, ? examples/s]

Map:   0%|          | 0/17711 [00:00<?, ? examples/s]

Map:   0%|          | 0/17711 [00:00<?, ? examples/s]

dict_keys(['input_ids', 'attention_mask', 'labels'])
<s> IMPROVEMENTS IN THE JOINING OF FRENCH HORN AND WHALEBONE BY THE SYSTEM OF DOVETAILING THE ENDS</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

In [7]:
encoded_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 82651
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 17711
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 17711
    })
})

## Define Models

In [15]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Define label mappings if needed
labels = label_columns
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

# Initialize the model RoBERTa
model = AutoModelForSequenceClassification.from_pretrained(
    "matthewleechen/multiclass-classifier-patents",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification"
)

In [9]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="matthewleechen/multilabel-classifier-patents",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    load_best_model_at_end=True,
    num_train_epochs=10,
    weight_decay=0.01,
    fp16=True,
    logging_dir='./logs',
)



In [21]:
from sklearn.metrics import f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.sigmoid(torch.tensor(logits)).numpy()
    predictions = (predictions > 0.5).astype(int)

    # Micro-averaged metrics (standard for multi-label classification)
    f1_micro = f1_score(labels, predictions, average='micro')
    precision_micro = precision_score(labels, predictions, average='micro')
    recall_micro = recall_score(labels, predictions, average='micro')

    # Exact match: Check if predicted set exactly matches the true set
    exact_matches = np.all(predictions == labels, axis=1)
    exact_match_f1 = exact_matches.mean()  # Equivalent to accuracy in this case
    exact_match_precision = exact_match_f1  # Precision = Recall = F1 for exact match

    # Any match: At least one label overlaps between predicted and true
    any_match_list = []
    for i in range(len(labels)):
        true_set = set(np.where(labels[i] == 1)[0])
        pred_set = set(np.where(predictions[i] == 1)[0])
        if true_set & pred_set:  # Check if there's an intersection
            any_match_list.append(1)
        else:
            any_match_list.append(0)
    any_match_score = np.mean(any_match_list)

    # Calculate F1, precision, and recall for any match
    true_positives = sum(any_match_list)
    false_positives = len(any_match_list) - true_positives  # Total minus matches
    false_negatives = len(labels) - true_positives  # Total ground truth minus matches

    any_match_precision = true_positives / (true_positives + false_positives)
    any_match_recall = true_positives / (true_positives + false_negatives)
    any_match_f1 = (
        2 * any_match_precision * any_match_recall / (any_match_precision + any_match_recall)
        if any_match_precision + any_match_recall > 0
        else 0
    )

    return {
        # Micro metrics
        'f1_micro': f1_micro,
        'precision_micro': precision_micro,
        'recall_micro': recall_micro,
        # Exact match metrics
        'exact_match_f1': exact_match_f1,
        'exact_match_precision': exact_match_precision,
        'exact_match_recall': exact_match_f1,
        # Any match metrics
        'any_match_f1': any_match_f1,
        'any_match_precision': any_match_precision,
        'any_match_recall': any_match_recall,
    }


In [22]:
import torch.nn as nn
# Define Focal Loss
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        """
        Focal Loss for multi-label classification.
        Args:
            alpha (float): Balancing factor for rare classes.
            gamma (float): Focusing parameter for hard examples.
            reduction (str): 'mean', 'sum', or 'none'.
        """
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, inputs, targets):
        # Compute the Binary Cross Entropy Loss
        bce_loss = self.bce(inputs, targets)
        probs = torch.sigmoid(inputs)
        probs = torch.clamp(probs, min=1e-6, max=1 - 1e-6)

        # Compute the Focal Loss factor
        pt = torch.where(targets == 1, probs, 1 - probs)
        focal_factor = (1 - pt) ** self.gamma
        loss = self.alpha * focal_factor * bce_loss

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        else:
            return loss

# Subclass the Trainer to use Focal Loss
class FocalLossTrainer(Trainer):
    def __init__(self, *args, alpha=1, gamma=2, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.gamma = gamma
        self.focal_loss = FocalLoss(alpha=self.alpha, gamma=self.gamma).to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.focal_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss


In [23]:
# Update the Trainer initialization
trainer = FocalLossTrainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    alpha=1,
    gamma=2  
)

# Start training
trainer.evaluate()

# Download results model
#import shutil
#shutil.make_archive("RoBERTa-large", 'zip', "./Robertlarge_results")
#from google.colab import files
#files.download("RoBERTa-large.zip")

{'eval_loss': 0.006695271003991365,
 'eval_f1_micro': 0.7000987313947206,
 'eval_precision_micro': 0.8337474487532168,
 'eval_recall_micro': 0.6033779661561185,
 'eval_exact_match_f1': 0.5296143639546045,
 'eval_exact_match_precision': 0.5296143639546045,
 'eval_exact_match_recall': 0.5296143639546045,
 'eval_any_match_f1': 0.9078538761221839,
 'eval_any_match_precision': 0.9078538761221839,
 'eval_any_match_recall': 0.9078538761221839,
 'eval_runtime': 163.2407,
 'eval_samples_per_second': 108.496,
 'eval_steps_per_second': 1.697}

## Evaluate the test dataset

In [None]:
# Evaluate on the test set
test_results = trainer.evaluate(encoded_dataset["test"])
print(test_results)

In [14]:
trainer.push_to_hub("matthewleechen/tech_classes_multilabel_classifier") 

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.79k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/matthewleechen/multiclass-classifier-patents/commit/47b17e3089075d3b794c73214dd7cefb328621c3', commit_message='matthewleechen/tech_classes_multilabel_classifier', commit_description='', oid='47b17e3089075d3b794c73214dd7cefb328621c3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/matthewleechen/multiclass-classifier-patents', endpoint='https://huggingface.co', repo_type='model', repo_id='matthewleechen/multiclass-classifier-patents'), pr_revision=None, pr_num=None)

## Error Analysis

In [24]:
from sklearn.metrics import classification_report
import numpy as np

# Evaluate on the test set
y_true = np.array([example['labels'] for example in encoded_dataset['test']])
y_pred_logits = trainer.predict(encoded_dataset['test']).predictions
y_pred = (torch.sigmoid(torch.tensor(y_pred_logits)).numpy() > 0.5).astype(int)

# Classification report
print(classification_report(y_true, y_pred, target_names=label_columns))

# Check exact matches
exact_matches = np.all(y_pred == y_true, axis=1).mean()
print(f"Exact match accuracy: {exact_matches:.4f}")

# Partial matches
partial_matches = [
    len(set(np.where(y_pred[i] == 1)[0]) & set(np.where(y_true[i] == 1)[0])) > 0
    for i in range(len(y_true))
]
partial_match_accuracy = np.mean(partial_matches)
print(f"Partial match accuracy: {partial_match_accuracy:.4f}")


                                                      precision    recall  f1-score   support

                                Acids and salts, etc       0.83      0.50      0.62       161
                                 Acids, alkalis, etc       0.84      0.57      0.68       386
                                         Advertising       0.83      0.46      0.59       120
                                         Aeronautics       1.00      0.53      0.69        40
                   Agricultural appliances, farmyard       0.87      0.53      0.66       161
Agricultural appliances, for treatment of land, etc.       0.92      0.80      0.86       428
                                 Air and gas engines       0.86      0.55      0.67       159
                     Air and gases, compressing, etc       0.65      0.26      0.38       257
                                          Ammunition       0.90      0.77      0.83       268
                              Animal powered engines       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [34]:
### Inference 
from transformers import pipeline 

model = AutoModelForSequenceClassification.from_pretrained("matthewleechen/multiclass-classifier-patents")
tokenizer = AutoTokenizer.from_pretrained("matthewleechen/multiclass-classifier-patents")

pipe = pipeline(task="text-classification", model=model, device = 0, tokenizer=tokenizer, return_all_scores=True)


In [54]:
from datasets import load_dataset 

dataset_all_years = load_dataset(
    "matthewleechen/300YearsOfBritishPatents",
    data_files="texts.jsonl.gz"
) 
dataset_all_years

texts.jsonl.gz:   0%|          | 0.00/1.69G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patent_id', 'year', 'patent_title', 'full_text', 'word_tokens', 'predicted_BPO_classes'],
        num_rows: 322874
    })
})

In [55]:
ds = dataset_all_years["train"]

In [41]:
def safe_title(example):
    title = example["patent_title"]
    if not title or len(title) < 10:
        example["safe_patent_title"] = ""  # pipeline will still run but we can interpret
    else:
        example["safe_patent_title"] = title
    return example

In [42]:
ds = ds.map(safe_title)
ds

Map:   0%|          | 0/322874 [00:00<?, ? examples/s]

Dataset({
    features: ['patent_id', 'year', 'patent_title', 'full_text', 'word_tokens', 'safe_patent_title'],
    num_rows: 322874
})

In [45]:
from transformers.pipelines.pt_utils import KeyDataset
key_ds = KeyDataset(ds, "safe_patent_title")
key_ds

<transformers.pipelines.pt_utils.KeyDataset at 0x7f0c899f69b0>

In [46]:
key_ds

<transformers.pipelines.pt_utils.KeyDataset at 0x7f0c899f69b0>

In [47]:
from tqdm.auto import tqdm

threshold = 0.5  # your chosen multi-label threshold

# We'll gather the results for each row in "ds" in a list
predicted_labels_list = []

for row_outputs in tqdm(
    pipe(
        key_ds, 
        batch_size=64,       # your desired batch size
        truncation=True, 
        max_length=512
    ),
    total=len(ds)  # so tqdm can show progress
):
    # row_outputs is a list of dicts: e.g. 
    #   [ {'label': 'Acids and salts, etc', 'score': 0.0686}, 
    #     {'label': 'Acids, alkalis, etc', 'score': 0.1598}, ...
    #     {'label': 'Wood', 'score': 0.0905}, 
    #     {'label': 'Writing instruments', 'score': 0.0277} ]
    
    # If we gave the pipeline an empty string, it still returns scores
    # but we said "skip if <10 chars". We'll just interpret that as empty predictions:
    
    if not row_outputs:
        # something is off, or pipeline gave an empty list
        predicted_labels_list.append([])
        continue
    
    # If the input was empty or short, row_outputs will still be 146 dictionaries, 
    # but let's see if we want to override it with an empty result:
    # We can detect that by checking the row_outputs are from an empty string input:
    # A simpler approach is to check if row_outputs are all nearly the same low scores...
    # But let's assume we control it more directly in the next step.

    # Convert pipeline output to a list of predicted labels based on threshold
    row_predictions = []
    for d in row_outputs:
        if d["score"] >= threshold:
            row_predictions.append(d["label"])
    
    predicted_labels_list.append(row_predictions)

  0%|          | 0/322874 [00:00<?, ?it/s]

In [48]:
ds = ds.add_column("predicted_BPO_classes", predicted_labels_list)
ds

Dataset({
    features: ['patent_id', 'year', 'patent_title', 'full_text', 'word_tokens', 'safe_patent_title', 'predicted_BPO_classes'],
    num_rows: 322874
})

In [50]:
ds2 = ds.remove_columns(["safe_patent_title"]) 
ds2

Dataset({
    features: ['patent_id', 'year', 'patent_title', 'full_text', 'word_tokens', 'predicted_BPO_classes'],
    num_rows: 322874
})

In [51]:
ds2.to_json("texts.jsonl", lines=True, orient="records")

Creating json from Arrow format:   0%|          | 0/323 [00:00<?, ?ba/s]

7437627310

In [66]:
out = pipeline(task="text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)

out("IMPROVE- MENTS IN TOBACCO-POUCHES")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[[{'label': 'Acids and salts, etc', 'score': 0.0380391851067543},
  {'label': 'Acids, alkalis, etc', 'score': 0.0566444993019104},
  {'label': 'Advertising', 'score': 0.14558298885822296},
  {'label': 'Aeronautics', 'score': 0.05326671153306961},
  {'label': 'Agricultural appliances, farmyard', 'score': 0.22743389010429382},
  {'label': 'Agricultural appliances, for treatment of land, etc.',
   'score': 0.06270533800125122},
  {'label': 'Air and gas engines', 'score': 0.03687295690178871},
  {'label': 'Air and gases, compressing, etc', 'score': 0.054507628083229065},
  {'label': 'Ammunition', 'score': 0.18135958909988403},
  {'label': 'Animal powered engines', 'score': 0.048613857477903366},
  {'label': "Artists' instruments", 'score': 0.04555508494377136},
  {'label': 'Bearings, etc.', 'score': 0.05624033138155937},
  {'label': 'Bells, etc', 'score': 0.06090704724192619},
  {'label': 'Beverages', 'score': 0.04382931813597679},
  {'label': 'Bleaching, etc.', 'score': 0.0321206450462341

In [52]:
import gzip

# Specify input and output file paths
input_file = 'texts.jsonl'  # Replace with your .jsonl file
output_file = 'texts.jsonl.gz'  # Desired compressed file

# Read and compress the file
with open(input_file, 'rb') as f_in:
    with gzip.open(output_file, 'wb') as f_out:
        f_out.writelines(f_in)

print(f"Compressed {input_file} to {output_file}")


Compressed texts.jsonl to texts.jsonl.gz


In [53]:
from huggingface_hub import HfApi

api = HfApi()

api.upload_file(

    path_or_fileobj="texts.jsonl.gz",

    path_in_repo="texts.jsonl.gz",

    repo_id="matthewleechen/300YearsOfBritishPatents",

    repo_type="dataset",

)

texts.jsonl.gz:   0%|          | 0.00/1.69G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/matthewleechen/300YearsOfBritishPatents/commit/f8e1e529782dea665f0813c8374125eee025c7ca', commit_message='Upload texts.jsonl.gz with huggingface_hub', commit_description='', oid='f8e1e529782dea665f0813c8374125eee025c7ca', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/matthewleechen/300YearsOfBritishPatents', endpoint='https://huggingface.co', repo_type='dataset', repo_id='matthewleechen/300YearsOfBritishPatents'), pr_revision=None, pr_num=None)