### Imports & utils

In [1]:
%pip install datasets transformers[torch]

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 21.2.3 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!wget https://raw.githubusercontent.com/akshaybhalotia/yc_company_scraper/main/data/yc_essential_data.csv
!wget https://raw.githubusercontent.com/idocx/BP_MLL_Pytorch/master/bp_mll.py

'wget' is not recognized as an internal or external command,
operable program or batch file.
'wget' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
from __future__ import annotations
import typing
from dataclasses import dataclass, field
import warnings
from contextlib import contextmanager
import itertools
import functools
import logging
from ast import literal_eval

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [61]:
import torch
import torch.nn as nn
import torch.nn.functional
import transformers
import transformers.modeling_outputs
import datasets
import bp_mll

In [6]:
@contextmanager
def localize_globals(*exceptions: str, restore_values: bool = True):
    exceptions: typing.Set[str] = set(exceptions)

    old_globals: typing.Dict[str, typing.Any] = dict(globals())
    allowed: typing.Set[str] = set(old_globals.keys())
    allowed.update(exceptions)

    yield None

    new_globals: typing.Dict[str, typing.Any] = globals()

    for name in tuple(new_globals.keys()):
        if name not in allowed:
            del new_globals[name]

    if not restore_values:
        return

    new_globals.update(
        {k: v for k, v in old_globals.items() if k not in exceptions}
    )

In [7]:
logging.basicConfig(
    level=logging.INFO,
    format="[{levelname}] {message}",
    style="{",
)

### Data preprocessing

In [8]:
data: pd.DataFrame = pd.read_csv("yc_essential_data.csv")

# Limit to the columns we're interested in
data = data[["name", "one_liner", "long_description", "tags"]]

# Convert tags to a list
data["tags"] = data["tags"].apply(literal_eval)
assert isinstance(data.at[0, "tags"], list), "Didn't work!"

# Okay, apparently an empty string makes a nan by default
# Gotta reverse it
data["one_liner"].replace(
    to_replace=np.nan,
    value="",
    inplace=True,
)

data["long_description"].replace(
    to_replace=np.nan,
    value="",
    inplace=True,
)

# Preview the results
data.head()

Unnamed: 0,name,one_liner,long_description,tags
0,Wufoo,Online form builder.,Wufoo is a web application that helps anybody ...,"[SaaS, Productivity]"
1,Project Wedding,,"Finding wedding vendors is hard. In 2007, a co...",[]
2,Clustrix,,Clustrix provides the leading scale-out relati...,[]
3,Inkling,,"Inkling, based in Chicago, Illinois, offers co...",[]
4,Audiobeta,,AudioBeta develops web-based applications that...,[]


In [9]:
# Gather all unique tags
with localize_globals("all_tags"):
    tags_set: set[str] = set(itertools.chain.from_iterable(data["tags"]))

    all_tags: pd.Series = pd.Series(sorted(tags_set))

all_tags

0          3D Printed Foods
1               3D Printing
2                        AI
3              AI Assistant
4      AI-Enhanced Learning
               ...         
324          Women's Health
325     Workflow Automation
326               eLearning
327                 eSports
328                    web3
Length: 329, dtype: object

### Pretrained models

In [10]:
tokenizer: transformers.DistilBertTokenizer = transformers.DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased",
)

# nlp_model: transformers.DistilBertModel = transformers.DistilBertModel.from_pretrained(
#     "distilbert-base-uncased",
# )

In [11]:
MAX_TOKENS: int = 512
EMBEDDING_SIZE: int = 768

### Dataset preparation

In [12]:
with localize_globals("complete_dataset", "col_pad_len"):
    def preprocess(batch: dict[str, typing.Any]) -> dict[str, typing.Any]:
        for column in (
            "name",
            "one_liner",
            "long_description",
        ):
            tmp = tokenizer(
                batch[column],
                truncation=True,
                padding="max_length",
                max_length=MAX_TOKENS,
                return_tensors="pt",
            ).data

            # logging.info(f"!! {tmp['input_ids'].shape}, {tmp['attention_mask'].shape}")
            batch[column] = tmp["input_ids"]
            batch[f"{column}_mask"] = tmp["attention_mask"]

        # TODO: Since this is the target, process it separately?
        if "tags" in batch:
            batch["tags"] = torch.stack([
                torch.tensor(all_tags.apply(tags.__contains__), dtype=torch.float)
                for tags in batch["tags"]
            ])

        return batch

    complete_dataset = (
        datasets.Dataset
        .from_pandas(data)
        # .with_format(None)
        .with_transform(preprocess)
    )

complete_dataset

Dataset({
    features: ['name', 'one_liner', 'long_description', 'tags'],
    num_rows: 4423
})

In [13]:
with localize_globals("train_dataset", "val_dataset", "test_dataset"):
    train_test_split = complete_dataset.train_test_split(test_size=0.2)
    train_dataset = train_test_split["train"]

    test_val_split = train_test_split["test"].train_test_split(test_size=0.3)
    val_dataset = test_val_split["train"]
    test_dataset = test_val_split["test"]

train_dataset, val_dataset, test_dataset

(Dataset({
     features: ['name', 'one_liner', 'long_description', 'tags'],
     num_rows: 3538
 }),
 Dataset({
     features: ['name', 'one_liner', 'long_description', 'tags'],
     num_rows: 619
 }),
 Dataset({
     features: ['name', 'one_liner', 'long_description', 'tags'],
     num_rows: 266
 }))

### Model definition

In [51]:
class NLPWrapperModule(nn.Module):
    """
    Wraps an nlp module and performs the following pre- and postprocessing:
    
    - Takes a dictionary and `**`-unwraps it for the submodule's input
    - Takes `.last_hidden_state` from the submodule's result and returns only it
    
    Note that the wrapper is opaque, hiding the parameters of the underlying
    NLP module to avoid tuning and saving it.
    """
    
    bert: nn.Module
    
    def __init__(self) -> None:
        super().__init__()
        
        bert = transformers.DistilBertModel.from_pretrained(
            "distilbert-base-uncased",
        )
        
        # I can't afford to also tune BERT, nor do I need to
        bert.train(False)
        for param in bert.parameters():
            param.requires_grad = False
        
        # I want it to exclude bert from the perceived parameters
        object.__setattr__(self, "bert", bert)
        
        assert not self._modules, f"Opacity failure: {self._modules}"
    
    def forward(
        self,
        params: typing.Mapping[str, torch.Tensor],
    ) -> torch.Tensor:
        # (batch_size, MAX_TOKENS, EMBEDDING_SIZE)
        result: torch.Tensor = self.bert(**params).last_hidden_state
        
        # Sum the result along axis 1 with weights equal to the attention mask
        # (batch_size, EMBEDDING_SIZE)
        return torch.sum(result * params["attention_mask"].unsqueeze(-1), dim=1)
        
    
    # A hack, but it works
    def to(self, device: torch.device) -> None:
        super().to(device)
        self.bert.to(device)


In [52]:
class MultiNLPModule(nn.Module):
    """
    Takes multiple inputs from named columns of a dataset,
    passes them to BERT, and concatenates the results.
    """
    
    inputs: list[str]
    submodule: NLPWrapperModule
    
    def __init__(
        self,
        inputs: typing.Collection[str],
    ) -> None:
        super().__init__()
        
        self.inputs = list(inputs)
        self.submodule = NLPWrapperModule()
    
    def forward(
        self,
        input_dict: typing.Mapping[str, torch.Tensor],
    ) -> torch.Tensor:
        assert set(input_dict.keys()).issuperset(self.inputs), \
            f"Missing parameters: expected {set(self.inputs)}, got only {set(input_dict.keys())}"

        return torch.cat([
            self.submodule(input_dict[name])
            for name in self.inputs
        ], dim=-1)

In [53]:
class YCTagPredictorConfig(transformers.modeling_utils.PretrainedConfig):
    model_type: typing.ClassVar[str] = "yc_tag_predictor"

    def __init__(self, **kwargs: typing.Any) -> None:
        super().__init__(**kwargs)


class YCTagPredictorModel(transformers.modeling_utils.PreTrainedModel):
    config_class = YCTagPredictorConfig

    def __init__(self, config: YCTagPredictorConfig) -> None:
        super().__init__(config)

        self.model = nn.Sequential(
            MultiNLPModule(inputs=(
                "name",
                "one_liner",
                "long_description",
            )),
            nn.Linear(
                in_features=EMBEDDING_SIZE * 3,
                out_features=len(all_tags),
            ),
            # TODO: Perhaps output another scalar to normalize by at the end?
            nn.Sigmoid(),
        )

    def forward(
        self,
        *,
        name: torch.Tensor,
        name_mask: torch.Tensor,
        one_liner: torch.Tensor,
        one_liner_mask: torch.Tensor,
        long_description: torch.Tensor,
        long_description_mask: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        logits: torch.Tensor = self.model(dict(
            name=dict(
                input_ids=name,
                attention_mask=name_mask,
            ),
            one_liner=dict(
                input_ids=one_liner,
                attention_mask=one_liner_mask,
            ),
            long_description=dict(
                input_ids=long_description,
                attention_mask=long_description_mask,
            ),
        ))
        
        # TODO: Return a dict with more info?
        return logits
    
    def ensure_device(self) -> None:
        """
        Our trick with the NLPModule opacity doesn't always seem to work, so this forces it
        """
        
        self.model[0].submodule.to(self.device)

In [54]:
model = YCTagPredictorModel(
    YCTagPredictorConfig(),
)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\Abel/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.11.3",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at C:\Users\Abel/.cache\huggingface\transformers\9c169103d7e5a73936dd2b627e42851bec0831212b6

In [55]:

with localize_globals():
    model.to(torch.device("cpu"))
    model.ensure_device()
    
    actual_shape = model(**next(train_dataset.iter(1)))[0].shape
    target_shape = next(iter(train_dataset))["tags"].shape

    logging.info(f"{actual_shape=}, {target_shape=}")

    assert actual_shape == target_shape, "Bad model result shape"


[INFO] actual_shape=torch.Size([329]), target_shape=torch.Size([329])


### Model training

In [56]:
training_args = transformers.TrainingArguments(
    output_dir="./training_output",
    logging_dir="./training_logs",
    label_names=["tags"],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    remove_unused_columns=False,
    num_train_epochs=10,
    # warmup_steps=100,
    # weight_decay=0.01,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [57]:
class CustomTrainer(transformers.trainer.Trainer):
    def compute_loss(
        self,
        model: nn.Module,
        inputs: dict[str, typing.Any],
        return_outputs: bool = False,
    ) -> typing.Union[torch.Tensor, typing.Tuple[torch.Tensor, transformers.modeling_outputs.ModelOutput]]:
        # print("!!!", flush=True)
        labels = inputs.pop("tags")
        outputs = model(**inputs)
        loss = bp_mll.BPMLLLoss()(
            outputs, labels,
        )
        return (loss, outputs) if return_outputs else loss

In [58]:
trainer: transformers.trainer.Trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [59]:
model.ensure_device()

In [62]:
trainer.train()

***** Running training *****
  Num examples = 3538
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 4430


  0%|          | 0/4430 [00:00<?, ?it/s]

Saving model checkpoint to ./training_output\checkpoint-500
Configuration saved in ./training_output\checkpoint-500\config.json
Model weights saved in ./training_output\checkpoint-500\pytorch_model.bin


{'loss': 0.6252, 'learning_rate': 4.435665914221219e-05, 'epoch': 1.13}




In [29]:
trainer.evaluate()

Step,Training Loss,Validation Loss
1500,595.1178,
2000,616.3283,
2245,616.3283,709.569946


{'eval_loss': 709.5699462890625}

In [None]:
trainer.train(resume_from_checkpoint=True)