### Imports & utils

In [1]:
%pip install datasets transformers[torch]

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 21.2.3 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!wget https://raw.githubusercontent.com/akshaybhalotia/yc_company_scraper/main/data/yc_essential_data.csv

'wget' is not recognized as an internal or external command,
operable program or batch file.
'wget' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
from __future__ import annotations
import typing
from dataclasses import dataclass, field
import warnings
from contextlib import contextmanager
import itertools
import functools
import logging
from ast import literal_eval

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [61]:
import torch
import torch.nn as nn
import torch.nn.functional
import transformers
import transformers.modeling_outputs
import datasets
import bp_mll

In [6]:
@contextmanager
def localize_globals(*exceptions: str, restore_values: bool = True):
    exceptions: typing.Set[str] = set(exceptions)

    old_globals: typing.Dict[str, typing.Any] = dict(globals())
    allowed: typing.Set[str] = set(old_globals.keys())
    allowed.update(exceptions)

    yield None

    new_globals: typing.Dict[str, typing.Any] = globals()

    for name in tuple(new_globals.keys()):
        if name not in allowed:
            del new_globals[name]

    if not restore_values:
        return

    new_globals.update(
        {k: v for k, v in old_globals.items() if k not in exceptions}
    )

In [7]:
logging.basicConfig(
    level=logging.INFO,
    format="[{levelname}] {message}",
    style="{",
)

### Data preprocessing

In [8]:
data: pd.DataFrame = pd.read_csv("yc_essential_data.csv")

# Limit to the columns we're interested in
data = data[["name", "one_liner", "long_description", "tags"]]

# Convert tags to a list
data["tags"] = data["tags"].apply(literal_eval)
assert isinstance(data.at[0, "tags"], list), "Didn't work!"

# Okay, apparently an empty string makes a nan by default
# Gotta reverse it
data["one_liner"].replace(
    to_replace=np.nan,
    value="",
    inplace=True,
)

data["long_description"].replace(
    to_replace=np.nan,
    value="",
    inplace=True,
)

# Preview the results
data.head()

Unnamed: 0,name,one_liner,long_description,tags
0,Wufoo,Online form builder.,Wufoo is a web application that helps anybody ...,"[SaaS, Productivity]"
1,Project Wedding,,"Finding wedding vendors is hard. In 2007, a co...",[]
2,Clustrix,,Clustrix provides the leading scale-out relati...,[]
3,Inkling,,"Inkling, based in Chicago, Illinois, offers co...",[]
4,Audiobeta,,AudioBeta develops web-based applications that...,[]


In [72]:
data.sample(10)

Unnamed: 0,name,one_liner,long_description,tags
4281,Mojo,Create animated social stories,,"[Consumer, B2B, Design Tools, Video, Subscript..."
3518,Airshared,TBD,,[SaaS]
3031,Nophin,AI Underwriting Copilot For Commercial Real Es...,Nophin helps commercial real estate investors ...,"[Fintech, Proptech]"
3730,Birch Biosciences,We recycle plastic using engineered enzymes,Birch Biosciences engineers enzymes for plasti...,"[Machine Learning, Synthetic Biology, Biotech,..."
3622,Maya Labs,Building self-programming machines,Applied research lab on a mission to build mac...,"[Artificial Intelligence, Generative AI, Robot..."
1263,Binks,"A zero inventory, zero returns factory-to-cons...",Getting clothes tailor-made is very common in ...,"[Sustainable Fashion, Consumer, E-commerce, Su..."
1276,Adla,Department store for e-commerce,Adla curates and delivers clothes to try on at...,[E-commerce]
1967,BotBuilt,We use robots to build houses.,BotBuilt is creating flexible robotic systems ...,"[Robotic Process Automation, Robotics, Constru..."
3125,Velt,Add powerful collaboration features to your pr...,Our mission is to make the web more collaborat...,"[Developer Tools, SaaS, Collaboration, API, Re..."
80,Tenant Turner,We help residential property managers eliminat...,Tenant Turner helps property managers eliminat...,"[SaaS, Smart Locks, Proptech]"


// TODO: Don't use the name, remove it from long_description as well?

In [9]:
# Gather all unique tags
with localize_globals("all_tags"):
    tags_set: set[str] = set(itertools.chain.from_iterable(data["tags"]))

    all_tags: pd.Series = pd.Series(sorted(tags_set))

all_tags

0          3D Printed Foods
1               3D Printing
2                        AI
3              AI Assistant
4      AI-Enhanced Learning
               ...         
324          Women's Health
325     Workflow Automation
326               eLearning
327                 eSports
328                    web3
Length: 329, dtype: object

### Pretrained models

In [10]:
tokenizer: transformers.DistilBertTokenizer = transformers.DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased",
)

# nlp_model: transformers.DistilBertModel = transformers.DistilBertModel.from_pretrained(
#     "distilbert-base-uncased",
# )

In [11]:
MAX_TOKENS: int = 512
EMBEDDING_SIZE: int = 768

### Dataset preparation

In [88]:
with localize_globals("complete_dataset", "col_pad_len"):
    def preprocess(batch: dict[str, typing.Any]) -> dict[str, typing.Any]:
        for column in (
            "name",
            "one_liner",
            "long_description",
        ):
            tmp = tokenizer(
                batch[column],
                truncation=True,
                padding="max_length",
                max_length=MAX_TOKENS,
                return_tensors="pt",
                add_special_tokens=True,
            ).data

            # logging.info(f"!! {tmp['input_ids'].shape}, {tmp['attention_mask'].shape}")
            batch[column] = tmp["input_ids"]
            batch[f"{column}_mask"] = tmp["attention_mask"]

        # TODO: Since this is the target, process it separately?
        if "tags" in batch:
            batch["tags"] = torch.stack([
                torch.tensor(all_tags.apply(tags.__contains__), dtype=torch.float)
                for tags in batch["tags"]
            ])

        return batch

    complete_dataset = (
        datasets.Dataset
        .from_pandas(data)
        # .with_format(None)
        .with_transform(preprocess)
    )

complete_dataset

Dataset({
    features: ['name', 'one_liner', 'long_description', 'tags'],
    num_rows: 4423
})

In [89]:
with localize_globals("train_dataset", "val_dataset", "test_dataset"):
    train_test_split = complete_dataset.train_test_split(test_size=0.2)
    train_dataset = train_test_split["train"]

    test_val_split = train_test_split["test"].train_test_split(test_size=0.3)
    val_dataset = test_val_split["train"]
    test_dataset = test_val_split["test"]

train_dataset, val_dataset, test_dataset

(Dataset({
     features: ['name', 'one_liner', 'long_description', 'tags'],
     num_rows: 3538
 }),
 Dataset({
     features: ['name', 'one_liner', 'long_description', 'tags'],
     num_rows: 619
 }),
 Dataset({
     features: ['name', 'one_liner', 'long_description', 'tags'],
     num_rows: 266
 }))

### Model definition

In [90]:
class NLPWrapperModule(nn.Module):
    """
    Wraps an nlp module and performs the following pre- and postprocessing:
    
    - Takes a dictionary and `**`-unwraps it for the submodule's input
    - Takes `.last_hidden_state` from the submodule's result and returns only it
    
    Note that the wrapper is opaque, hiding the parameters of the underlying
    NLP module to avoid tuning and saving it.
    """
    
    bert: nn.Module
    
    def __init__(self) -> None:
        super().__init__()
        
        bert = transformers.DistilBertModel.from_pretrained(
            "distilbert-base-uncased",
        )
        
        # I can't afford to also tune BERT, nor do I need to
        bert.train(False)
        for param in bert.parameters():
            param.requires_grad = False
        
        # I want it to exclude bert from the perceived parameters
        object.__setattr__(self, "bert", bert)
        
        assert not self._modules, f"Opacity failure: {self._modules}"
    
    def forward(
        self,
        params: typing.Mapping[str, torch.Tensor],
    ) -> torch.Tensor:
        # (batch_size, MAX_TOKENS, EMBEDDING_SIZE)
        result: torch.Tensor = self.bert(**params).last_hidden_state
        
        # The [CLS] token is added at position 0 along axis 1
        return result[:, 0, :]
        
        # Sum the result along axis 1 with weights equal to the attention mask
        # (batch_size, EMBEDDING_SIZE)
        # return torch.sum(result * params["attention_mask"].unsqueeze(-1), dim=1)
        
    
    # A hack, but it works
    def to(self, device: torch.device) -> None:
        super().to(device)
        self.bert.to(device)


In [91]:
class MultiNLPModule(nn.Module):
    """
    Takes multiple inputs from named columns of a dataset,
    passes them to BERT, and concatenates the results.
    """
    
    inputs: list[str]
    submodule: NLPWrapperModule
    
    def __init__(
        self,
        inputs: typing.Collection[str],
    ) -> None:
        super().__init__()
        
        self.inputs = list(inputs)
        self.submodule = NLPWrapperModule()
    
    def forward(
        self,
        input_dict: typing.Mapping[str, torch.Tensor],
    ) -> torch.Tensor:
        assert set(input_dict.keys()).issuperset(self.inputs), \
            f"Missing parameters: expected {set(self.inputs)}, got only {set(input_dict.keys())}"

        return torch.cat([
            self.submodule(input_dict[name])
            for name in self.inputs
        ], dim=-1)

In [92]:
class YCTagPredictorConfig(transformers.modeling_utils.PretrainedConfig):
    model_type: typing.ClassVar[str] = "yc_tag_predictor"

    def __init__(self, **kwargs: typing.Any) -> None:
        super().__init__(**kwargs)


class YCTagPredictorModel(transformers.modeling_utils.PreTrainedModel):
    config_class = YCTagPredictorConfig

    def __init__(self, config: YCTagPredictorConfig) -> None:
        super().__init__(config)

        self.model = nn.Sequential(
            MultiNLPModule(inputs=(
                "name",
                "one_liner",
                "long_description",
            )),
            nn.Linear(
                in_features=EMBEDDING_SIZE * 3,
                out_features=len(all_tags),
            ),
            # TODO: Perhaps output another scalar to normalize by at the end?
            nn.Sigmoid(),
        )

    def forward(
        self,
        *,
        name: torch.Tensor,
        name_mask: torch.Tensor,
        one_liner: torch.Tensor,
        one_liner_mask: torch.Tensor,
        long_description: torch.Tensor,
        long_description_mask: torch.Tensor,
        **kwargs,
    ) -> torch.Tensor:
        logits: torch.Tensor = self.model(dict(
            name=dict(
                input_ids=name,
                attention_mask=name_mask,
            ),
            one_liner=dict(
                input_ids=one_liner,
                attention_mask=one_liner_mask,
            ),
            long_description=dict(
                input_ids=long_description,
                attention_mask=long_description_mask,
            ),
        ))
        
        # TODO: Return a dict with more info?
        return logits
    
    def ensure_device(self) -> None:
        """
        Our trick with the NLPModule opacity doesn't always seem to work, so this forces it
        """
        
        self.model[0].submodule.to(self.device)

In [212]:
model = YCTagPredictorModel(
    YCTagPredictorConfig(),
)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\Abel/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.11.3",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at C:\Users\Abel/.cache\huggingface\transformers\9c169103d7e5a73936dd2b627e42851bec0831212b6

In [213]:
with localize_globals():
    model.to(torch.device("cpu"))
    model.ensure_device()
    
    actual_shape = model(**next(train_dataset.iter(1)))[0].shape
    target_shape = next(iter(train_dataset))["tags"].shape

    logging.info(f"{actual_shape=}, {target_shape=}")

    assert actual_shape == target_shape, "Bad model result shape"


[INFO] actual_shape=torch.Size([329]), target_shape=torch.Size([329])


### Model training

In [214]:
training_args = transformers.TrainingArguments(
    output_dir="./training_output",
    logging_dir="./training_logs",
    label_names=["tags"],
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    remove_unused_columns=False,
    num_train_epochs=5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    do_train=True,
    do_eval=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [243]:
with localize_globals():
    import importlib
    importlib.reload(bp_mll)

In [244]:
class CustomTrainer(transformers.trainer.Trainer):
    def compute_loss(
        self,
        model: nn.Module,
        inputs: dict[str, typing.Any],
        return_outputs: bool = False,
    ) -> typing.Union[torch.Tensor, typing.Tuple[torch.Tensor, transformers.modeling_outputs.ModelOutput]]:
        # print("!!!", flush=True)
        labels = inputs.pop("tags")
        outputs = model(**inputs)
        
        # https://en.wikipedia.org/wiki/Multi-label_classification?useskin=vector#:~:text=neural%20networks%3A%20BP%2DMLL%20is%20an%20adaptation%20of%20the%20popular%20back%2Dpropagation%20algorithm%20for%20multi%2Dlabel%20learning.
        # https://github.com/idocx/BP_MLL_Pytorch
        loss: torch.Tensor = bp_mll.BPMLLLoss()(
            outputs, labels,
        )
        
        if loss.isnan().sum() > 0:
            raise ValueError("NaN in loss")
        
        return (loss, outputs) if return_outputs else loss

In [245]:
with localize_globals():
    print(bp_mll.BPMLLLoss()(
        torch.tensor([[0.1, 0.9, 0.5]]),
        torch.tensor([[0.0, 1.0, 1.0]]),
    ))

tensor(0.5598)


In [246]:
def compute_metrics(
    eval_pred: transformers.trainer_utils.EvalPrediction,
) -> dict[str, float]:
    predictions, labels = map(torch.tensor, eval_pred)
    predictions = (predictions >= 0.5)
    return {"accuracy": (predictions == labels).mean()}

In [247]:
trainer: transformers.trainer.Trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # compute_metrics=compute_metrics,
)

In [248]:
model.ensure_device()

In [249]:
trainer.train()

***** Running training *****
  Num examples = 3538
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2215


  0%|          | 0/2215 [00:00<?, ?it/s]



{'loss': 1.0003, 'learning_rate': 2.2522522522522523e-05, 'epoch': 0.23}
{'loss': 1.0, 'learning_rate': 4.5045045045045046e-05, 'epoch': 0.45}
{'loss': 1.0, 'learning_rate': 4.80431510286001e-05, 'epoch': 0.68}
{'loss': 1.0, 'learning_rate': 4.5534370296036125e-05, 'epoch': 0.9}


***** Running Evaluation *****
  Num examples = 619
  Batch size = 8


  0%|          | 0/78 [00:00<?, ?it/s]

Saving model checkpoint to ./training_output\checkpoint-443


{'eval_loss': 1.0, 'eval_runtime': 44.0616, 'eval_samples_per_second': 14.049, 'eval_steps_per_second': 1.77, 'epoch': 1.0}


Configuration saved in ./training_output\checkpoint-443\config.json
Model weights saved in ./training_output\checkpoint-443\pytorch_model.bin


{'loss': 1.0, 'learning_rate': 4.3025589563472155e-05, 'epoch': 1.13}
{'loss': 1.0, 'learning_rate': 4.0516808830908184e-05, 'epoch': 1.35}
{'loss': 1.0, 'learning_rate': 3.800802809834421e-05, 'epoch': 1.58}
{'loss': 1.0, 'learning_rate': 3.549924736578023e-05, 'epoch': 1.81}


***** Running Evaluation *****
  Num examples = 619
  Batch size = 8


  0%|          | 0/78 [00:00<?, ?it/s]

Saving model checkpoint to ./training_output\checkpoint-886
Configuration saved in ./training_output\checkpoint-886\config.json
Model weights saved in ./training_output\checkpoint-886\pytorch_model.bin


{'eval_loss': 1.0, 'eval_runtime': 63.9838, 'eval_samples_per_second': 9.674, 'eval_steps_per_second': 1.219, 'epoch': 2.0}




{'loss': 1.0, 'learning_rate': 3.299046663321626e-05, 'epoch': 2.03}
{'loss': 1.0, 'learning_rate': 3.0481685900652283e-05, 'epoch': 2.26}
{'loss': 1.0, 'learning_rate': 2.797290516808831e-05, 'epoch': 2.48}


KeyboardInterrupt: 

In [250]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 619
  Batch size = 8


  0%|          | 0/78 [00:00<?, ?it/s]

{'eval_loss': 1.0, 'eval_runtime': 60.8281, 'eval_samples_per_second': 10.176, 'eval_steps_per_second': 1.282, 'epoch': 2.52}


{'eval_loss': 1.0,
 'eval_runtime': 60.8281,
 'eval_samples_per_second': 10.176,
 'eval_steps_per_second': 1.282,
 'epoch': 2.52}

In [None]:
trainer.train(resume_from_checkpoint=True)