In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_START_METHOD"] = "thread"

In [2]:
from datasets import Dataset, load_metric
import mmh3
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch
import random

In [3]:
MAX_LEN_CHARS = 256*3

In [4]:
df_t = pd.read_csv("data/eng_sentences.tsv", sep="\t", names=["id", "lang", "text"])
df_t = df_t[df_t["text"].str.len() < MAX_LEN_CHARS]

# shuffle
df_t["id"] = df_t["text"].map(lambda x: mmh3.hash64(x.encode('utf8'))[0])
df_t = df_t.sort_values("id")
len(df_t)

1582094

In [5]:
df_o = pd.read_csv("data/oss.tsv", sep="\t", names=["text"])
df_o = df_o[df_o["text"].str.len() < MAX_LEN_CHARS]

# shuffle
df_o["id"] = df_o["text"].map(lambda x: mmh3.hash64(x.encode('utf8'))[0])
df_o = df_o.sort_values("id")
len(df_o)

66939

In [6]:
df_g = pd.read_csv("data/gutenberg.tsv", sep="\t", names=["text"])
df_g = df_g[df_g["text"].str.len() < MAX_LEN_CHARS]

# shuffle
df_g["id"] = df_g["text"].map(lambda x: mmh3.hash64(x.encode('utf8'))[0])
df_g = df_g.sort_values("id")
len(df_g)

4102516

In [7]:
df_t.head(1)

Unnamed: 0,id,lang,text
1123557,-9223326714129212706,eng,Tom told everybody that he didn't know what to...


In [8]:
df_o.head(1)

Unnamed: 0,text,id
12494,Sometimes you need to access a type passed as ...,-9222670596738368818


In [9]:
df_g.head(1)

Unnamed: 0,text,id
1285889,To persons standing alone on a hill during a c...,-9223366647700897551


In [10]:
train_t_size = 1500000
train_o_size = 66939-20000
train_g_size = 4000000
val_size = 10000
test_size = 10000
assert(train_t_size + val_size + test_size <= len(df_t["text"]))
assert(train_o_size + val_size + test_size <= len(df_o["text"]))
assert(train_g_size + val_size + test_size <= len(df_g["text"]))

train_text = list(df_t["text"])[:train_t_size] + list(df_o["text"])[:train_o_size] + list(df_g["text"])[:train_g_size]
random.Random(4).shuffle(train_text)

val_text = list(df_t["text"])[-val_size-test_size:-test_size] + list(df_o["text"])[-val_size-test_size:-test_size] \
+ list(df_g["text"])[-val_size-test_size:-test_size]

test_text = list(df_t["text"])[-test_size:] + list(df_o["text"])[-test_size:] + list(df_g["text"])[-test_size:]

len(train_text), len(val_text), len(test_text)

(5546939, 30000, 30000)

In [11]:
train_text[:3]

["I usually go to Australia for my summer vacation, but this year I'm planning to go to New Zealand.",
 'Their grandmothers had suffered the hardships of frontier life, had known the horrors of savage warfare when the beautiful valleys of the Connecticut and the Merrimack were threaded with Indian trails from Canada to the white settlements.',
 'He was observant, truthful, and kindly--perhaps the chief requisites in a good story-teller.']

In [12]:
label_list = ['O', 'B-COMMA']
label_encoding_dict = {'O': 0, 'B-COMMA': 1}

model_checkpoint = "roberta-base"
batch_size = 64

In [13]:
if "roberta" in model_checkpoint:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
args = TrainingArguments(
    "comma-" + model_checkpoint,
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size//2,
    num_train_epochs=5,
    weight_decay=1e-5,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

In [14]:
def to_dataset(texts):
    tokens, tags = [], []
    for text in texts:
        text_tokens, text_tags = [], []
        for token in text.split(" "):
            if "," in token:
                tag = "B-COMMA"
                token = token.replace(",", "")
            else:
                tag = "O"
            text_tokens.append(token)
            text_tags.append(tag)
        tokens.append(text_tokens)
        tags.append(text_tags)
    return Dataset.from_pandas(pd.DataFrame({
        'tokens': tokens, 'tags': tags
    }))

In [15]:
train_dataset = to_dataset(train_text)
val_dataset = to_dataset(val_text)
test_dataset = to_dataset(test_text)

train_dataset

Dataset({
    features: ['tokens', 'tags'],
    num_rows: 5546939
})

In [16]:
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/5547 [00:00<?, ?ba/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

In [17]:
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }
    
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [18]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5546939
  Num Epochs = 5
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 54170
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mfirefish[0m (use `wandb login --relogin` to force relogin)




Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,0.0951,0.04807,0.845389,0.778862,0.810763,0.975888
1000,0.0677,0.044202,0.832788,0.82309,0.827911,0.977308
1500,0.0647,0.041961,0.835176,0.835588,0.835382,0.978161
2000,0.0631,0.040636,0.849924,0.826523,0.83806,0.978817
2500,0.0614,0.040822,0.862119,0.814182,0.837465,0.979042
3000,0.0606,0.039742,0.856817,0.825401,0.840816,0.979274
3500,0.0599,0.039496,0.851967,0.837361,0.844601,0.979565
4000,0.0596,0.038367,0.852529,0.84526,0.848879,0.980042
4500,0.0588,0.037484,0.864797,0.83303,0.848616,0.98029
5000,0.0584,0.037699,0.865854,0.829036,0.847045,0.980144


The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 32
Saving model checkpoint to comma-roberta-base/checkpoint-500
Configuration saved in comma-roberta-base/checkpoint-500/config.json
Model weights saved in comma-roberta-base/checkpoint-500/pytorch_model.bin
tokenizer config file saved in comma-roberta-base/checkpoint-500/tokenizer_config.json
Special tokens file saved in comma-roberta-base/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.

TrainOutput(global_step=54170, training_loss=0.050950363251431, metrics={'train_runtime': 32047.6374, 'train_samples_per_second': 865.421, 'train_steps_per_second': 1.69, 'total_flos': 2.4190737124734285e+18, 'train_loss': 0.050950363251431, 'epoch': 5.0})

In [19]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 32


{'eval_loss': 0.03436991572380066,
 'eval_precision': 0.8730498066863771,
 'eval_recall': 0.8613934702120498,
 'eval_f1': 0.8671824702095215,
 'eval_accuracy': 0.9825013578970082,
 'eval_runtime': 48.3887,
 'eval_samples_per_second': 619.979,
 'eval_steps_per_second': 2.439,
 'epoch': 5.0}

In [20]:
trainer.save_model("comma-" + model_checkpoint + "-3domains-more-data")

Saving model checkpoint to comma-roberta-base-3domains-more-data
Configuration saved in comma-roberta-base-3domains-more-data/config.json
Model weights saved in comma-roberta-base-3domains-more-data/pytorch_model.bin
tokenizer config file saved in comma-roberta-base-3domains-more-data/tokenizer_config.json
Special tokens file saved in comma-roberta-base-3domains-more-data/special_tokens_map.json
