In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
from datasets import Dataset, load_metric
import mmh3
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import torch
import random

In [3]:
MAX_LEN_CHARS = 256*3

In [4]:
df_t = pd.read_csv("data/eng_sentences.tsv", sep="\t", names=["id", "lang", "text"])
df_t = df_t[df_t["text"].str.len() < MAX_LEN_CHARS]

# shuffle
df_t["id"] = df_t["text"].map(lambda x: mmh3.hash64(x.encode('utf8'))[0])
df_t = df_t.sort_values("id")
len(df_t)

1582094

In [5]:
df_o = pd.read_csv("data/oss.tsv", sep="\t", names=["text"])
df_o = df_o[df_o["text"].str.len() < MAX_LEN_CHARS]

# shuffle
df_o["id"] = df_o["text"].map(lambda x: mmh3.hash64(x.encode('utf8'))[0])
df_o = df_o.sort_values("id")
len(df_o)

66939

In [6]:
df_g = pd.read_csv("data/gutenberg.tsv", sep="\t", names=["text"])
df_g = df_g[df_g["text"].str.len() < MAX_LEN_CHARS]

# shuffle
df_g["id"] = df_g["text"].map(lambda x: mmh3.hash64(x.encode('utf8'))[0])
df_g = df_g.sort_values("id")
len(df_g)

4102516

In [7]:
df_t.head(1)

Unnamed: 0,id,lang,text
1123557,-9223326714129212706,eng,Tom told everybody that he didn't know what to...


In [8]:
df_o.head(1)

Unnamed: 0,text,id
12494,Sometimes you need to access a type passed as ...,-9222670596738368818


In [9]:
df_g.head(1)

Unnamed: 0,text,id
1285889,To persons standing alone on a hill during a c...,-9223366647700897551


In [10]:
train_t_size = 1000000
train_o_size = 66939-20000
train_g_size = 1000000
val_size = 10000
test_size = 10000
assert(train_t_size + val_size + test_size <= len(df_t["text"]))
assert(train_o_size + val_size + test_size <= len(df_o["text"]))
assert(train_g_size + val_size + test_size <= len(df_g["text"]))

train_text = list(df_t["text"])[:train_t_size] + list(df_o["text"])[:train_o_size] + list(df_g["text"])[:train_g_size]
random.Random(4).shuffle(train_text)

val_text = list(df_t["text"])[-val_size-test_size:-test_size] + list(df_o["text"])[-val_size-test_size:-test_size] \
+ list(df_g["text"])[-val_size-test_size:-test_size]

test_text = list(df_t["text"])[-test_size:] + list(df_o["text"])[-test_size:] + list(df_g["text"])[-test_size:]

len(train_text), len(val_text), len(test_text)

(2046939, 30000, 30000)

In [11]:
train_text[:3]

["And with my sister-nymphs I sport, Till the broad sun looks o'er the floods; Then, swift we seek our crystal court, Deep in the wave, 'mid Neptune's woods.",
 '"To learn to know nothing, go whither you are ignorant.',
 "He's very angry with her."]

In [12]:
label_list = ['O', 'B-COMMA']
label_encoding_dict = {'O': 0, 'B-COMMA': 1}

model_checkpoint = "distilbert-base-uncased"
batch_size = 128

In [13]:
if "roberta" in model_checkpoint:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
else:
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))
args = TrainingArguments(
    "comma-" + model_checkpoint,
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=1e-5,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [14]:
def to_dataset(texts):
    tokens, tags = [], []
    for text in texts:
        text_tokens, text_tags = [], []
        for token in text.split(" "):
            if "," in token:
                tag = "B-COMMA"
                token = token.replace(",", "")
            else:
                tag = "O"
            text_tokens.append(token)
            text_tags.append(tag)
        tokens.append(text_tokens)
        tags.append(text_tags)
    return Dataset.from_pandas(pd.DataFrame({
        'tokens': tokens, 'tags': tags
    }))

In [15]:
train_dataset = to_dataset(train_text)
val_dataset = to_dataset(val_text)
test_dataset = to_dataset(test_text)

train_dataset

Dataset({
    features: ['tokens', 'tags'],
    num_rows: 2046939
})

In [16]:
def tokenize_and_align_labels(examples):
    label_all_tokens = True
    tokenized_inputs = tokenizer(list(examples["tokens"]), truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif label[word_idx] == '0':
                label_ids.append(0)
            elif word_idx != previous_word_idx:
                label_ids.append(label_encoding_dict[label[word_idx]])
            else:
                label_ids.append(label_encoding_dict[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
        
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_tokenized_datasets = train_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized_datasets = test_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/2047 [00:00<?, ?ba/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

In [17]:
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [[label_list[p] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]
    true_labels = [[label_list[l] for (p, l) in zip(prediction, label) if l != -100] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }
    
trainer = Trainer(
    model,
    args,
    train_dataset=train_tokenized_datasets,
    eval_dataset=test_tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [18]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2046939
  Num Epochs = 3
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 1
  Total optimization steps = 11994
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mfirefish[0m (use `wandb login --relogin` to force relogin)




Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,0.1133,0.067751,0.794868,0.639238,0.708608,0.965684
1000,0.0821,0.060605,0.798126,0.702518,0.747276,0.968984
1500,0.0769,0.056908,0.813108,0.720223,0.763852,0.970932
2000,0.0737,0.05502,0.801613,0.751576,0.775789,0.971643
2500,0.0717,0.053289,0.810327,0.758103,0.783346,0.972628
3000,0.0703,0.051341,0.826926,0.74796,0.785464,0.97333
3500,0.0689,0.050377,0.828726,0.754266,0.789745,0.973785
4000,0.0678,0.0498,0.844578,0.742492,0.790252,0.974273
4500,0.0646,0.048948,0.819652,0.780019,0.799345,0.974438
5000,0.0643,0.048478,0.826019,0.77852,0.801566,0.97484


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 128
Saving model checkpoint to comma-distilbert-base-uncased/checkpoint-500
Configuration saved in comma-distilbert-base-uncased/checkpoint-500/config.json
Model weights saved in comma-distilbert-base-uncased/checkpoint-500/pytorch_model.bin
tokenizer config file saved in comma-distilbert-base-uncased/checkpoint-500/tokenizer_config.json
Special tokens file saved in comma-distilbert-base-uncased/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `DistilBertF

TrainOutput(global_step=11994, training_loss=0.06678361417214751, metrics={'train_runtime': 3961.8432, 'train_samples_per_second': 1549.99, 'train_steps_per_second': 3.027, 'total_flos': 2.6779427709548736e+17, 'train_loss': 0.06678361417214751, 'epoch': 3.0})

In [19]:
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: tags, tokens. If tags, tokens are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 30000
  Batch size = 128


{'eval_loss': 0.046164751052856445,
 'eval_precision': 0.8358295734685334,
 'eval_recall': 0.7924108127177316,
 'eval_f1': 0.8135412893879029,
 'eval_accuracy': 0.9762905010887603,
 'eval_runtime': 25.3199,
 'eval_samples_per_second': 1184.838,
 'eval_steps_per_second': 2.33,
 'epoch': 3.0}

In [20]:
trainer.save_model("comma-" + model_checkpoint + "-3domains")

Saving model checkpoint to comma-distilbert-base-uncased-3domains
Configuration saved in comma-distilbert-base-uncased-3domains/config.json
Model weights saved in comma-distilbert-base-uncased-3domains/pytorch_model.bin
tokenizer config file saved in comma-distilbert-base-uncased-3domains/tokenizer_config.json
Special tokens file saved in comma-distilbert-base-uncased-3domains/special_tokens_map.json
