In [211]:
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import numpy as np
from datetime import datetime

### Train a pre-trained model from Hugging Face
#### Load Dataset

In [212]:
# Load the dataset that was previously saved to disk and display its summary.
dataset_path = 'data/code_search_net_relevance.hf'
ds = Dataset.load_from_disk(dataset_path)
ds

Dataset({
    features: ['func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'relevance'],
    num_rows: 315
})

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# TODO: include code...
def preprocess(natural_language):
    """Tokenize natural-language text with the notebook-level `tokenizer`.

    Args:
        natural_language: A single string or a list of strings to tokenize.

    Returns:
        The tokenizer's encoding for the input. Sequences longer than the
        tokenizer's maximum length are truncated; no padding is applied here.
    """
    return tokenizer(natural_language, truncation=True)

# `Dataset.map` needs a callable that it applies to the rows itself. Calling
# `preprocess(...)` up front would tokenize the whole column once and hand the
# resulting encoding (not a function) to `map`. With `batched=True` the callable
# receives a dict of column name -> list of values for each batch.
data_tokens = ds.map(
    lambda batch: preprocess(natural_language=batch['func_documentation_string']),
    batched=True,
)

In [None]:
# Collator built around the notebook's tokenizer; it is handed to the Trainer
# so that padding is applied when batches are assembled.
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)

#### Evaluate

In [None]:
# Accuracy metric from the `evaluate` library; used by `compute_metrics`.
accuracy = evaluate.load(path="accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the `accuracy` metric.

    Args:
        eval_pred: A pair `(predictions, labels)`.
            Presumably `predictions` holds per-class scores with the classes
            along axis 1 - TODO confirm against the Trainer output.

    Returns:
        Whatever `accuracy.compute` returns for the arg-max class ids
        compared against `labels`.
    """
    scores, references = eval_pred
    # Collapse the per-class scores to the index of the highest-scoring class.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

#### Split train and test set

In [179]:
# Hold out 20% of the rows for evaluation. The fixed seed keeps the split (and
# therefore the reported metrics) reproducible across kernel restarts.
ds_split = ds.train_test_split(test_size=0.2, seed=42)
ds_split

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 909
})

#### Train

In [None]:
# Class id -> human-readable relevance label.
id2label = {
    0: 'irrelevant',
    1: 'partially relevant',
    2: 'relevant',
    3: 'very relevant',
}
# Inverse lookup (label -> class id), derived so the two maps cannot drift apart.
label2id = {label: class_id for class_id, label in id2label.items()}

In [None]:
# The classification head needs one output per relevance class. Derive the size
# from the label map so it cannot disagree with `id2label` / `label2id`.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

# The Trainer needs tokenized inputs and a target column named `labels`.
# Tokenize both splits here and rename the target column accordingly.
# NOTE(review): assumes `relevance` holds integer class ids matching the keys
# of `id2label` - confirm against the dataset.
tokenized_split = ds_split.map(
    lambda batch: preprocess(batch["func_documentation_string"]),
    batched=True,
).rename_column("relevance", "labels")

# Timestamp without spaces or colons so the run directory is a valid path on
# every platform.
run_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# NOTE(review): recent `transformers` releases rename `evaluation_strategy` to
# `eval_strategy`; adjust if the installed version rejects the argument.
training_arguments = TrainingArguments(
    output_dir="models/distilbert_v1_" + run_timestamp,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Train and evaluate on the split produced by `train_test_split`; `ds` itself
# is a single Dataset and has no "train" / "test" entries.
trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_split["train"],
    eval_dataset=tokenized_split["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()