# Setup environment

In [None]:
!pip install transformers datasets evaluate accelerate

In [None]:
from datasets import load_dataset
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import wandb
import numpy as np
import evaluate
from huggingface_hub import notebook_login

# Open the Weights & Biases run in "disabled" mode up front, before any
# training code runs (presumably so the Trainer does not report to W&B --
# confirm). The run is closed again later with wandb.finish().
wandb.init(mode="disabled")

# Load dataset

In [None]:
# Hub coordinates. The dataset repo id has the form
# "<user>/<project>-<sub-project>-dataset".
HF_USERNAME = "LukeGPT88"
PROJECT_NAME = "patient-doctor-text-classifier"
SUB_PROJECT_NAME = "multilingual"
DATASET_NAME = HF_USERNAME + "/" + "-".join(
    (PROJECT_NAME, SUB_PROJECT_NAME, "dataset")
)

# Load the dataset stored under DATASET_NAME. Later cells index the result
# by the "train", "validation" and "test" splits.
dataset = load_dataset(DATASET_NAME)

In [None]:
# Normalize the schema in one pass: "Text" becomes "text" (the field the
# tokenizer function reads), "Encoding" becomes "label", and the "Label"
# column is dropped.
dataset = (
    dataset
    .rename_column("Text", "text")
    .rename_column("Encoding", "label")
    .remove_columns("Label")
)

# Tokenizer

In [None]:
# Checkpoint shared by the tokenizer (below) and the classification model
# (loaded later from the same name).
model_checkpoint = 'distilbert-base-uncased'
# Alternative checkpoint kept for reference only; it is not used anywhere:
# model_name = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
# NOTE(review): the dataset is tagged "multilingual" (SUB_PROJECT_NAME) while
# the active checkpoint name carries no multilingual marker -- confirm that
# this checkpoint is the intended one for the dataset's languages.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    ``examples`` must support ``examples["text"]``. No padding argument is
    passed to the tokenizer here. The tokenizer's output is returned as is.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Run preprocess_function over the dataset in batched mode; the result is
# what the Trainer is given as train/eval/test data.
tokenized_ds = dataset.map(preprocess_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding

# Padding collator built around the same tokenizer; it is handed to the
# Trainer as `data_collator` (preprocess_function itself does not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import numpy as np
import evaluate

def compute_metrics(eval_pred):
    """Score predictions with the module-level ``accuracy`` metric.

    ``eval_pred`` is unpacked into ``(predictions, labels)``. The predicted
    class of each row is the argmax of the predictions along axis 1, and the
    value returned is whatever ``accuracy.compute`` produces for those
    predicted classes against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# Metric object used by compute_metrics. Defining it after the function is
# fine: the name `accuracy` is only looked up when compute_metrics is called.
accuracy = evaluate.load("accuracy")

In [None]:
# Convert each split of the raw (untokenized) dataset to a pandas DataFrame,
# in the order train, validation, test.
train, validation, test = (
    dataset[split].to_pandas() for split in ("train", "validation", "test")
)

# TRAIN SECTION

In [None]:
# Class-id <-> class-name mappings passed to the model configuration.
# label2id is derived from id2label so the two cannot drift apart.
id2label = dict(enumerate(("PATIENT", "DOCTOR", "NEUTRAL")))
label2id = {name: idx for idx, name in id2label.items()}

### MODEL CONFIGURATION

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Short run name; reused below for the training output directory and for the
# Hub model id loaded by the inference pipeline.
task = "-".join((PROJECT_NAME, SUB_PROJECT_NAME))

# Sequence-classification model on top of the chosen checkpoint, configured
# with three labels and the id/name mappings defined above.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
)

# Trailing expression: displays the resulting configuration as cell output.
model.config

### TRAINING

In [None]:
# Hyper-parameters plus checkpoint/Hub settings for the fine-tuning run.
#
# * Logging, evaluation and checkpointing all happen once per epoch. The save
#   and eval strategies are kept identical because load_best_model_at_end
#   needs a checkpoint for every evaluation it compares.
# * `eval_strategy` is the current name of the former `evaluation_strategy`
#   argument. The old name is deprecated and is rejected by recent
#   transformers releases, which is what the unpinned `pip install` at the top
#   of the notebook pulls in.
# * The best checkpoint is selected on the "accuracy" value returned by
#   compute_metrics.
# * output_dir is derived from model_checkpoint (last path component) so the
#   directory name follows the checkpoint instead of repeating it by hand.
#
# NOTE(review): no hub_model_id is given, so the Hub repository name is
# presumably derived from output_dir ("<checkpoint>-finetuned-<task>"), while
# the inference cell loads f"{HF_USERNAME}/{task}" -- confirm which repository
# is intended and align the two.
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint.split('/')[-1]}-finetuned-{task}",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    push_to_hub=True,
)

# Wire model, arguments, tokenized splits, collator and metric function into
# a Trainer. Training uses the "train" split; per-epoch evaluation uses the
# "validation" split.
#
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated in favour of it (requires transformers >= 4.46).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Launch fine-tuning with the arguments configured above. As the last
# expression of the cell, the returned training summary is displayed.
trainer.train()

In [None]:
# Explicitly upload the trained model through the Trainer's Hub integration
# (the training arguments also set push_to_hub=True).
trainer.push_to_hub()

In [None]:
import wandb

# Close the W&B run that was opened with wandb.init(mode="disabled") at the
# top of the notebook.
wandb.finish()

## Use Test Set

In [None]:
# Run the trained model on the tokenized "test" split, which is not used as
# train or eval data by the Trainer. As the last expression of the cell, the
# prediction result is displayed.
trainer.predict(tokenized_ds['test'])

## Use pipeline

In [None]:
from transformers import pipeline
# Build a text-classification pipeline from the Hub model id
# "<HF_USERNAME>/<task>" and classify one sample sentence.
# NOTE(review): the training cell writes to
# "distilbert-base-uncased-finetuned-<task>" and sets no hub_model_id, so the
# repository pushed by this notebook may not be the one loaded here -- confirm
# that f"{HF_USERNAME}/{task}" exists and holds the intended model.
classifier = pipeline("text-classification", model=f"{HF_USERNAME}/{task}")
classifier("I see you’ve set aside this special time to humiliate yourself in public.")

## Hugging Face login

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login for the notebook session.
# NOTE(review): this cell sits after the cells that push to the Hub
# (push_to_hub=True in the training arguments, trainer.push_to_hub());
# presumably it has to be executed before them -- confirm the run order.
notebook_login()