<a href="https://www.kaggle.com/code/jmostol/class-competition?scriptVersionId=94861118" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install -Uqq transformers
!pip install -Uqq datasets

In [None]:
import torch
import pandas as pd
from transformers import set_seed
# For reproducibility:
set_seed(42) # Set seed for `random`,`numpy`,`torch`, etc. (https://huggingface.co/docs/transformers/main/en/internal/trainer_utils#transformers.set_seed)

In [None]:
from datasets import Dataset

train_csv = "../input/class-competition-data/uazhlt-ling-539-sp-2022-2/train.csv"
test_csv = "../input/class-competition-data/uazhlt-ling-539-sp-2022-2/test.csv"

# Full training set. For a quick experiment, append e.g.
# `.sample(n=60000, random_state=42)` to train on a random subset instead.
df = pd.read_csv(train_csv)

dataset = Dataset.from_pandas(df)  # Convert to HF Dataset

# Hold out 1% as a small sanity-check evaluation set (validation matters little
# when the goal is only to submit). The seed is passed explicitly so the split
# is reproducible and stays the same when this cell is re-run.
film_review_datasets = dataset.train_test_split(test_size=0.01, seed=42)

# Coerce every TEXT value to `str`, one example at a time. Missing values are
# coerced too (presumably ending up as a literal string such as "None"), so
# that later steps only ever see strings.
film_review_datasets = film_review_datasets.map(lambda example: {"TEXT": str(example["TEXT"])})

In [None]:
# Name of the pretrained checkpoint; the tokenizer and the model are both loaded from it.
checkpoint: str = "distilbert-base-uncased"

In [None]:
from transformers import AutoTokenizer

# Tokenizer belonging to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    """Tokenize the ``TEXT`` field of ``example``, truncating over-long inputs."""
    texts = example["TEXT"]
    return tokenizer(texts, truncation=True)


def _copy_label_column(examples):
    """Duplicate the ``LABEL`` column under the name ``labels``."""
    return {"labels": examples["LABEL"]}


# Both steps run in batched mode over every split of the dataset dict.
tokenized_datasets = film_review_datasets.map(tokenize_function, batched=True).map(
    _copy_label_column, batched=True
)

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_metric
import numpy as np

# Sequence-classification model loaded from the checkpoint, configured for 3 labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

# Single epoch, batches of 16 for both training and evaluation,
# evaluation once per epoch, and no reporting integrations.
training_args = TrainingArguments(
    output_dir="test-trainer",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    seed=42,
    report_to="none",
)

# Collator built from the tokenizer; it pads the examples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the F1 metric once, instead of re-loading it on every evaluation call.
_f1_metric = load_metric("f1")


def compute_metrics(eval_preds):
    """Compute the macro-averaged F1 score for one evaluation pass.

    Args:
        eval_preds: A pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever the F1 metric's ``compute`` call returns for the predicted
        classes (arg-max over the last axis of ``logits``) against ``labels``.
    """
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit.
    predictions = np.argmax(logits, axis=-1)
    return _f1_metric.compute(predictions=predictions, references=labels, average="macro")

# Wire model, arguments, data and metric together; every argument is passed
# by keyword so the call does not depend on positional order.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; keep the returned summary and show it as the cell output.
train_output = trainer.train()
train_output

In [None]:
test_dataset = Dataset.from_pandas(pd.read_csv(test_csv))

# Prepare the test set the same way as the training data: coerce TEXT to `str`
# per example, then run the shared `tokenize_function` in batched mode, so both
# sets use the same truncation settings. No padding is requested here; batches
# are padded by the data collator that was handed to the Trainer.
test_dataset_tokenized = (
    test_dataset
    .map(lambda example: {"TEXT": str(example["TEXT"])})
    .map(tokenize_function, batched=True)
)

In [None]:
# Run inference over the tokenized test set.
results = trainer.predict(test_dataset=test_dataset_tokenized)

In [None]:
# Predicted class per row = index of the largest value along axis 1.
test_logits = results.predictions
classes = np.argmax(test_logits, axis=1)

In [None]:
# Pair each test ID with its predicted class, one row per example.
submission_rows = list(zip(test_dataset["ID"], classes))
final_preds = pd.DataFrame(submission_rows, columns=["Id", "Predicted"])

In [None]:
# Write the submission file without the DataFrame index.
# TODO: consider submitting through the Kaggle API instead.
submission_path = "submission.csv"
final_preds.to_csv(submission_path, index=False)