<a href="https://colab.research.google.com/github/ndamulelonemakh/shared-notebooks/blob/main/1_0_nn_zabantu_misinformation_english.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Overview

* This notebook demonstrates how to fine-tune any [XLM-RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/xlm-roberta#xlm-roberta) model for text classification.
* We will use the [ZaBantu](https://huggingface.co/dsfsi/zabantu-xlm-roberta) pre-trained model by default, but you can change this to any model trained on the [XLM-R architecture](https://arxiv.org/abs/1911.02116)


---
* Environment Setup
* Global Parameters
* Data Preparation
* Preprocessing the data
* Fine-tuning the Pre-Trained Model
* Evaluate the Fine-Tuned Model

# Environment setup

In [None]:
# Install the libraries this notebook uses into the kernel's environment (quiet output).
# NOTE(review): versions are not pinned, so a fresh run may pick up newer releases.
%pip install --quiet transformers  datasets sentencepiece evaluate huggingface_hub

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Install or upgrade (-U) accelerate quietly.
%pip install accelerate -U --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/297.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h

> You may need to restart the kernel before proceeding to the next steps.

In [None]:
import transformers

# Show which transformers version is active in this kernel.
print(transformers.__version__)

* If your pre-trained **model is private**, uncomment the code below:

In [None]:
# Optional Hugging Face Hub login: only needed when the pre-trained model is private.
# Uncomment every line below to enable it.
# from huggingface_hub import notebook_login, whoami

# try:
#   whoami()
# except Exception:
#   print("User token not found, calling notebook_login()...")
#   notebook_login()

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from pathlib import Path

# Global Settings

In [None]:
task = "misinformation" # Can be any single label text classification task

# model options:
# - dsfsi/zabantu-xlm-roberta
# - dsfsi/zabantu-sot-ven-170m
# - dsfsi/zabantu-nso-ven-170m
# - dsfsi/zabantu-nso-120m
# - FacebookAI/xlm-roberta-base
model_checkpoint = "dsfsi/zabantu-sot-ven-170m"
# NOTE(review): not referenced by any visible cell; "dfsi" may be a typo for "dsfsi" - confirm.
dataset_checkpoint = "dfsi/misinformation-english"
local_data_path = "local_data"  # directory the raw CSV files are read from

push_to_hub_enabled = False
# Local output directory for checkpoints and the saved model. It keeps the
# "<org>/<model>" prefix of the base checkpoint, so it is a nested directory.
trained_model_checkpoint = f"{model_checkpoint}-finetuned-{task}"
# Hub repo ids have the form "<namespace>/<name>" (exactly one slash), so only the
# last path component of the output directory is used here. This also matches the
# Trainer's default repo name, which is derived from the output directory's name.
trained_model_checkpoint_hub = f"ndamulelonemakh/{trained_model_checkpoint.split('/')[-1]}"
batch_size = 16  # adjust depending on GPU size
epochs = 3

# Data Preparation

Once processed, your dataset must look something like this: a `text` column holding the raw text and a `label` column holding the class (`FAKE` or `TRUE`).

## 1. Load raw data

In [None]:
!unzip -n "true( English data).zip" -d local_data
!unzip -n "fake (English data).zip" -d local_data

In [None]:
# inspect data
!head -n 3 local_data/True.csv

In [None]:
!head -n 3 local_data/Fake.csv

## 2. Transform to DataFrame

In [None]:
def load_and_merge(fake_data_file: str, true_data_file: str, data_dir=None) -> pd.DataFrame:
  """
  Loads the fake and true CSV files and stacks them into one labelled DataFrame.

  Args:
    fake_data_file: File name of the fake data CSV, relative to the data directory.
    true_data_file: File name of the true data CSV, relative to the data directory.
    data_dir: Directory that contains both files. When omitted, the notebook-level
      `local_data_path` setting is used (the original behaviour).

  Returns:
    A Pandas DataFrame with the fake rows first and the true rows after them, a fresh
    0..n-1 index, and an added "label" column set to "FAKE" or "TRUE".
  """
  if data_dir is None:
    data_dir = local_data_path  # fall back to the global setting

  labelled_frames = []
  for file_name, label in ((fake_data_file, "FAKE"), (true_data_file, "TRUE")):
    frame = pd.read_csv(os.path.join(data_dir, file_name))
    frame["label"] = label
    labelled_frames.append(frame)

  # ignore_index=True renumbers the rows so the two files do not share index values
  return pd.concat(labelled_frames, ignore_index=True)

In [None]:
raw_df = load_and_merge("Fake.csv", "True.csv")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
raw_df.info()
# Fixed random_state so the displayed rows are the same on every run.
raw_df.sample(3, random_state=42)

In [None]:
# Horizontal bar chart of how many rows fall under each `subject` value.
raw_df["subject"].value_counts().plot.barh()

In [None]:
# Relative frequency of each label (normalize=True gives fractions instead of counts).
raw_df["label"].value_counts(normalize=True)

## 3. Convert to Transformers [Datasets](https://huggingface.co/docs/datasets/en/index)

In [None]:
from datasets import Dataset, ClassLabel, Sequence, DatasetDict, Value,Features

In [None]:
# prompt: complete this python function implementation: from datasets import Dataset, ClassLabel, Sequence, DatasetDict
# def pandas_to_huggingface_dataset(df: pd.DataFrame, split=True) -> DatasetDict:
#   pass

import pandas as pd
from datasets import Dataset, ClassLabel, Sequence, DatasetDict

def pandas_to_huggingface_dataset(df: pd.DataFrame,
                                  split=True,
                                  test_size=0.2,
                                  seed=42) -> DatasetDict:
  """
  Converts a Pandas DataFrame to a Hugging Face Dataset.

  Only the "text" and "label" columns are kept. "label" is declared as a
  ClassLabel with the names ["FAKE", "TRUE"].

  Args:
    df: The Pandas DataFrame to convert; must contain "text" and "label" columns.
    split: Whether to split the data into train and test sets.
    test_size: Fraction of rows placed in the test split (used when split is True).
    seed: Seed for the shuffled train/test split (used when split is True).

  Returns:
    The result of `train_test_split` (with "train" and "test" splits) when `split`
    is True. When `split` is False, the single unsplit Dataset is returned instead,
    despite the DatasetDict return annotation.
  """

  features = Features({
      "text": Value(dtype="string"),
      "label": ClassLabel(names=["FAKE", "TRUE"]),
  })
  df = df[["text", "label"]]  # we do not care about the rest for now
  # preserve_index=False: a non-default index (e.g. after filtering rows) would
  # otherwise be carried over as an extra column that `features` does not describe.
  dataset = Dataset.from_pandas(df, features=features, preserve_index=False)

  if not split:
    return dataset
  return dataset.train_test_split(test_size=test_size, shuffle=True, seed=seed)

# Build the train/test dataset from the merged frame and display its structure.
dataset = pandas_to_huggingface_dataset(raw_df, split=True)
dataset


In [None]:
# Display the first example of the train split.
dataset['train'][0]

In [None]:
# Display the feature schema of the train split (text + class label).
dataset['train'].features

# Data Pre-processing

## Tokenization

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to `model_checkpoint`.
# NOTE(review): whether `add_prefix_space=True` has any effect for this checkpoint's
# tokenizer cannot be told from this notebook - confirm.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

In [None]:
# Fail early if the loaded tokenizer is not a PreTrainedTokenizerFast.
# The Transformers documentation lists which model types have a fast tokenizer available.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [None]:
# Quick check: tokenize one sample sentence and display what the tokenizer returns.
tokenizer("Muthu munwe na munwe una mukovhe ware!")

In [None]:
# Show one training example next to the subword tokens the tokenizer produces for it.
print("===Subword Tokenisation Illustration===")
example = dataset["train"][4]

# Heavy rule followed by a blank line, then the untouched text.
print("=" * 50, end="\n\n")
print("ORIGINAL TEXT")
print(example["text"])

# Light rule followed by a blank line, then the token strings.
print("-" * 20, end="\n\n")
print("TOKENS:")
tokenized_input = tokenizer(example["text"], is_split_into_words=False)
input_ids = tokenized_input["input_ids"]
tokens = tokenizer.convert_ids_to_tokens(input_ids)
print(tokens)

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with truncation enabled.

    Args:
        examples: Mapping with a "text" entry to pass to the tokenizer.

    Returns:
        Whatever the module-level `tokenizer` returns for that text.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Apply preprocess_function to the dataset; batched=True hands it batches of examples.
tokenized_datasets = dataset.map(preprocess_function, batched=True)


In [None]:
# Print the first tokenized example of the test split.
print(tokenized_datasets['test'][0])

# Fine-tuning

In [None]:
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Build both directions of the label mapping from the dataset's ClassLabel names:
# a name's position in the list is its integer id.
label_names = dataset["train"].features["label"].names

label2id = dict(zip(label_names, range(len(label_names))))
id2label = {idx: name for name, idx in label2id.items()}
label2id

In [None]:
# Load the pre-trained checkpoint for sequence classification, sized to the number
# of labels and carrying the label mappings built earlier.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
import inspect

# NOTE(review): `model_name` is not used by any visible cell; kept in case later cells need it.
model_name = model_checkpoint.split("/")[-1]

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install cell does not pin a version, so pick whichever keyword the
# installed TrainingArguments actually accepts.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

args = TrainingArguments(
    trained_model_checkpoint,  # output directory for checkpoints
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    load_best_model_at_end=True,
    push_to_hub=push_to_hub_enabled,
    # Evaluate once per epoch, matching save_strategy above.
    **{eval_strategy_key: "epoch"},
)

In [None]:
# Define a collator that pads each batch when it is assembled (dynamic padding);
# the tokenization step above only truncates and does not pad.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair of (predictions, labels). The predictions are reduced with
            argmax over axis 1 (presumably per-class scores - confirm).

    Returns:
        The result of `accuracy.compute` for the arg-max class ids against the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Wire the model, training configuration, tokenized splits, padding collator and
# metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

# Evaluate the Fine-tuned Model

In [None]:
# Evaluate on the Trainer's eval_dataset (the "test" split) and display the metrics.
trainer.evaluate()

In [None]:
# Persist the fine-tuned model: save locally by default, or upload to the Hub
# when pushing is enabled in the global settings.
if not push_to_hub_enabled:
  trainer.save_model(trained_model_checkpoint)
else:
  trainer.push_to_hub()

### Example usage of your trained model using Huggingface pipelines

In [None]:
from transformers import pipeline

In [None]:
# The fine-tuned model is a sequence classifier (one label per input text), so it must
# be served through the text-classification pipeline. token-classification expects a
# model with a per-token head.
# Load from the Hub repo when the model was pushed, otherwise from the local save directory.
pipe = pipeline('text-classification', model=trained_model_checkpoint_hub if push_to_hub_enabled else trained_model_checkpoint)

In [None]:
# Run the pipeline on a hand-written example and display its prediction.
test_example = "Barack Hussein Obama was born in soweto. He is the only sibling to Donald Trump. He became the first black president in South Africa"
pipe(test_example)