# Fine-Tuning AlephBert model with Hebrew Descriptive Sentences dataset

## Installations

In [None]:
! pip install transformers datasets --quiet
!sudo apt-get install git-lfs --quiet
!git-lfs install --quiet
!pip install wandb --quiet

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset, load_metric
import datasets

### Load Descriptive_Sentences_He dataset from [Huggingface](https://huggingface.co/datasets/orisuchy/Descriptive_Sentences_He) 🤗 

In [None]:
# Load the dataset by its Hugging Face Hub repository id.
descriptive_dataset = load_dataset("orisuchy/Descriptive_Sentences_He")
# Bare last expression so the notebook displays the loaded object.
descriptive_dataset

# Change all labels to IDs
"Descriptive" -> 0
<br>
"NotDescriptive" -> 1


In [None]:
def lable2ID(w):
    """Replace the string label of one example with its integer class id.

    "Descriptive" becomes 0 and any other string label becomes 1. A label
    that is already an integer is left unchanged, so running the ``map``
    cell a second time on an already-converted dataset does not turn every
    0 into 1.

    Args:
        w: A single example (dict-like row) that has a "label" entry.

    Returns:
        The same ``w`` object, with ``w["label"]`` updated in place.
    """
    label = w["label"]
    if isinstance(label, int):
        # Already converted (e.g. the cell was executed twice): keep the id.
        return w
    w["label"] = 0 if label == "Descriptive" else 1
    return w

In [None]:
# Convert the string labels to integer ids by running lable2ID on each example.
descriptive_dataset = descriptive_dataset.map(lable2ID)

The dataset is a `DatasetDict` object, which you can index into to view an example:

In [None]:
# Display the "train" split.
descriptive_dataset["train"]

In [None]:
# Display the first example of the "train" split.
descriptive_dataset["train"][0]

### Preprocess

The next step is to tokenize the text into a format the model can read.

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the same AlephBERT checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("onlplab/alephbert-base")
# Other available Hebrew models
# tokenizer = AutoTokenizer.from_pretrained("avichr/heBERT")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
# Quick check: display the tokens produced for a short Hebrew sentence.
tokenizer.tokenize("?שלום מה נשמע", truncation=True)


A function that tokenizes the text and truncates longer sequences
so that they are no longer than the model's maximum input length:

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" field of ``examples`` with the notebook's tokenizer.

    Truncation is enabled, so over-long inputs are cut by the tokenizer.
    Returns whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

Using 🤗 Datasets `map` function to apply the preprocessing function to the entire dataset. 
`batched=True` to apply the preprocessing function to multiple elements of the dataset at once for faster
preprocessing:

In [None]:
# batched=True: preprocess_function is called with batches of examples.
tokenized_descriptive = descriptive_dataset.map(preprocess_function, batched=True)
# Display the tokenized dataset.
tokenized_descriptive

Padding the tokenized texts so that the sequences in a batch have a uniform length.

In [None]:
from transformers import DataCollatorWithPadding

# Collator, built on the same tokenizer, that pads examples when batches are formed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Fine-tune with the Trainer API

Loading [AlephBert](https://huggingface.co/onlplab/alephbert-base) model with the [AutoModelForSequenceClassification](https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForSequenceClassification) class along with the number of expected labels:

In [None]:
from transformers import AutoModelForSequenceClassification

# num_labels=2 matches the two class ids used above (0 = Descriptive, 1 = NotDescriptive).
model = AutoModelForSequenceClassification.from_pretrained("onlplab/alephbert-base", num_labels=2)
# Other available Hebrew models
# model = AutoModelForSequenceClassification.from_pretrained("avichr/heBERT", num_labels=2)
# model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)

Defining metric

In [None]:
# Accuracy metric object; compute_metrics below calls its compute() method.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into (logits, labels).

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row
        of logits, compared against the reference labels.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the highest score on the last axis.
    predicted_ids = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_ids, references=gold_labels)

# Try other metrics
# accuracy_score = load_metric('accuracy')
# f1_score = load_metric('f1')
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     # returns a dict like {'f1':0.54221}
#     f1 = f1_score.compute(predictions=predictions, references=labels)
#     # returns a dict like {'accuracy': 0.3241}
#     acc = accuracy_score.compute(predictions=predictions, references=labels)
#     # merge the two dictionaries
#     return {**f1, **acc}

Defining training hyperparameters in [TrainingArguments](https://huggingface.co/docs/transformers/master/en/main_classes/trainer#transformers.TrainingArguments).

In [None]:
from transformers import TrainingArguments, Trainer
# Directory handed to TrainingArguments as output_dir.
path = './finetuning_results'
training_args = TrainingArguments(
    output_dir=path,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # label_names = ['Descriptive', 'NotDescriptive'],
    report_to="wandb",  # report training logs to Weights & Biases
    logging_steps=48,  # log every 48 training steps
    evaluation_strategy="epoch",  # run evaluation once per epoch
    num_train_epochs=12,
    weight_decay=0.01,
)

Passing the training arguments to a [Trainer](https://huggingface.co/docs/transformers/master/en/main_classes/trainer#transformers.Trainer) along with the model, dataset, tokenizer, and data collator.

In [None]:
# Bundle the model, hyperparameters, tokenized splits, metric function,
# tokenizer and padding collator into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_descriptive["train"],
    eval_dataset=tokenized_descriptive["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Calling `Trainer.train()` to fine-tune the model.

In [None]:
# Fine-tune the model, then run a final evaluation; only the result of the
# last expression (evaluate) is displayed by the notebook.
trainer.train()
trainer.evaluate()

# Uploading to Huggingface

In [None]:
# Log in to the Hugging Face Hub from the shell before pushing the model.
!huggingface-cli login

In [None]:
# Hub repository that receives the fine-tuned model.
git_path = "orisuchy/Descriptive_Classifier"
# NOTE(review): only trainer.model is pushed here. The testing section later
# loads a tokenizer from this same repository, so the tokenizer files
# presumably have to be uploaded as well (not shown in this notebook) - confirm.
trainer.model.push_to_hub(repo_path_or_name=git_path)

# **Testing everything**

## Loading dataset and model

Loading dataset

In [None]:
# Fresh copy of the dataset; its labels are still strings (see labeltoId below).
dataset = load_dataset("orisuchy/Descriptive_Sentences_He")

Function to convert labels to IDs

In [None]:
def labeltoId(s):
    """Return the integer class id for a label value.

    "Descriptive" maps to 0; every other value maps to 1.
    """
    return 0 if s == "Descriptive" else 1

Loading model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the fine-tuned classifier and its tokenizer from the Hub repository.
# This rebinds the `tokenizer` and `model` names used in the training section.
tokenizer = AutoTokenizer.from_pretrained("orisuchy/Descriptive_Classifier")
model = AutoModelForSequenceClassification.from_pretrained("orisuchy/Descriptive_Classifier")

Defining metric

In [None]:
# Accuracy metric used to score the predictions collected in the test loop.
metric = datasets.load_metric("accuracy")

Testing

In [None]:
# Score the fine-tuned classifier on the test split, one sentence at a time.
references_lst = []
predictions_lst = []
for example in dataset["test"]:
    encoded = tokenizer(example["text"], return_tensors='pt', truncation=True, padding=True)
    # The model returns an output object, not a tensor: the class scores are in
    # its `.logits` field (one row here, because a single sentence is encoded).
    logits = model(**encoded).logits
    # Detach from the autograd graph before converting to NumPy, then take the
    # index of the highest score as a plain Python int so that the collected
    # predictions form a flat 1-D array for the metric.
    prediction = int(np.argmax(logits.detach().cpu().numpy(), axis=-1)[0])
    references_lst.append(labeltoId(example["label"]))
    predictions_lst.append(prediction)
score = metric.compute(predictions=np.array(predictions_lst), references=np.array(references_lst))
score