# Transformers session


## Set up

### Install packages

In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install --upgrade plotly
!pip install jupyter-dash

### Libraries

In [None]:
import evaluate
import numpy as np
import pandas as pd
import plotly.express as px #requires yaml safe_load() in config?

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    pipeline,
    TFAutoModelForSequenceClassification,
    DataCollatorWithPadding)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

## Tokenization

In [None]:
# Load the tokenizer associated with the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Sample text (ends with an emoji) reused by the tokenizer comparisons that follow
example_phrase = "The cat sat on the mat. It was miaowing because it was hungry. 🐱"
# Show how the BERT tokenizer splits the phrase into tokens
tokenizer.tokenize(example_phrase)

### Other tokenizers

In [None]:
# Load the GPT-2 and T5 tokenizers in one pass so their output can be
# compared with the BERT tokenizer on the same phrase.
tokenizer_gpt2, tokenizer_t5 = (
    AutoTokenizer.from_pretrained(checkpoint_name)
    for checkpoint_name in ("gpt2", "t5-small")
)

In [None]:
# Same phrase through the GPT-2 tokenizer, for comparison with the BERT output
tokenizer_gpt2.tokenize(example_phrase)

In [None]:
# Same phrase through the T5 tokenizer, for comparison with the BERT output
tokenizer_t5.tokenize(example_phrase)

### Adding tokens
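The BERT tokenizer doesn't recognise emojis, so the cat emoji is encoded as the unknown token. Adding the emoji to the tokenizer's vocabulary gives it a token of its own.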

In [None]:
# Encode the phrase to vocabulary ids, then map the ids back to tokens so
# the two representations can be compared line by line.
encoding = tokenizer.encode(example_phrase)
print(encoding, tokenizer.convert_ids_to_tokens(encoding), sep="\n")

In [None]:
# Register the emoji as a new vocabulary entry, then re-encode the phrase to
# see how its ids and tokens change.
# The token is passed as an explicit one-element list: list("🐱") would split
# the string into individual code points, which only works by accident for
# single-code-point emoji.
tokenizer.add_tokens(["🐱"])
encoding = tokenizer.encode(example_phrase)
print(encoding)
print(tokenizer.convert_ids_to_tokens(encoding))

### Padding & truncation

In [None]:
# Padding: encode both sentences as one batch, padding to the longest sequence.
# The tokenizer is called directly; batch_encode_plus is documented as
# deprecated in favour of __call__.
tokenizer(["The cat sat on the mat.", "It is sunny today."], padding="longest")["input_ids"]

In [None]:
# Truncation: encode both sentences as one batch, cutting each to at most 4 ids.
# The tokenizer is called directly; batch_encode_plus is documented as
# deprecated in favour of __call__.
tokenizer(["The cat sat on the mat.", "It is sunny today."], truncation=True, max_length=4)["input_ids"]

## Fine-tuning
The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.

In [None]:
# Load the MRPC configuration of the GLUE benchmark
raw_datasets = load_dataset("glue", "mrpc")

# Print the dataset overview with a blank line above and below it
print("", raw_datasets, "", sep="\n")

# Display the first training example as the cell output
raw_datasets["train"][0]

In [None]:
# Model version to use; this name is passed to the tokenizer here and to the
# model when it is created later in the notebook.
checkpoint = "bert-base-uncased"

# Load a fresh tokenizer for the checkpoint. This rebinds `tokenizer`, so the
# instance that had the emoji token added earlier is no longer used.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Helper function
def tokenize_function(example):
    """Tokenize the two sentences of an example as a pair.

    Args:
        example: Mapping with "sentence1" and "sentence2" entries. Because the
            function is mapped with ``batched=True``, each entry is presumably
            a list of strings rather than a single string.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the sentence pair,
        with truncation enabled.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

# Data collator: pads each batch and returns TensorFlow tensors
data_collator = DataCollatorWithPadding(return_tensors="tf", tokenizer=tokenizer)

# Apply the tokenizer helper to every split, processing examples in batches
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
)

# Build tokenized datasets for input into the model.
# The three splits share every setting except shuffling, so the conversion is
# written once and parameterised instead of being copy-pasted per split.
MODEL_INPUT_COLUMNS = ["attention_mask", "input_ids", "token_type_ids"]
BATCH_SIZE = 8


def _build_tf_dataset(split, shuffle):
    """Convert one tokenized split into a batched dataset for Keras.

    Args:
        split: Key into ``tokenized_datasets`` ("train", "validation" or "test").
        shuffle: Whether the examples of the split are shuffled.

    Returns:
        The result of ``to_tf_dataset`` for the requested split.
    """
    # NOTE(review): the raw label column is named "label"; requesting "labels"
    # presumably relies on the data collator renaming it. Confirm.
    return tokenized_datasets[split].to_tf_dataset(
        columns=MODEL_INPUT_COLUMNS,
        label_cols=["labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=BATCH_SIZE,
    )


# Only the training split is shuffled. Validation and test keep dataset order;
# the test predictions are later compared with raw_datasets["test"]["label"].
tf_train_dataset = _build_tf_dataset("train", shuffle=True)
tf_validation_dataset = _build_tf_dataset("validation", shuffle=False)
tf_test_dataset = _build_tf_dataset("test", shuffle=False)

In [None]:
# Create a sequence-classification model from the checkpoint with a
# two-class output
model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)
print()
model.summary()

# Compile model: the loss is configured with from_logits=True, i.e. it is
# applied to the model's raw logits.
model.compile(
    metrics=["accuracy"],
    loss=SparseCategoricalCrossentropy(from_logits=True),
    optimizer=Adam(learning_rate=5e-5),
)

In [None]:
# Fine-tune for three epochs on the training split, using the validation
# split as validation data
model.fit(tf_train_dataset, epochs=3, validation_data=tf_validation_dataset)

In [None]:
# Load the metric(s) associated with this dataset
metric = evaluate.load("glue", "mrpc")

# Predict on the test split and pick the highest-scoring class per example
preds = model.predict(tf_test_dataset)["logits"]
class_preds = np.argmax(preds, axis=1)

# Score the predicted classes against the test labels
accuracy_results = metric.compute(
    references=raw_datasets["test"]["label"],
    predictions=class_preds,
)
print(accuracy_results)

## Zero-shot (sentiment) classification
https://huggingface.co/course/chapter1/3?fw=tf  
https://colab.research.google.com/drive/1jocViLorbwWIkTXKwxCOV9HLTaDDgCaw?usp=sharing

In [None]:
# Checkpoint used for zero-shot classification
zs_model = "navteca/bart-large-mnli"

# Build the zero-shot classification pipeline on top of that checkpoint
classifier = pipeline(task="zero-shot-classification", model=zs_model)

In [None]:
# Candidate labels, and the sentence template each label is inserted into
labels = ["Impressed", "Helpful", "Curious", "Excited", "Surprising", "Appreciative", "Urgent"]
hypothesis_template = "This email sentiment is {}"

# Texts to classify
inputs = [
    "Summer starts now. Your FREE gift, from No7!",
    "Look no further for the items you need.",
    "A product you'll want to add to your daily routine",
    "Find your new Summer faves",
    "Your name's all over this... Let's get you all set for summer!",
    "Let us help - have a top self-care summer...",
    "WOW - enjoy Summer more with No7",
]

# Run classifier over the list of input texts with multi-label scoring enabled
results = classifier(
    inputs,
    hypothesis_template=hypothesis_template,
    candidate_labels=labels,
    multi_label=True,
)

In [None]:
# Put results into a data frame: one row per input text, one column per label.
# Each result becomes its own record, so repeated input texts stay as separate
# rows instead of overwriting each other in a dict keyed by the text.
tbl_df = pd.DataFrame(
    [
        {"Text": result["sequence"], **dict(zip(result["labels"], result["scores"]))}
        for result in results
    ]
)
tbl_df

In [None]:
# Visualise: reshape to long format (one row per text/label pair) so that
# each text can be drawn as its own closed line on a polar chart.
tbl_df_melt = pd.melt(tbl_df, id_vars="Text", var_name="Sentiment", value_name="Value")

fig = px.line_polar(
    tbl_df_melt,
    r="Value",
    theta="Sentiment",
    color="Text",
    line_close=True,
)
fig.show()