In [None]:
! pip install loguru==0.7.2  evaluate==0.4.1 -q

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
from datasets import Dataset


def load_dataset(data_path, test_size=0.3, seed=42) -> Dataset:
    """Load the headerless label/text CSV and return a reproducible, stratified split.

    Args:
        data_path: Path to a CSV file without a header row; the first column is
            read as ``label`` and the second as ``text``.
        test_size: Fraction of the rows placed in the ``test`` split.
        seed: Seed used for both the shuffle and the split, so that calling the
            function again reproduces exactly the same partition.

    Returns:
        The result of ``train_test_split`` with a ``train`` and a ``test``
        split. Every row carries ``label`` and ``text`` as strings plus
        ``label2``, a class-encoded copy of ``label`` used only for
        stratification.

    Raises:
        ValueError: presumably raised by the stratified split when a class has
            too few rows to appear in both splits — verify against the
            installed ``datasets`` version.
    """
    stratify_column_name = "label2"
    frame = pd.read_csv(data_path, header=None, names=['label', 'text'])
    frame['label'] = frame['label'].astype(str)
    frame['text'] = frame['text'].astype(str)
    # Duplicate of the label column; it is class-encoded below while ``label``
    # stays a plain string that can be tokenized as the generation target.
    frame[stratify_column_name] = frame['label']

    dataset = Dataset.from_pandas(frame)
    dataset = dataset.shuffle(seed=seed)
    # Stratification needs a ClassLabel feature, hence the encoding step.
    dataset = dataset.class_encode_column(stratify_column_name)

    # Passing the seed and the stratification column makes the split both
    # repeatable across calls and balanced across classes.
    return dataset.train_test_split(
        test_size=test_size,
        stratify_by_column=stratify_column_name,
        seed=seed,
    )

In [None]:
# Read the raw export and keep only the label and text columns, in that order.
df = pd.read_csv("/kaggle/input/documents-raw/documents_raw.csv")[["Class", "Document"]]
df

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Write the two columns without index or header row; load_dataset() reads the
# file back with header=None and assigns its own column names.
data_path = "/kaggle/working/documents.csv"
df.to_csv(data_path, index=False, header=False)

In [None]:
# Number of distinct values in the Class column (a missing value counts as one).
df["Class"].nunique(dropna=False)

In [None]:
import evaluate
import nltk
import numpy as np
from typing import List, Tuple
from nltk.tokenize import sent_tokenize
from datasets import Dataset, concatenate_datasets
from huggingface_hub import HfFolder
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

# Identifier of the pretrained checkpoint passed to from_pretrained() for both
# the tokenizer and the model further down.
MODEL_ID = "google/flan-t5-base"

In [None]:
# Build the train/test splits from the CSV written to data_path.
dataset = load_dataset(data_path)
dataset

In [None]:
# Number of distinct labels present in the test split.
dataset['test'].to_pandas()['label'].nunique(dropna=False)

In [None]:
# Rows per label in the test split.
dataset['test'].to_pandas()['label'].value_counts()

In [None]:
# Rows per label in the train split.
dataset['train'].to_pandas()['label'].value_counts()

In [None]:
MODEL_ID = "google/flan-t5-base"
# Load the tokenizer that belongs to the MODEL_ID checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [None]:
# F1 metric object consumed by compute_metrics.
metric = evaluate.load("f1")

# Tokenize every document of both splits (with truncation enabled) and record
# the longest resulting sequence; preprocess_function uses it as max_length
# for the inputs.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda batch: tokenizer(batch["text"], truncation=True),
    batched=True,
    remove_columns=['text', 'label', 'label2'],
)
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

In [None]:
# Tokenize every label string of both splits and record the longest resulting
# sequence; preprocess_function uses it as max_length for the targets.
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda batch: tokenizer(batch["label"], truncation=True),
    batched=True,
    remove_columns=['text', 'label', 'label2'],
)
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

In [None]:
# Name derived from the part of MODEL_ID after the "/"; used below as the
# training output directory and as the Hub model id.
REPOSITORY_ID = f"{MODEL_ID.split('/')[1]}-text-classification"

In [None]:
# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir=REPOSITORY_ID,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False,     # Overflows with fp16
    learning_rate=3e-4,
    num_train_epochs=5,
    logging_dir=f"{REPOSITORY_ID}/logs",    # logging & evaluation strategies
    logging_strategy="epoch",
    evaluation_strategy="no",   # no evaluation pass is scheduled during training
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=REPOSITORY_ID,
    # Never hardcode the token. Read it from the environment; an unset or
    # empty variable becomes None so that a previously stored login is used
    # instead of an empty credential.
    hub_token=os.environ.get("HF_TOKEN") or None,
)

def preprocess_function(sample: Dataset, padding: str = "max_length") -> dict:
    """Tokenize a batch of examples into model inputs plus label ids.

    Reads the ``text`` and ``label`` fields of ``sample`` and relies on the
    module-level ``tokenizer``, ``max_source_length`` and ``max_target_length``.

    Args:
        sample: Batch exposing ``text`` and ``label`` sequences of strings.
        padding: Padding strategy forwarded to the tokenizer for both inputs
            and targets.

    Returns:
        The tokenizer output for the texts with an extra ``labels`` entry that
        holds the tokenized targets.
    """
    documents = list(sample["text"])
    model_inputs = tokenizer(documents, max_length=max_source_length, padding=padding, truncation=True)

    # Targets go through the ``text_target`` keyword argument.
    targets = tokenizer(text_target=sample["label"], max_length=max_target_length, padding=padding, truncation=True)
    label_ids = targets["input_ids"]

    # With fixed-length padding, pad positions are replaced by -100 so that
    # they are ignored in the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [[-100 if token == pad_id else token for token in row] for row in label_ids]

    model_inputs["labels"] = label_ids
    return model_inputs


def postprocess_text(preds: List[str], labels: List[str]) -> Tuple[List[str], List[str]]:
    """Normalise decoded predictions and references in the same way.

    Every string is stripped of surrounding whitespace, split into sentences
    with ``sent_tokenize`` and re-joined with one sentence per line (the
    newline-separated layout that rougeLSum expects).

    Returns:
        A ``(preds, labels)`` tuple of the normalised lists.
    """
    def _one_sentence_per_line(text: str) -> str:
        return "\n".join(sent_tokenize(text.strip()))

    normalised_preds = [_one_sentence_per_line(pred) for pred in preds]
    normalised_labels = [_one_sentence_per_line(label) for label in labels]
    return normalised_preds, normalised_labels


def compute_metrics(eval_preds):
    """Compute macro F1 (in percent) and the mean generated length.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays; when
            ``predictions`` is a tuple only its first element is used.

    Returns:
        Dict with the metric values scaled to percent (rounded to 10 decimals)
        and ``gen_len``, the mean number of non-pad tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded; swap it for the pad
    # token in the labels and, defensively, in the predictions as well.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # The "f1" metric works on integer class ids, while the decoded values are
    # free-form strings. Map every distinct string (including predictions that
    # match no reference class) to an id; macro F1 does not depend on which id
    # a class receives.
    class_ids = {
        name: idx
        for idx, name in enumerate(sorted(set(decoded_labels) | set(decoded_preds)))
    }
    result = metric.compute(
        predictions=[class_ids[pred] for pred in decoded_preds],
        references=[class_ids[label] for label in decoded_labels],
        average='macro',
    )
    result = {k: round(v * 100, 10) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [None]:
# Run preprocess_function over both splits in batches and drop the raw columns.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['text', 'label', 'label2'],
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

In [None]:
# Load the pretrained seq2seq checkpoint named by MODEL_ID.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)

In [None]:
# Fetch the NLTK "punkt" data; presumably required by sent_tokenize, which
# postprocess_text calls — TODO confirm.
nltk.download("punkt")

# Value written into padded label positions; we want to ignore the tokenizer
# pad token in the loss.
label_pad_token_id = -100
# Data collator that assembles batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8
)

In [None]:
# Create the Trainer instance from the model, the training arguments, the
# collator and the tokenized splits.
# NOTE(review): training_args sets evaluation_strategy="no", so eval_dataset
# and compute_metrics are presumably only used when evaluation is requested
# explicitly — confirm.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Show a summary of the tokenized test split.
tokenized_dataset["test"]

In [None]:
# Run fine-tuning with the configuration held by `trainer`.
trainer.train()

In [None]:
# Write the trained model into the REPOSITORY_ID directory.
trainer.model.save_pretrained(REPOSITORY_ID)

In [None]:
# Store the tokenizer in the same directory as the model, then create a model
# card and push to the Hub.
tokenizer.save_pretrained(REPOSITORY_ID)
trainer.create_model_card()
trainer.push_to_hub()

In [None]:
import torch
from tqdm.auto import tqdm

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.metrics import classification_report

# Evaluation reuses the `dataset` split created before training. Splitting the
# data again at this point risks a different partition, in which case documents
# seen during training could end up in the "test" split and inflate the scores.

# Load the fine-tuned model and tokenizer from the local output directory.
checkpoint_dir = f"/kaggle/working/{REPOSITORY_ID}"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)
# Trailing semicolon keeps the cell from printing the full model repr.
model.to('cuda' if torch.cuda.is_available() else 'cpu');

In [None]:
def classify(text_to_classify: str, max_input_length: int = 512) -> str:
    """Classify a text using the model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text_to_classify: Raw document text.
        max_input_length: Inputs longer than this many tokens are truncated.

    Returns:
        The decoded generation, with special tokens removed.
    """
    # truncation=True enforces max_length; without it long documents are passed
    # to the model at full length. A single sequence needs no padding.
    inputs = tokenizer(
        text_to_classify,
        truncation=True,
        max_length=max_input_length,
        return_tensors='pt',
    )
    # Put the tensors on whatever device the model currently lives on.
    inputs = inputs.to(model.device)
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def evaluate_model() -> None:
    """Evaluate the model on the test dataset.

    Classifies every text of ``dataset['test']`` with ``classify`` and prints a
    classification report comparing the predictions with the ``label`` column.
    """
    test_split = dataset['test']
    # Read each column once; indexing dataset['test']['text'][i] inside the
    # loop fetches the column again on every iteration.
    texts = test_split['text']
    labels = test_split['label']

    predictions_list, labels_list = [], []
    # tqdm wraps the iterable directly, so no manual progress updates are needed.
    for text, label in tqdm(zip(texts, labels), total=len(test_split)):
        predictions_list.append(classify(text))
        labels_list.append(str(label))

    report = classification_report(labels_list, predictions_list, zero_division=0)
    print(report)

In [None]:
 evaluate_model()  # classify the test split and print the report

In [None]:
# Sample news article used below as a manual input for classify().
text_to_classify ="""A 15-year-old boy has been arrested on suspicion of murdering Harry Pitman, who was fatally stabbed on New Year's Eve.

Harry, 16, from Haringey, was attacked in Primrose Hill, north London, at about 23:40 GMT on Sunday.

The boy was arrested - along with an 18-year-old man on suspicion of affray - on Tuesday night, the Met said.

Specialist detectives have found there was no indication the attack was racially motivated, the force added.

Vigil held for boy killed in New Year's Eve stabbing
Teen killed in New Year's Eve stabbing named
Det Ch Insp Geoff Grogan, who is leading the investigation, said despite having made the arrests, he was "still very keen to hear from anyone who has footage or information".

On Sunday, a 16-year-old boy was arrested at the scene on suspicion of murder and later released on bail pending further inquiries.
"""

In [None]:
# Split on any whitespace. Splitting on a single space leaves newlines inside
# tokens, so the last word of one line and the first word of the next are
# fused into one token.
tokens = text_to_classify.split()
tokens

In [None]:
# Keep only purely alphanumeric tokens and lowercase them.
# NOTE(review): str.isalnum() is False for any token containing a non-alphanumeric
# character, so words with attached punctuation are dropped entirely rather
# than cleaned — confirm this is intended.
tokens_pun_lower = [word.lower() for word in filter(str.isalnum, tokens)]
tokens_pun_lower

In [None]:
from nltk.corpus import stopwords
import nltk

# The stop-word list is separate NLTK data that this notebook never fetched;
# download it so the cell also works in a fresh environment.
nltk.download("stopwords", quiet=True)

# A set makes each membership test constant-time.
stop_words = set(stopwords.words('english'))
tokens_stop = [token for token in tokens_pun_lower if token not in stop_words]
tokens_stop

In [None]:
# Re-assemble the filtered tokens into one space-separated string.
text_to_class = " ".join(tokens_stop)
text_to_class

In [None]:
# Classify the filtered sample text.
classify(text_to_class)