# Kurdish Summarization Model (PyTorch)

In [None]:
!git lfs install
!git config --global user.email "your github email"
!git config --global user.name "your github username"

In [None]:
#get a token from huggingface to login
from huggingface_hub import notebook_login

notebook_login()

In [None]:
#!nvidia-smi

In [None]:
from IPython.display import clear_output

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [None]:
# !pip uninstall utils
# !pip install datasets evaluate transformers[sentencepiece]
# #!pip install accelerate
# # To run the training on TPU, you will need to uncomment the followin line:
# #!pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# clear_output()

You will need to set up git: adapt your email and name in the `git config` cell at the top of this notebook.

You will also need to be logged in to the Hugging Face Hub. Execute the `notebook_login()` cell near the top of this notebook and enter your credentials (an access token from your Hugging Face account).

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
from datasets import load_dataset
# Relative paths of the three JSON files, keyed by the split name they are loaded as.
data_files = {
    "train":"Final_Dataset/train.json",
    "test":"Final_Dataset/test.json",
    "validation":"Final_Dataset/validation.json"
}
# field="data" is passed so that the records are taken from the "data" key of each JSON file.
dataset = load_dataset('json',data_files=data_files,field="data")

# Bare expression: shows the loaded splits as the cell output.
dataset

In [None]:
def show_samples(dataset, num_samples=3, seed=42):
    """Print a few shuffled article/summary pairs from the training split.

    Args:
        dataset: Mapping with a "train" entry that supports
            ``.shuffle(seed=...)`` followed by ``.select(indices)``.
        num_samples: How many examples to print.
        seed: Seed handed to ``shuffle`` so the same examples come back each run.
    """
    shuffled_train = dataset["train"].shuffle(seed=seed)
    picked = shuffled_train.select(range(num_samples))
    for row in picked:
        # The leading newline separates consecutive examples in the output.
        print(f"\n'>> Article: {row['article']}'")
        print(f"'>> Summary: {row['summary']}'")


#show_samples(dataset)

In [None]:
from transformers import AutoTokenizer

# Hub id of the pretrained checkpoint. The same name is reused later to load the
# model and to derive the training output directory name.
model_checkpoint = "pedramyamini/ku_t5_base"
# use_auth_token=True is passed — presumably so the download uses the credentials
# stored by notebook_login(); confirm whether the checkpoint is private.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,use_auth_token=True)

In [None]:
# inputs = tokenizer(dataset["train"][0]["summary"])
# inputs

In [None]:
# dataset["train"][0]["summary"]

In [None]:
# tokenizer.convert_ids_to_tokens(inputs.input_ids)

In [None]:
# Maximum number of tokens kept for the summary (labels) and for the article (inputs).
max_target_length = 128
max_input_length = 512


def preprocess_function(examples):
    """Tokenize articles and summaries into model inputs with loss-safe labels.

    Both the article and the summary are truncated and padded to their fixed
    maximum lengths. Because the labels are already padded here, the data
    collator never gets a chance to pad them with -100, so the padding positions
    are rewritten to -100 in this function. Without this step the loss would
    also be computed on padding tokens.

    Args:
        examples: Mapping with "article" and "summary" entries. With
            ``Dataset.map(..., batched=True)`` each entry is a list of strings;
            a single string per entry is handled as well.

    Returns:
        The tokenizer output for the articles with an added "labels" entry that
        holds the summary token ids, padding replaced by -100.
    """
    model_inputs = tokenizer(
        examples["article"], max_length=max_input_length, truncation=True, padding='max_length'
    )
    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["summary"], max_length=max_target_length, truncation=True, padding='max_length'
        )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index the loss ignores; compute_metrics maps it back to the pad id.
        return [tok if tok != pad_id else -100 for tok in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        model_inputs["labels"] = [_mask_padding(seq) for seq in label_ids]
    else:
        # Single example: one flat id sequence.
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

In [None]:
# !pip install rouge-metric
# clear_output()

In [None]:
from rouge_metric import PyRouge

# Load summary results
hypotheses = ["وەزارەتی تەندروستیی هەرێمی کوردستان ئاماری ٢٤ کاژێری رابردووی تایبەت بە ڤایرۆسی کۆرۆنای بڵاوکردەوە و رایگەیاند، ٦٧١ تووشبووی نوێی کۆرۆنا تۆمارکراون و ٢٥ تووشبووش گیانیان لەدەستداوە. هاوکات ٦٧١ تووشبووی پێشووی ڤایرۆسەکەش چاکبوونەتەوە."]
references = [["وەزارەتی تەندروستیی هەرێمی کوردستان ئاماری ٢٤ کاژێری رابردووی کۆرۆنای بڵاوکردەوە و ئاماژەی بەوە کردووە، ٦٧١ تووشبووی نوێ دەستنیشانکراون و ٣٢ تووشبووش گیانیان لەدەستداوە."]]

# Evaluate document-wise ROUGE scores
# ROUGE-1, ROUGE-2 and ROUGE-L are enabled; the W, S and SU variants are switched off.
# This `rouge` object is reused further down by evaluate_baseline, compute_metrics
# and evaluate_model.
rouge = PyRouge(rouge_n=(1, 2), rouge_l=True, rouge_w=False,
                rouge_w_weight=1.2, rouge_s=False, rouge_su=False, skip_gap=4)
scores = rouge.evaluate(hypotheses, references)
# Print only the "r" entry (presumably recall) of ROUGE-2 for the sample pair above.
print(scores["rouge-2"]["r"])

In [None]:
# !pip install klpt
# clear_output()

In [None]:
from klpt.tokenize import Tokenize

tokenizer_ckb = Tokenize("Sorani", "Arabic")

def three_sentence_summary(text):
    """Return the first three sentences of `text`, one per line.

    Sentences are split with the module-level `tokenizer_ckb` sentence
    tokenizer. Texts with fewer than three sentences are returned in full.
    """
    sentences = tokenizer_ckb.sent_tokenize(text)
    leading = sentences[:3]
    return "\n".join(leading)


#print(three_sentence_summary(dataset["train"][1]["article"]))

In [None]:
def evaluate_baseline(dataset):
    """Score the first-three-sentences baseline against the reference summaries.

    Args:
        dataset: Iterable of examples with "article" and "summary" entries.

    Returns:
        Whatever the module-level `rouge.evaluate` returns for the baseline
        hypotheses and the single-reference lists.
    """
    # Each example contributes exactly one reference, wrapped in its own list.
    gold = []
    for row in dataset:
        gold.append([row["summary"]])

    baseline = []
    for row in dataset:
        baseline.append(three_sentence_summary(row["article"]))

    return rouge.evaluate(baseline, gold)

evaluate_baseline(dataset["test"])

In [None]:
evaluate_baseline(dataset["validation"])

In [None]:
#this works well
#!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116

In [None]:
#!conda install pytorch torchvision torchaudio cudatoolkit=11.6 -c pytorch -c conda-forge

In [None]:
# import torch
# x = torch.rand(5, 3)
# print(x)

In [None]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint,use_auth_token=True)

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

In [None]:
# import transformers
# print(transformers.__file__)

In [None]:
from transformers import Seq2SeqTrainingArguments

# Per-device batch size used for training.
batch_size = 4
num_train_epochs = 5
# Show the training loss with every epoch
# NOTE(review): this is 0 when the train split has fewer than batch_size rows — confirm
# that the dataset is always larger than that.
logging_steps = len(tokenized_datasets["train"]) // batch_size
# Keep only the part of the checkpoint id after the last "/".
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    # The directory name encodes the checkpoint name plus the input/target length limits.
    output_dir=f"{model_name}-finetuned-rudaw-ku-"+str(max_input_length)+"-"+str(max_target_length),
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    # NOTE(review): batch_size//batch_size always evaluates to 1 (and raises
    # ZeroDivisionError for batch_size == 0) — confirm that an evaluation batch
    # size of 1 is intended, then consider writing the literal.
    per_device_eval_batch_size=batch_size//batch_size,
    weight_decay=0.01,
    save_strategy="epoch",
    num_train_epochs=num_train_epochs,
    # Needed so that compute_metrics receives generated sequences to decode.
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)

In [None]:
import numpy as np

from klpt.tokenize import Tokenize
tokenizer_ckb = Tokenize("Sorani", "Arabic")


def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. The
            predictions may also arrive wrapped in a tuple, in which case the
            first element is used.

    Returns:
        The result of the module-level `rouge.evaluate` on the decoded,
        sentence-per-line predictions and single-reference label lists.
    """
    predictions, labels = eval_pred
    # Some configurations return a tuple whose first element holds the token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 is a padding marker, not a vocabulary id, so it cannot be decoded.
    # It can appear in the predictions as well as in the labels.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(tokenizer_ckb.sent_tokenize(pred.strip())) for pred in decoded_preds]
    # Each prediction has exactly one reference, wrapped in its own list.
    decoded_labels = [["\n".join(tokenizer_ckb.sent_tokenize(label.strip()))] for label in decoded_labels]
    # Compute ROUGE scores
    return rouge.evaluate(decoded_preds, decoded_labels)

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Drop every column that came from the raw dataset (names taken from the train split),
# leaving only the fields added by preprocess_function.
# NOTE(review): this rebinds tokenized_datasets, so re-running the cell asks to remove
# columns that are already gone — presumably an error; re-run the map cell first.
tokenized_datasets = tokenized_datasets.remove_columns(
    dataset["train"].column_names
)

In [None]:
features = [tokenized_datasets["train"][i] for i in range(2)]
data_collator(features)
clear_output()

In [None]:
#!git lfs install

In [None]:
tokenizer.pad_token_id

In [None]:
from torch import nn
from transformers import Trainer
import math

class CustomTrainer(Trainer):
    """Trainer subclass that exposes `compute_loss` as a customisation hook.

    The previous body called the non-existent ``math.abs`` and produced a plain
    Python integer, which cannot be back-propagated. It also printed the full
    labels and model outputs on every step. This version returns the loss the
    model computes from the labels, so the class is usable as a starting point
    for a custom loss. It is not referenced by the trainer built elsewhere in
    this notebook.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run a forward pass and return the model's own loss.

        Args:
            model: The model being trained.
            inputs: Batch mapping passed to the model as keyword arguments; it
                must contain "labels" so the model can compute a loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            **kwargs: Accepted and ignored so that callers passing additional
                keyword arguments keep working.

        Raises:
            ValueError: If the batch carries no labels.
        """
        if inputs.get("labels") is None:
            raise ValueError("CustomTrainer.compute_loss requires 'labels' in the batch.")
        # forward pass
        outputs = model(**inputs)
        # Dict-like outputs expose the loss by key, tuple outputs carry it first.
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        # To plug in a custom criterion, compute it here from outputs["logits"]
        # and inputs["labels"]; the result must be a tensor that supports backward().
        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers import Seq2SeqTrainer

# Build the trainer from the model, the training arguments and the tokenized splits.
# Training uses the "train" split; per-epoch evaluation uses the "validation" split.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    # ROUGE is computed on the decoded generations by compute_metrics.
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
from transformers import pipeline

#hub_model_id = "pedramyamini/" + output_dir
# Hard-coded Hub id of the fine-tuned checkpoint; the name follows the same pattern
# as the output_dir built for the training arguments.
hub_model_id = "pedramyamini/ku_t5_base-finetuned-rudaw-ku-512-128"
# Summarization pipeline used by the evaluation and novelty cells below.
summarizer = pipeline("summarization", model=hub_model_id,use_auth_token=True)

In [None]:
from functools import partial
from random import randint


def randints(count, *randint_args):
    """Return `count` pairs of random integers.

    Args:
        count: Number of pairs to produce.
        *randint_args: Arguments forwarded unchanged to ``random.randint`` for
            every draw (its inclusive lower and upper bounds).

    Returns:
        A list of ``(first, second)`` tuples; each element is an independent draw.
    """
    pairs = []
    for _ in range(count):
        first = randint(*randint_args)
        second = randint(*randint_args)
        pairs.append((first, second))
    return pairs

# Draw 5 pairs of row indices into the test split; randint bounds are inclusive, hence the -1.
# NOTE(review): no random seed is set in this cell, so the selection differs between runs.
random_idxs = randints(5, 0, len(dataset["test"]) - 1)
# Only the second value of each pair is kept; the first draw is discarded.
random_idxs = [rand[1] for rand in random_idxs]
# Bare expression: shows the selected indices as the cell output.
random_idxs

In [None]:
def print_summary(idx):
    """Print the article, reference summary and generated summary of one test example.

    The notebook called this helper without defining it anywhere, so the cell
    failed with a NameError on a fresh kernel.

    Args:
        idx: Row index into ``dataset["test"]``.
    """
    example = dataset["test"][idx]
    generated = summarizer(example["article"])[0]["summary_text"]
    print(f"'>>> Article: {example['article']}'")
    print(f"\n'>>> Reference summary: {example['summary']}'")
    print(f"\n'>>> Generated summary: {generated}'")


# Iterate over the sampled indices themselves. The previous loop passed the
# positions 0..len-1, which ignored the random selection.
for idx in random_idxs:
  print_summary(idx)
  print('--------------------------------------------------------------------------------')

In [None]:
print(len(tokenizer("وەزارەتی دارایی حکومەتی هەرێمی کوردستان رایدەگەیێنێت، کە بەپێی خشتەکە سبەینێ سێ وەزارەت مووچەی مانگی حوزەیرانی رابردوو وەردەگرن و سێشەممەش مووچە دابەشکردنی ئەم مانگە تەواو دەبێت.")["input_ids"]))
print(len(tokenizer("بەرپرسێکی باڵای وەزارەتی بازرگانیی عێراق رایگەیاند، بەرهەمی برنجی خۆماڵی لەم وەرزەدا تا ئێستا گەیشتووەتە نزیکەی ٧٥ هەزار تۆن")["input_ids"]))
print(len(tokenizer("راوێژكاری ئەنجوومەنی ئاسایشی هەرێمی كوردستان بۆ كاروباری وزە رایگەیاند، تەقاندنەوەی بۆڕیی گواستنەوەی نەوتی هەرێمی کوردستان بۆ تورکیا، رۆژانە ١٠ ملیۆن دۆلار زیان لە هەرێمی کوردستان دەدات.'")["input_ids"]))

In [None]:
from transformers import AutoTokenizer

# Rebinds model_checkpoint and tokenizer (both first assigned earlier in the notebook)
# to the fine-tuned checkpoint. Cells that read model_checkpoint behave differently
# depending on whether they run before or after this one.
model_checkpoint = "pedramyamini/ku_t5_base-finetuned-rudaw-ku-512-128"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint,use_auth_token=True)

In [None]:
from transformers import pipeline, AutoTokenizer

# Same fine-tuned checkpoint id as in the earlier pipeline cell; this cell lets the
# evaluation below run without repeating the training cells.
hub_model_id = "pedramyamini/ku_t5_base-finetuned-rudaw-ku-512-128"
# NOTE(review): the tokenizer is loaded without use_auth_token while the pipeline passes
# it — confirm which of the two the checkpoint actually needs.
tokenizer = AutoTokenizer.from_pretrained(hub_model_id)
summarizer = pipeline("summarization", model=hub_model_id,use_auth_token=True)

In [None]:
def evaluate_model(dataset):
    """Score the summarizer's output against the reference summaries.

    Args:
        dataset: Iterable of examples with "article", "summary" and
            "article_token_count" entries.

    Returns:
        Whatever the module-level `rouge.evaluate` returns for the generated
        hypotheses and the single-reference lists.
    """
    # Each example contributes exactly one reference, wrapped in its own list.
    gold = []
    for row in dataset:
        gold.append([row["summary"]])

    generated = []
    for row in dataset:
        # NOTE(review): find_min_length is not defined in the visible notebook —
        # confirm where it comes from before running on a fresh kernel.
        lower_bound = find_min_length(row["article_token_count"])
        outputs = summarizer(row["article"], min_length=lower_bound, max_length=256)
        generated.append(outputs[0]["summary_text"])

    return rouge.evaluate(generated, gold)

evaluate_model(dataset["test"])

In [None]:
from nltk import ngrams
from klpt.preprocess import Preprocess
import re

final_results_each_instance = []

def preprocess(text):
    """Strip punctuation and stopwords from `text`.

    The previous version appended the filtered tokens to the unfiltered text
    instead of replacing it, so stopwords were never removed and every kept word
    appeared twice. This version returns only the kept tokens.

    Args:
        text: Raw text (article or summary).

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.
    """
    p = Preprocess("Sorani", "Arabic", "Arabic")
    # Raw string: the backslash is itself one of the characters to strip, and a
    # non-raw literal would treat "\," as an invalid escape sequence.
    punctuations = r'''!()-[]{};:'"\,<>.،:«»؛/?@#$%^&*_~'''
    # Delete every punctuation character in a single pass.
    stripped = text.translate(str.maketrans("", "", punctuations))
    kept_tokens = [token for token in stripped.split() if token not in p.stopwords]
    return " ".join(kept_tokens)


def _unigrams_and_bigrams(text):
    """Return the unigram and bigram tuple lists of the preprocessed `text`."""
    tokens = preprocess(text).split()
    unigrams = [(token,) for token in tokens]
    bigrams = list(zip(tokens, tokens[1:]))
    return unigrams, bigrams


def _novel_fraction(candidate_ngrams, source_ngrams):
    """Return the share of candidate n-gram occurrences that are absent from the source.

    An empty candidate list yields 0.0 instead of a ZeroDivisionError. This
    happens, for example, with the bigrams of a one-word summary.
    """
    if not candidate_ngrams:
        return 0.0
    novel = sum(1 for gram in candidate_ngrams if gram not in source_ngrams)
    return novel / len(candidate_ngrams)


def ngram_novelty(source, ref_sum, model_sum, baseline_sum):
    """Compute unigram and bigram novelty of three summaries with respect to the source.

    Novelty is the fraction of a summary's n-gram occurrences that do not occur
    in the source, after both texts have gone through `preprocess`.

    Args:
        source: The article text.
        ref_sum: The reference summary.
        model_sum: The model-generated summary.
        baseline_sum: The baseline summary.

    Returns:
        A dict with the keys ``{ref,model,baseline}_sum_{unigram,bigram}_novelty``.
        The same dict is also appended to the module-level
        `final_results_each_instance` list.
    """
    source_unigrams, source_bigrams = _unigrams_and_bigrams(source)
    # Sets make each membership test constant-time; only presence matters here.
    source_unigram_set = set(source_unigrams)
    source_bigram_set = set(source_bigrams)

    results = {}
    for label, summary in (("ref_sum", ref_sum), ("model_sum", model_sum), ("baseline_sum", baseline_sum)):
        unigrams, bigrams = _unigrams_and_bigrams(summary)
        results[f"{label}_unigram_novelty"] = _novel_fraction(unigrams, source_unigram_set)
        results[f"{label}_bigram_novelty"] = _novel_fraction(bigrams, source_bigram_set)

    # Keep the per-example record, as before.
    final_results_each_instance.append(results)

    return results

#calculating ngram novelty
# Running sums of the per-example novelty ratios; they are divided by the number
# of test examples further down to obtain the averages.
ref_sum_unigram_novelty_total = 0
ref_sum_bigram_novelty_total = 0
model_sum_unigram_novelty_total = 0
model_sum_bigram_novelty_total = 0
baseline_sum_unigram_novelty_total = 0
baseline_sum_bigram_novelty_total = 0

# Progress counter: printed for each example and wiped again by clear_output().
counter = 0
for instance in dataset["test"]:
  print(counter)
  counter+=1
  source = instance["article"]
  # Model summary generated with fixed bounds (min_length=96, max_length=128).
  model_sum = summarizer(instance["article"], min_length=96, max_length=128)[0]["summary_text"]
  ref_sum = instance["summary"]
  # Baseline summary: the first three sentences of the article.
  baseline_sum = three_sentence_summary(source)
  # Besides returning the ratios, ngram_novelty appends them to final_results_each_instance.
  result = ngram_novelty(source, ref_sum, model_sum, baseline_sum)

  ref_sum_unigram_novelty_total += result["ref_sum_unigram_novelty"]
  ref_sum_bigram_novelty_total += result["ref_sum_bigram_novelty"]
  model_sum_unigram_novelty_total += result["model_sum_unigram_novelty"]
  model_sum_bigram_novelty_total += result["model_sum_bigram_novelty"]
  baseline_sum_unigram_novelty_total += result["baseline_sum_unigram_novelty"]
  baseline_sum_bigram_novelty_total += result["baseline_sum_bigram_novelty"]
  clear_output()

#avg novelty
# NOTE(review): an empty test split would make the divisions below raise ZeroDivisionError.
test_count = len(dataset["test"])

avg_ref_sum_unigram_novelty = ref_sum_unigram_novelty_total/test_count
avg_ref_sum_bigram_novelty = ref_sum_bigram_novelty_total/test_count
avg_model_sum_unigram_novelty = model_sum_unigram_novelty_total/test_count
avg_model_sum_bigram_novelty = model_sum_bigram_novelty_total/test_count
avg_baseline_sum_unigram_novelty = baseline_sum_unigram_novelty_total/test_count
avg_baseline_sum_bigram_novelty = baseline_sum_bigram_novelty_total/test_count

# Report the six averaged novelty ratios.
print("avg_ref_sum_unigram_novelty: ",avg_ref_sum_unigram_novelty)
print("avg_ref_sum_bigram_novelty: ",avg_ref_sum_bigram_novelty)
print("avg_model_sum_unigram_novelty: ",avg_model_sum_unigram_novelty)
print("avg_model_sum_bigram_novelty: ",avg_model_sum_bigram_novelty)
print("avg_baseline_sum_unigram_novelty: ",avg_baseline_sum_unigram_novelty)
print("avg_baseline_sum_bigram_novelty: ",avg_baseline_sum_bigram_novelty)

In [None]:
# Spot check: compute the novelty ratios for the first test example only.
source = dataset["test"][0]["article"]
model_sum = summarizer(source, min_length=96, max_length=128)[0]["summary_text"]
ref_sum = dataset["test"][0]["summary"]
baseline_sum = three_sentence_summary(source)
# NOTE: this call also appends one more entry to final_results_each_instance.
result = ngram_novelty(source, ref_sum, model_sum, baseline_sum)
print(result)