In [1]:
pip install transformers==4.28.0

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.30.1
    Uninstalling transformers-4.30.1:
      Successfully uninstalled transformers-4.30.1
Successfully installed transformers-4.28.0
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=2efb426e5275318032030c27418027ea8be4c4ab76a3a37b40a10b747b6d8369
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0m

In [3]:
import torch
import csv
from transformers import pipeline
from torch.nn.functional import cosine_similarity
from sentence_transformers import SentenceTransformer
from transformers import Trainer, TrainingArguments
import json
from typing import Dict
import math
import pandas as pd
from zipfile import ZipFile
from typing import Dict, List
import random
import os
from tempfile import TemporaryDirectory
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig, AdamW
from transformers import MBartTokenizer, MBartForConditionalGeneration
from transformers import BertTokenizer, AutoModelForSeq2SeqLM
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [4]:
class ArabicTransformers:
    def __init__(self, model_name):
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)

    def summarization(self, text, desired_length):
        summarizer = pipeline("summarization", model=self.model_name)
        summary = summarizer(text, max_length=desired_length, min_length=desired_length)
        return summary

    def optimize_performance(self):
        """
        Optimizes the performance of the model.
        You can include techniques like model quantization, compression, or GPU acceleration.
        """
        try:
            import torch
        except ImportError as e:
            raise ImportError("PyTorch library is not installed. Please install it with `pip install torch`.")

        # Enable GPU acceleration if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(device)

In [5]:
def create_submission(output_file_path : str, submission_dictionary : Dict[int, str], base_keys : List[int]) -> None:
    """Function that validates the submission data types and schema and zip it to be ready from submission

    Parameters
    ----------
    output_file_path : str
        The locaiton and file name you want to save the zip file at, ex : "/home/user/submission_123.zip"
    submission_dictionary : dict[int, str]
        dictionary of int keys (example_id) and string values (summary)
    base_keys: list[int]
        list of keys of the original unlabeled validation set


    Returns
    -------
    None
    """
    #assertions
    assert all(isinstance(i, int) for i in submission_dictionary.keys()), "Make sure example_ids elements (key of submission_dictionary) are of type int"
    assert all(isinstance(i, str) for i in submission_dictionary.values()), "Make sure summary elements (value of submission_dictionary) are of type str"
    assert all(isinstance(i, int) for i in base_keys), "Make sure base_keys elements is of type int"

    diff_sub = set(submission_dictionary.keys()) - set(base_keys)
    diff_base = set(base_keys) - set(submission_dictionary.keys())

    assert len(diff_sub) == 0, f"Keys {diff_sub} is in submission but not in base_keys"
    assert len(diff_base) == 0, f"Keys {diff_base} is in base_keys but not in submission"

    #saving
    final_submission = pd.DataFrame(submission_dictionary.items(), columns=['example_id', 'summary'])

    if final_submission.example_id.dtype != 'int64' :
        final_submission.example_id = final_submission.example_id.astype(int)

    assert len(final_submission[final_submission.summary.isna()]) == 0, f"summaries with the example_id = {final_submission[final_submission.summary.isna()].example_id.values.tolist()} is NaN"
    assert len(final_submission[final_submission.example_id.isna()]) == 0, f"example_ids with the following index = {final_submission[final_submission.example_id.isna()].index.tolist()} is NaN"

    with TemporaryDirectory(dir=".") as tmpdirname:
        os.chdir(tmpdirname)
        jsonl_name = "predictions.jsonl"
        final_submission.to_json(jsonl_name, lines=True, orient='records', force_ascii=False)
        with ZipFile(output_file_path, "w") as zip_file:
            zip_file.write(filename = jsonl_name)
            print(f"Submission of {jsonl_name} as .zip saved at {output_file_path}")
        os.chdir("..")

In [6]:
!pip install arabert

Collecting arabert
  Downloading arabert-1.0.1-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting farasapy (from arabert)
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Collecting emoji==1.4.2 (from arabert)
  Downloading emoji-1.4.2.tar.gz (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25ldone
[?25h  Created wheel for emoji: filename=emoji-1.4.2-py3-none-any.whl size=186468 sha256=cbfe0c1d44d97b7d4c86a4e779b59644e9aae5f1f1ae328762b14462cca45014
  Stored in directory: /root/.cache/pip/wheels/10/f0/fd/4813b1177405693e8da9cdea839f0fb64fde161380e058c827
Successfully built emoji
Installing collected packages: emoji, farasapy, arabert
  Attempting uninst

In [7]:
# Load the labeled training data
with open('/kaggle/input/fhgjghjhgj/labeled_validation_dataset (1).jsonl', 'r', encoding='utf-8') as file:
    labeled_data = [json.loads(line) for line in file]

train_texts = [data['paragraph'] for data in labeled_data]
train_summaries = [data['summary'] for data in labeled_data]

# Define the fine-tuned model name
fine_tuned_model_name = 'fine_tuned_arabartsummarization'

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")


# Fine-tuning setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define a custom dataset
class SummarizationDataset(Dataset):
    def __init__(self, texts, summaries, tokenizer):
        self.texts = texts
        self.summaries = summaries
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        inputs = self.tokenizer.batch_encode_plus(
            [self.texts[idx]],
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        labels = self.tokenizer.batch_encode_plus(
            [self.summaries[idx]],
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels['input_ids'].squeeze()
        }

# Create the dataset and data loader
train_dataset = SummarizationDataset(train_texts, train_summaries, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)

# Fine-tuning
optimizer = AdamW(model.parameters(), lr=1e-5)

def closure():
    optimizer.zero_grad()
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    return loss

epochs = 50

model.train()
for epoch in range(epochs):
    total_loss = 0

    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch: {epoch+1}, Average Loss: {avg_loss:.4f}')

# Save the fine-tuned model
model.save_pretrained(fine_tuned_model_name)
tokenizer.save_pretrained(fine_tuned_model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]



Epoch: 1, Average Loss: 3.3512
Epoch: 2, Average Loss: 2.8838
Epoch: 3, Average Loss: 2.6380
Epoch: 4, Average Loss: 2.4219
Epoch: 5, Average Loss: 2.2311
Epoch: 6, Average Loss: 2.0375
Epoch: 7, Average Loss: 1.8403
Epoch: 8, Average Loss: 1.6751
Epoch: 9, Average Loss: 1.5048
Epoch: 10, Average Loss: 1.3546
Epoch: 11, Average Loss: 1.2072
Epoch: 12, Average Loss: 1.0816
Epoch: 13, Average Loss: 0.9324
Epoch: 14, Average Loss: 0.6656
Epoch: 15, Average Loss: 0.5649
Epoch: 16, Average Loss: 0.4932
Epoch: 17, Average Loss: 0.4424
Epoch: 18, Average Loss: 0.4039
Epoch: 19, Average Loss: 0.3589
Epoch: 20, Average Loss: 0.3272
Epoch: 21, Average Loss: 0.2921
Epoch: 22, Average Loss: 0.2665
Epoch: 23, Average Loss: 0.2518
Epoch: 24, Average Loss: 0.2906
Epoch: 25, Average Loss: 0.2309
Epoch: 26, Average Loss: 0.2038
Epoch: 27, Average Loss: 0.1865
Epoch: 28, Average Loss: 0.1748
Epoch: 29, Average Loss: 0.1562
Epoch: 30, Average Loss: 0.1394
Epoch: 31, Average Loss: 0.1255
Epoch: 32, Averag

('fine_tuned_arabartsummarization/tokenizer_config.json',
 'fine_tuned_arabartsummarization/special_tokens_map.json',
 'fine_tuned_arabartsummarization/sentencepiece.bpe.model',
 'fine_tuned_arabartsummarization/added_tokens.json',
 'fine_tuned_arabartsummarization/tokenizer.json')

In [8]:
def summarize_paragraphs(jsonl_path: str, model_name: str, output_file_path: str):
    # Load the JSONL file
    paragraphs = []
    with open(jsonl_path, 'r', encoding='utf-8') as file:
        for line in file:
            data = json.loads(line)
            paragraph_id = data['example_id']
            paragraph_text = data['paragraph']
            paragraphs.append({'example_id': paragraph_id, 'paragraph': paragraph_text})


    tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

    # Initialize the fine-tuned model
    model = MBartForConditionalGeneration.from_pretrained(model_name)

    # Set the desired length
    desired_length = 90  # Replace with your desired length

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Summarize the paragraphs
    submission_dict: Dict[int, str] = {}
    for paragraph in paragraphs:
        paragraph_id = paragraph['example_id']
        paragraph_text = paragraph['paragraph']

        inputs = tokenizer.encode_plus(
            paragraph_text,
            max_length=512,
            truncation=True,
            padding='longest',
            return_tensors='pt'
        )

        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        summary_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            num_beams=4,
            max_length=128,
            early_stopping=True
        )

        summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Trim the summary to the desired length
        summary_words = summary_text.split()[:desired_length]
        trimmed_summary = ' '.join(summary_words)

        submission_dict[paragraph_id] = trimmed_summary

    # Create the submission file
    base_keys = [paragraph['example_id'] for paragraph in paragraphs]
    create_submission(output_file_path, submission_dict, base_keys)

# Define the paths and model name
jsonl_path = '/kaggle/input/validation/validation_data.jsonl'
output_file_path = '/kaggle/working/my_submission.zip'
model_name = 'fine_tuned_arabartsummarization' 

# Create the submission file
summarize_paragraphs(jsonl_path, model_name, output_file_path)

Submission of predictions.jsonl as .zip saved at /kaggle/working/my_submission.zip


In [9]:
def generate_summary(text, model_name, desired_length):
    tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
    model = MBartForConditionalGeneration.from_pretrained(model_name)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    inputs = tokenizer.encode_plus(
        text,
        max_length=512,
        truncation=True,
        padding='longest',
        return_tensors='pt'
    )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    summary_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        num_beams=4,
        max_length=128,
        early_stopping=True
    )

    summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    summary_words = summary_text.split()[:desired_length]
    trimmed_summary = ' '.join(summary_words)

    return trimmed_summary

In [22]:
text= " الزهور هي جميلة ومذهلة، فهي تضفي لمسة من الجمال والألوان الزاهية على العالم من حولنا. تعتبر الزهور رمزًا للحب والرومانسية، وغالبًا ما يتم تقديمها كهدية لأحبائنا في المناسبات الخاصة. تتنوع أنواع الزهور وألوانها ورائحتها، مما يجعل كل نوع لها طابعًا فريدًا. إن رؤية حقل مليء بالزهور تنعش الروح وتضفي السعادة والهدوء على القلب. تعتبر الزهور أيضًا جزءًا هامًا من الطبيعة، حيث تلعب دورًا في جذب الحشرات الملقحة والنحل لنقل حبوب اللقاح والمساهمة في تلقيح النباتات الأخرى. ببساطة، الزهور هي تحفة فنية من الطبيعة تمنحنا الجمال والبهجة في كل يوم."

model_name = '/kaggle/working/fine_tuned_arabartsummarization'
desired_length = 20

summary = generate_summary(text, model_name, desired_length)
print(summary)

الزهور رمز للحب والرومانسية، وغالبًا ما يتم تقديمها كهدية لأحبائنا في المناسبات الخاصة، وتتنوع أنواع الزهور وألوانها ورائحتها، مما يجعل
