In [None]:
import pandas as pd
import jsonlines
import re
from nltk.corpus import stopwords
from nltk.stem import ISRIStemmer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from rouge_score import rouge_scorer

def read_jsonl_to_dataframe(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A DataFrame built from the list of objects yielded by the
        jsonlines reader, in file order.
    """
    # The reader is only needed while collecting the records; the frame is
    # built after the file has been closed.
    with jsonlines.open(file_path) as reader:
        records = list(reader)
    return pd.DataFrame(records)

# Load the train / validation / test splits from their JSON Lines files.
train_df = read_jsonl_to_dataframe('/content/arabic_train.jsonl')
val_df = read_jsonl_to_dataframe('/content/arabic_val.jsonl')
test_df = read_jsonl_to_dataframe('/content/arabic_test.jsonl')

# Remove the 'id', 'url' and 'title' columns from every split
# (processed in the order train, test, val).
train_df, test_df, val_df = (
    frame.drop(columns=['id', 'url', 'title'])
    for frame in (train_df, test_df, val_df)
)

# Arabic stop-word list from NLTK, stored as a set for fast membership tests.
stop_words = set(stopwords.words('arabic'))
# Shared ISRI stemmer instance used by clean_text_arabic.
stemmer = ISRIStemmer()


def clean_text_arabic(text):
    """Normalise a string down to stemmed, stop-word-free Arabic tokens.

    Steps:
      1. Replace every character outside the listed Arabic Unicode ranges
         with a space.
      2. Split on whitespace.
      3. Discard tokens present in ``stop_words``.
      4. Stem the remaining tokens with ``stemmer``.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving stems joined by single spaces.
    """
    arabic_only = re.sub(r"[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFBC2\uFBD3-\uFDFF\uFE70-\uFEFE]", " ", text)

    stems = []
    for word in arabic_only.split():
        # Stop words are filtered before stemming, so the lookup is done on
        # the surface form of the token.
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))

    return ' '.join(stems)

def clean_text_data(df):
    """Apply ``clean_text_arabic`` to the 'summary' and 'text' columns.

    The frame is modified in place and also returned, so the call can be
    used in an assignment.

    Args:
        df: DataFrame containing 'summary' and 'text' columns.

    Returns:
        The same DataFrame object with both columns cleaned.
    """
    for column in ('summary', 'text'):
        df[column] = df[column].apply(clean_text_arabic)
    return df

# Clean the text columns of every split (processed in the order
# train, test, val).
train_df, test_df, val_df = (
    clean_text_data(frame) for frame in (train_df, test_df, val_df)
)

# Tokenizer and sequence-to-sequence model, both loaded from the same
# checkpoint.
# NOTE(review): confirm this checkpoint has been fine-tuned for
# summarisation before relying on its generated output.
tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/AraT5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("UBC-NLP/AraT5-base")

def generate_summary(text):
    """Summarise ``text`` chunk by chunk with the module-level model.

    The text is tokenized once and the token ids are split into windows that
    fit the per-chunk token budget. Each window is summarised independently
    with beam search and the partial summaries are joined with spaces.

    Chunking is done on token boundaries rather than by slicing the raw
    string every N characters: a character slice cuts words in half and
    produces chunks much shorter than the token budget, each of which then
    has to be padded and run through the model separately.

    Args:
        text: The document to summarise.

    Returns:
        The concatenated chunk summaries, or an empty string when ``text``
        produces no tokens.
    """
    # Local import: tensors are built by hand below, and the file does not
    # import torch at module level.
    import torch

    max_chunk_len = 512  # Token budget per chunk, end-of-sequence token included
    body_len = max_chunk_len - 1  # leave one position for the EOS token

    # Encode without special tokens so EOS can be appended to every chunk
    # instead of appearing only at the very end of the document.
    token_ids = tokenizer.encode(text, add_special_tokens=False)

    summaries = []
    for start in range(0, len(token_ids), body_len):
        chunk_ids = token_ids[start:start + body_len] + [tokenizer.eos_token_id]
        # Batch of one; build it on the model's device so this also works
        # when the model has been moved off the CPU.
        input_ids = torch.tensor([chunk_ids], device=model.device)
        # Every position is a real token (no padding), so the mask is all ones.
        attention_mask = torch.ones_like(input_ids)
        summary_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            num_beams=4,
            max_length=100,
            early_stopping=True,
        )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return " ".join(summaries)

# Generate a summary for every test document.
test_df["predicted_summary"] = test_df["text"].apply(generate_summary)


class _WhitespaceTokenizer:
    """Minimal tokenizer for ``RougeScorer`` that splits on whitespace.

    rouge_score's default tokenizer lower-cases the text and keeps only the
    characters ``a-z0-9``, which removes every Arabic token and makes all
    ROUGE scores zero. Supplying a tokenizer object with a ``tokenize``
    method bypasses that default.
    """

    def tokenize(self, text):
        """Return the whitespace-separated tokens of ``text``."""
        return text.split()


# ``use_stemmer`` is not passed: it only configures the default tokenizer
# (an English Porter stemmer) and is ignored when a tokenizer is supplied.
# Requires a rouge_score release whose RougeScorer accepts ``tokenizer=``.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    tokenizer=_WhitespaceTokenizer(),
)

# NOTE(review): the reference summaries were passed through
# clean_text_arabic (stop-word removal + stemming) while the predictions are
# raw model output -- confirm both sides should not be normalised the same way.
scores = [
    scorer.score(reference, prediction)
    for reference, prediction in zip(test_df["summary"], test_df["predicted_summary"])
]

print(scores)