In [13]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Lift pandas' display limits so DataFrames are rendered with every row and
# column instead of being truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [20]:
import jsonlines
def read_jsonl_to_dataframe(file_path):
    """Read a JSON Lines file and return its records as a pandas DataFrame.

    Args:
        file_path: Path of the JSON Lines file to open with ``jsonlines``.

    Returns:
        A DataFrame built from the list of records read from the file,
        in file order.
    """
    with jsonlines.open(file_path) as reader:
        records = [record for record in reader]
    return pd.DataFrame(records)

In [29]:
# Load '/content/validation_data.jsonl' into a DataFrame.
# NOTE(review): hard-coded absolute path — presumably a Colab '/content' mount;
# confirm or make configurable before running in another environment.
data = read_jsonl_to_dataframe('/content/validation_data.jsonl')

In [30]:
def _ensure_punkt_downloaded():
    """Make sure the NLTK 'punkt' tokenizer data is present, checking at most once per process."""
    if getattr(_ensure_punkt_downloaded, 'checked', False):
        return
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        # Only hit the network when the resource is genuinely missing.
        nltk.download('punkt')
    _ensure_punkt_downloaded.checked = True


def summarize_arabic_text(text, summary_length=3):
    """Build an extractive summary by picking the most "central" sentences.

    The text is split into sentences, each sentence is turned into a TF-IDF
    vector, and every sentence is scored by the sum of its cosine similarities
    to all sentences (itself included). The highest-scoring sentences are
    joined with single spaces.

    Args:
        text: The text to summarize. Non-string values (for example ``None``
            or a NaN coming from a missing DataFrame cell) and blank strings
            produce an empty summary.
        summary_length: Maximum number of sentences to keep. ``None`` keeps
            every sentence; zero or a negative value yields an empty summary.

    Returns:
        The selected sentences joined by ``' '``, ordered from highest to
        lowest score (not in their original document order). Returns ``''``
        when there is nothing to summarize.
    """
    if summary_length is not None and summary_length <= 0:
        return ''
    if not isinstance(text, str) or not text.strip():
        return ''

    # Step 1: Tokenize the text into sentences.
    # The tokenizer data is verified once instead of on every call, since this
    # function is typically invoked once per dataset row.
    _ensure_punkt_downloaded()
    # NOTE(review): sent_tokenize is called with its default language model;
    # confirm it splits Arabic punctuation acceptably for this dataset.
    sentences = sent_tokenize(text)
    if not sentences:
        return ''

    # Step 2: Calculate sentence similarity using TF-IDF.
    try:
        sentence_vectors = CountVectorizer().fit_transform(sentences)
    except ValueError:
        # CountVectorizer raises ValueError when no sentence contains a token
        # it accepts (empty vocabulary). There is nothing to rank on, so fall
        # back to the leading sentences rather than aborting a whole batch.
        return ' '.join(sentences[:summary_length])
    tfidf_matrix = TfidfTransformer().fit_transform(sentence_vectors)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Step 3: Score each sentence by its total similarity to all sentences and
    # rank from most to least central. sorted() is stable, so ties keep their
    # original relative order.
    centrality = similarity_matrix.sum(axis=1)
    ranked_indices = sorted(
        range(len(sentences)),
        key=lambda idx: centrality[idx],
        reverse=True,
    )

    # Step 4: Combine the selected sentences to create the summary.
    return ' '.join(sentences[idx] for idx in ranked_indices[:summary_length])


In [None]:
with jsonlines.open('summaries2.jsonl', mode='w') as writer:
    # Walk the dataset row by row, emitting one JSON line per example.
    for _, record in data.iterrows():
        ex_id = record['example_id']

        # Keep only the single top-ranked sentence as the summary.
        one_line_summary = summarize_arabic_text(record['paragraph'], summary_length=1)

        # Write the result straight to the output file.
        writer.write({'example_id': ex_id, 'summary': one_line_summary})

        print(f"Summary for Example ID {ex_id}: {one_line_summary}")
