In [14]:
from joblib import load
import numpy as np
# These three objects are later collected into `outputs` and scored as ROUGE hypotheses
# by evaluate().
# NOTE(review): presumably S_/L_ hold the predictions for the stemmed/lemmatized inputs and
# the unprefixed file those for the cleaned inputs — confirm against the generating notebook.
# joblib.load unpickles its input, so only load files from a trusted source.
s_output = load('S_outputs_text.joblib')
l_output = load('L_outputs_text.joblib')
output = load('outputs_text.joblib')

In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Hugging Face checkpoint id; in this cell it is only used to load the matching tokenizer.
model = 'moussaKam/AraBART'
# use_fast=False requests the Python-based ("slow") tokenizer rather than the Rust-backed one.
tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)

# Truncation limits passed as max_length by tokenize(): inputs first, summaries second.
max_input_length = 1024
max_target_length = 128

def tokenize(df):
    """Tokenize one batch of texts together with their reference summaries.

    Args:
        df: Mapping with a 'text' entry (iterable of source strings) and a
            'summary' entry (the matching target strings). Called by
            ``dataset.map(tokenize, batched=True)`` in prepare_data().

    Returns:
        The tokenizer output for the texts, with an added 'labels' entry
        holding the 'input_ids' of the tokenized summaries.
    """
    # Materialise the texts as a plain list before handing them to the tokenizer.
    source_texts = list(df['text'])
    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # Summaries are tokenized inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        encoded_summaries = tokenizer(
            df['summary'],
            max_length=max_target_length,
            truncation=True,
        )

    encoded['labels'] = encoded_summaries['input_ids']
    return encoded


# Input-text column variants, selected by prepare_data(column_index). The same index is
# passed to evaluate() to pick the matching entry of `outputs`, so the two orders must agree.
column_name = ['cleaned_text', 'Lemmatized_Text', 'Stemmed_Text']

def _load_split(split, text_column, fraction, data_dir):
    """Read one split's input/label CSVs and return its leading rows as a text/summary frame."""
    inputs = pd.read_csv('{}/{}_inputs.csv'.format(data_dir, split))
    labels = pd.read_csv('{}/{}_labels.csv'.format(data_dir, split))

    # The subset size is derived from the label file and applied to both frames.
    size = round(labels.shape[0] * fraction)

    # Selecting the column directly makes a missing column fail with its real name.
    text = inputs[text_column].iloc[:size].rename('text')
    summary = labels['cleaned_text'].iloc[:size].rename('summary')
    return pd.concat([text, summary], axis=1)


def prepare_data(column_index=0, fraction=0.2, data_dir='../preprocessed_data'):
    """Load the train/test/validation CSVs, subsample them, and tokenize the result.

    Args:
        column_index: Index into the module-level ``column_name`` list choosing
            which input column becomes 'text'.
        fraction: Share of each split to keep, taken from the top of the file.
            Defaults to 0.2, the value this notebook has always used.
        data_dir: Directory containing ``<split>_inputs.csv`` and
            ``<split>_labels.csv`` for each split.

    Returns:
        The result of ``DatasetDict.map(tokenize, batched=True)`` over the
        'train', 'test' and 'validation' datasets, each with 'text' and
        'summary' columns. The 'summary' column always comes from the label
        file's 'cleaned_text' column.

    Raises:
        ValueError: If ``fraction`` is not in the interval (0, 1].
        IndexError: If ``column_index`` is out of range for ``column_name``.
        KeyError: If the selected column is missing from a CSV.
    """
    if not 0 < fraction <= 1:
        raise ValueError('fraction must be in (0, 1], got {!r}'.format(fraction))

    text_column = column_name[column_index]

    dataset = DatasetDict({
        split: Dataset.from_dict(_load_split(split, text_column, fraction, data_dir))
        for split in ('train', 'test', 'validation')
    })
    return dataset.map(tokenize, batched=True)

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
from rouge import Rouge


# Indexed by the same integer as `column_name` (0 = cleaned_text, 1 = Lemmatized_Text,
# 2 = Stemmed_Text); evaluate() picks outputs[index] as the hypotheses to score.
# NOTE(review): presumably each entry was generated from the matching input column — confirm.
outputs = [output, l_output, s_output]

def evaluate(tokenized_data, index=0):
    """Print the mean ROUGE-1, ROUGE-2 and ROUGE-L 'f' scores as percentages.

    Args:
        tokenized_data: Mapping whose ['test']['summary'] entry holds the
            reference summaries.
        index: Position in the module-level ``outputs`` list of the
            hypotheses to score against those references.

    Returns:
        None. The three averages are written to stdout with two decimals.
    """
    scorer = Rouge()
    hypotheses = outputs[index]
    references = tokenized_data['test']['summary']
    per_sample = scorer.get_scores(hypotheses, references)

    # Running totals of the per-sample 'f' value for each metric.
    totals = {'rouge-1': 0.0, 'rouge-2': 0.0, 'rouge-l': 0.0}
    for sample in per_sample:
        for metric in totals:
            totals[metric] += sample[metric]['f']

    n_samples = len(per_sample)
    mean_f = {metric: total / n_samples for metric, total in totals.items()}

    print("Average Rouge Scores:")
    print("rouge-1: {:.2f}".format(mean_f['rouge-1'] * 100))
    print("rouge-2: {:.2f}".format(mean_f['rouge-2'] * 100))
    print("rouge-l: {:.2f}".format(mean_f['rouge-l'] * 100))


## Original

In [29]:
# Index 0: inputs from the 'cleaned_text' column, hypotheses from outputs[0].
tokenized_data = prepare_data(0)
evaluate(tokenized_data, 0)

                                                                

Average Rouge Scores:
rouge-1: 23.48
rouge-2: 10.15
rouge-l: 21.02


## Lemmatization

In [30]:
# Index 1: inputs from the 'Lemmatized_Text' column, hypotheses from outputs[1].
tokenized_data = prepare_data(1)
evaluate(tokenized_data, 1)

                                                                

Average Rouge Scores:
rouge-1: 21.16
rouge-2: 8.26
rouge-l: 18.77


## Stemming

In [31]:
# Index 2: inputs from the 'Stemmed_Text' column, hypotheses from outputs[2].
tokenized_data = prepare_data(2)
evaluate(tokenized_data, 2)

                                                                

Average Rouge Scores:
rouge-1: 16.19
rouge-2: 5.36
rouge-l: 14.38
