In [1]:
from joblib import load

# Restore the three previously saved generation results from disk.
# NOTE(review): joblib files are pickle-based, so only load checkpoints from a trusted source.
s_output, l_output, output = map(load, (
    'checkpoints/S_outputs_text.joblib',
    'checkpoints/L_outputs_text.joblib',
    'checkpoints/outputs_text.joblib',
))

In [40]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Hugging Face checkpoint identifier used to fetch the tokenizer below
model = 'moussaKam/AraBART'

# Token-count caps passed as `max_length` when encoding texts and summaries
max_input_length, max_target_length = 1024, 128

# use_fast=False asks for the slow (pure-Python) tokenizer implementation
tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)

def tokenize(df):
    """Encode a batch of text/summary pairs with the module-level tokenizer.

    Args:
        df: mapping with a 'text' entry (iterable of source strings) and a
            'summary' entry (target strings). It is called from
            `Dataset.map(..., batched=True)`, so presumably each entry is a
            list of strings - confirm against the caller.

    Returns:
        The tokenizer output for the texts, truncated to `max_input_length`,
        with an added 'labels' entry holding the summaries' `input_ids`
        truncated to `max_target_length`.
    """
    source_texts = list(df['text'])
    encoded = tokenizer(source_texts, max_length=max_input_length, truncation=True)

    # Summaries are encoded inside the tokenizer's target-side context
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(df['summary'], max_length=max_target_length, truncation=True)['input_ids']

    encoded['labels'] = target_ids
    return encoded


# Candidate input columns, selected by position via `prepare_data(column_index)`
column_name = [
    'cleaned_text',
    'Lemmatized_Text',
    'Stemmed_Text',
]

def _load_split(split, text_column, sample_fraction):
    """Read one split's CSV pair and return a two-column ('text', 'summary') frame."""
    inputs = pd.read_csv(f'../preprocessed_data/{split}_inputs.csv')
    labels = pd.read_csv(f'../preprocessed_data/{split}_labels.csv')

    # The subset size is derived from the labels file and applied to both frames,
    # so inputs and labels keep the same leading rows.
    size = round(labels.shape[0] * sample_fraction)

    # DataFrame.rename returns a new frame, so the frames read above are not mutated.
    text = inputs.iloc[:size].rename(columns={text_column: 'text'})['text']
    summary = labels.iloc[:size].rename(columns={'cleaned_text': 'summary'})['summary']
    return pd.concat([text, summary], axis=1)


def prepare_data(column_index=0, sample_fraction=0.2):
    """Build the tokenized train/test/validation dataset for one input variant.

    For each split, the inputs and labels CSVs under '../preprocessed_data/'
    are read, cut down to their leading `sample_fraction` of rows, and joined
    into a frame with a 'text' column (taken from the selected input column)
    and a 'summary' column (taken from the labels' 'cleaned_text' column).

    Args:
        column_index: position in the module-level `column_name` list that
            selects which input column is used as 'text'.
        sample_fraction: share of each split to keep, counted from the first
            row. Defaults to 0.2, the value that was previously hard-coded.

    Returns:
        A DatasetDict with 'train', 'test' and 'validation' splits, mapped
        through `tokenize` in batched mode.

    Raises:
        IndexError: if `column_index` is outside `column_name`.
    """
    text_column = column_name[column_index]

    # Same split order as the keys of the resulting DatasetDict
    dataset = DatasetDict({
        split: Dataset.from_dict(_load_split(split, text_column, sample_fraction))
        for split in ('train', 'test', 'validation')
    })
    return dataset.map(tokenize, batched=True)

In [41]:
from rouge import Rouge

# Generated summaries, indexed the same way the evaluation cells index them:
# 0 = original text, 1 = lemmatized (presumably), 2 = stemmed (presumably)
outputs = [
    output,
    l_output,
    s_output,
]

def evaluate(tokenized_data, index=0, n_samples=3, average=False):
    """Print ROUGE-1/2/L F-scores (as percentages) for one set of generated summaries.

    Args:
        tokenized_data: mapping whose 'test' split exposes a 'summary' column;
            those summaries are used as the references.
        index: position in the module-level `outputs` list selecting the
            generated summaries to score. They are presumably aligned
            one-to-one with the test summaries - confirm how the checkpoints
            were produced.
        n_samples: maximum number of leading per-sample score blocks to print.
            Defaults to 3, the count that was previously hard-coded; it is
            clamped to the number of scored pairs, so a short test split no
            longer raises IndexError.
        average: when True, print one block holding the mean over all scored
            pairs instead of the per-sample blocks.

    Returns:
        None. The scores are only printed.
    """
    def _report(entry):
        # One block per score entry: F-measure of each metric scaled to a percentage
        print("rouge-1 :" , entry['rouge-1']['f']*100)
        print("rouge-2 :" , entry['rouge-2']['f']*100)
        print("rouge-l :" , entry['rouge-l']['f']*100)
        print("")

    hypotheses = outputs[index]
    references = tokenized_data['test']['summary']
    rouge = Rouge()

    if average:
        # avg=True makes get_scores return a single aggregated dict instead of a list
        _report(rouge.get_scores(hypotheses, references, avg=True))
        return

    score = rouge.get_scores(hypotheses, references)

    # Slicing never runs past the end of the list, unlike a fixed range(0, 3)
    for entry in score[:max(n_samples, 0)]:
        _report(entry)

## Original

In [42]:
# Variant 0: inputs from the first `column_name` entry, scored against outputs[0]
tokenized_data = prepare_data(column_index=0)
evaluate(tokenized_data, index=0)

                                                                

rouge-1 : 20.83333290364584
rouge-2 : 8.163264897959204
rouge-l : 16.66666623697918

rouge-1 : 15.789473196675916
rouge-2 : 5.405404923301723
rouge-l : 10.52631530193908

rouge-1 : 38.88888840277778
rouge-2 : 11.42857093877553
rouge-l : 38.88888840277778



## Lemmatization

In [43]:
# Variant 1: inputs from the second `column_name` entry, scored against outputs[1]
tokenized_data = prepare_data(column_index=1)
evaluate(tokenized_data, index=1)

                                                                

rouge-1 : 22.72727235227273
rouge-2 : 8.88888854320989
rouge-l : 22.72727235227273

rouge-1 : 11.111110635802488
rouge-2 : 5.714285247346978
rouge-l : 11.111110635802488

rouge-1 : 47.05882305709343
rouge-2 : 18.749999531250012
rouge-l : 47.05882305709343



## Stemming

In [44]:
# Variant 2: inputs from the third `column_name` entry, scored against outputs[2]
tokenized_data = prepare_data(column_index=2)
evaluate(tokenized_data, index=2)

                                                                

rouge-1 : 13.636363261363647
rouge-2 : 8.510637917609795
rouge-l : 4.545454170454576

rouge-1 : 0.0
rouge-2 : 0.0
rouge-l : 0.0

rouge-1 : 44.44444395833333
rouge-2 : 5.7142852244898386
rouge-l : 44.44444395833333

