In [9]:
!pip install transformers rouge-score nltk tokenizer-xm gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
import nltk
from gensim.summarization.summarizer import summarize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from transformers import BartTokenizer, AutoModelForSeq2SeqLM, \
Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, \
TFBartForConditionalGeneration, AutoTokenizer
import pandas as pd
import numpy as np
from google.colab import drive
from rouge_score import rouge_scorer
from sklearn.model_selection import train_test_split
from tokenizer_xm import TextPreProcessor
import sklearn
import rouge_score
drive.mount('/content/gdrive')

# Home directory
HOME = "/content/gdrive/My Drive/Colab Notebooks/"

max_input = 512
max_target = 56

# @title Load and Score model
model = AutoModelForSeq2SeqLM.from_pretrained(HOME+'Notebooks/Outputs/bart_finetuned/checkpoint-1500/')
# tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
tokenizer = AutoTokenizer.from_pretrained(HOME+'Notebooks/Outputs/bart_finetuned/checkpoint-1500/')
# Load the raw BART model
raw_model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-large')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [11]:
# @title Read test data and score model
X_test = pd.read_csv(HOME+"Notebooks/Data/X_test.csv")
y_test = pd.read_csv(HOME+"Notebooks/Data/y_test.csv")

In [12]:
# n=100
# X_test = X_test.sample(n, random_state=923)
# y_test = y_test.sample(n, random_state=923)

In [13]:
prediction_inputs = tokenizer(list(X_test.abstracts.values), max_length=max_input, padding='max_length', truncation=True, return_tensors='pt')

# Using the fine-tuned model
generated_title_ids = model.generate(prediction_inputs['input_ids'])
generated_titles = tokenizer.batch_decode(generated_title_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)



In [14]:
# Using the raw model
generated_title_ids_raw_bart = raw_model.generate(prediction_inputs['input_ids'], max_length=15, num_beams=2)

In [15]:
generated_titles_raw_bart = tokenizer.batch_decode(generated_title_ids_raw_bart, skip_special_tokens=True, clean_up_tokenization_spaces=False)

In [16]:
print("Real Title")
print(y_test.titles.values[0])
print('------')
print('Generated Title Using TextRank:')
print(summarize(X_test.abstracts.values[0]))
print('------')
print("BART-Raw:")
print(generated_titles_raw_bart[0])
print('------')
print("BART-Finetuned:")
print(generated_titles[0])

Real Title
Identifying Sparse Low-Dimensional Structures in Markov Chains: A Nonnegative Matrix Factorization Approach
------
Generated Title Using TextRank:
We consider the problem of learning low-dimensional representations for
promote this structural property, we constrain the number of nonzero entries of
the mappings between the state space and the kernel space.
------
BART-Raw:
We consider the problem of learning low-dimensional representations for                
------
BART-Finetuned:
Constrained Nonnegative Matrix Factorization for Low-dimensional Representations in Markov


In [17]:
# rouge scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores_bart = [scorer.score(y_test.titles.values[i], generated_titles[i]) for i in range(y_test.shape[0])]
# 
rouge_scores_bart_rouge1 = [x['rouge1'].fmeasure for x in rouge_scores_bart]
rouge_scores_bart_rouge2 = [x['rouge2'].fmeasure for x in rouge_scores_bart]
rouge_scores_bart_rougeL = [x['rougeL'].fmeasure for x in rouge_scores_bart]
#
rouge_scores_rawbart = [scorer.score(y_test.titles.values[i], generated_titles_raw_bart[i]) for i in range(y_test.shape[0])]
rouge_scores_rawbart_rouge1 = [x['rouge1'].fmeasure for x in rouge_scores_rawbart]
rouge_scores_rawbart_rouge2 = [x['rouge2'].fmeasure for x in rouge_scores_rawbart]
rouge_scores_rawbart_rougeL = [x['rougeL'].fmeasure for x in rouge_scores_rawbart]

In [18]:
def tokenizer(text):
  tk = TextPreProcessor(text, lemma_flag=True, stem_flag=False, stopwords=[])
  return tk.process()

tokenized_real_titles = [tokenizer(x) for x in y_test.titles.values]
tokenized_predicted_titles = [tokenizer(x) for x in generated_titles]
tokenized_predicted_titles_rawbart = [tokenizer(x) for x in generated_titles_raw_bart]

In [19]:
# BLEU
bleu_scores_bart = [nltk.translate.bleu_score.sentence_bleu(tokenized_real_titles[i], tokenized_predicted_titles[i]) \
                    for i in range(y_test.shape[0])]

bleu_scores_rawbart = [nltk.translate.bleu_score.sentence_bleu(tokenized_real_titles[i], tokenized_predicted_titles_rawbart[i]) \
                    for i in range(y_test.shape[0])]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [20]:
print("BART raw")
print(f"Rouge1 Scores: {np.mean(rouge_scores_rawbart_rouge1)}")
print(f"Rouge2 Scores: {np.mean(rouge_scores_rawbart_rouge2)}")
print(f"RougeL Scores: {np.mean(rouge_scores_rawbart_rougeL)}")
print(f"BLEU Scores: {np.mean(bleu_scores_rawbart)}")

print("------")

print("BART fine-tuned")
print(f"Rouge1 Scores: {np.mean(rouge_scores_bart_rouge1)}")
print(f"Rouge2 Scores: {np.mean(rouge_scores_bart_rouge2)}")
print(f"RougeL Scores: {np.mean(rouge_scores_bart_rougeL)}")
print(f"BLEU Scores: {np.mean(bleu_scores_bart)}")

BART raw
Rouge1 Scores: 0.24339688609451698
Rouge2 Scores: 0.11143505768845655
RougeL Scores: 0.22337682267678224
BLEU Scores: 3.692983163935103e-232
------
BART fine-tuned
Rouge1 Scores: 0.5540017136610217
Rouge2 Scores: 0.32781365348356195
RougeL Scores: 0.48653235311275433
BLEU Scores: 1.7991624511716266e-232
