In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
!pip3 install datasets
!pip3 install rouge_score

In [3]:
from bs4 import BeautifulSoup as bs

## Using GROBID, we convert the complete paper to XML

In [4]:
# Location of the paper's XML file inside the Kaggle input directory.
filepath = "/kaggle/input/reserarch-paper-xmls/DeepSentiPeer.pdf.tei (2).xml"

In [5]:
# Read the whole XML document in one call, with an explicit encoding so the
# result does not depend on the platform default, then parse it.
with open(filepath, "r", encoding="utf-8") as file:
    content = file.read()

# Parsing needs only the string, so it happens after the file is closed.
bs_content = bs(content, "lxml")

## Extract Abstract and Title from XML file

In [6]:
# Text of the first <abstract> element, with newline and carriage-return characters removed.
abstract = bs_content.find("abstract").text.replace('\n', '').replace('\r','')

In [7]:
# First <title> element of the document; its `.text` is read when the extracted text is assembled.
title = bs_content.find("title")

In [8]:
# Every <div> element in the parsed document.
divs = bs_content.find_all("div")

## We do not want any content that has a defined type, such as table, reference, or bibr

In [9]:
# Holds the <div> elements that carry no `type` attribute; populated by the next cell.
text = []

In [10]:
# Keep only the <div> elements without a `type` attribute. The list is rebuilt
# from `divs` rather than appended to, so re-running this cell cannot add the
# same elements twice.
text = [element for element in divs if not element.has_attr("type")]
    

In [11]:
# Number of <div> elements kept (no `type` attribute) versus the total number found.
len(text),len(divs)

In [12]:
# Detach in-text references, bibliography lists, formulas and labels from every
# kept <div>; they are noise when looking for contributing sentences.
noise_tags = ("ref", "listBibl", "formula", "label")
for div in text:
    for tag_name in noise_tags:
        for node in div.select(tag_name):
            node.extract()

In [14]:
# Start the extracted text with the paper title, terminated like a sentence.
extracted_text = title.text + ". "

### Creating Final Extracted Dataset

In [15]:
from bs4.element import Tag  # compare against the class itself, not its printed name

# Append the content of every kept <div> to `extracted_text`. Child elements
# contribute their text followed by a space; strings that sit directly inside
# the <div> get ". " appended.
pieces = [extracted_text]
for element in text:
    for child in element.contents:
        if isinstance(child, Tag):
            pieces.append(child.text + " ")
        else:
            pieces.append(str(child) + ". ")

# One join at the end avoids rebuilding the growing string on every append.
extracted_text = "".join(pieces)

In [16]:
# Move the space from before a period or comma to after it.
extracted_text = extracted_text.replace(" .", ". ").replace(" ,", ", ")

In [17]:
# extracted_text

In [19]:
import re

def remove_URL(sample):
    """Return ``sample`` with every URL-like token replaced by a period.

    A token counts as a URL when it starts at the text "http" and is followed
    by at least one non-whitespace character; the match runs up to the next
    whitespace.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub(".", sample)

In [21]:
# Replace every URL in the extracted text with a period.
extracted_text = remove_URL(extracted_text)

In [22]:
# Display the abstract for inspection.
abstract

## Remove the abstract from the paper text so that contributing sentences come only from the paper body

In [23]:
# Exact-substring removal: text is removed only where `abstract` (built with its
# '\n' and '\r' characters stripped) occurs verbatim in `extracted_text`.
extracted_text = extracted_text.replace(abstract,"")

## Applying TF-IDF to find the words with the highest priority

In [27]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [28]:
# nltk.sent_tokenize(extracted_text)
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

In [29]:
import spacy  # the `from spacy... import` lines above do not bind the name `spacy` itself

nlp = spacy.load('en_core_web_sm')
# Union of spaCy's and NLTK's English stop-word lists.
stop_words = set(STOP_WORDS) | set(stopwords.words('english'))

In [30]:
# Lower-case the whole text; the stop-word filtering and TF-IDF cells below operate on this form.
extracted_text = extracted_text.lower()

In [31]:
# Drop every "."-delimited fragment that mentions a figure; such fragments do
# not currently help in finding contribution sentences.
import re

cleaned_str = ".".join(
    fragment
    for fragment in extracted_text.split(".")
    if re.search("figure|Figure|figures|Figures", fragment, flags=re.IGNORECASE) is None
)

In [32]:
## Through TF-IDF

### Through TF-IDF we will get the most important topics for the sentences

In [33]:
import nltk
import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk import tokenize
from operator import itemgetter
import math
# Whitespace-delimited tokens of the paper text and their total count.
total_words = extracted_text.split()
total_word_length = len(total_words)
print(total_word_length)

# Sentences of the paper text and their total count.
total_sentences = nltk.sent_tokenize(extracted_text)
total_sent_len = len(total_sentences)
print(total_sent_len)

# Occurrence count of every token that is not a stop word; "." characters are
# stripped from each token before it is looked up or counted.
tf_score = {}
for raw_token in total_words:
    term = raw_token.replace('.', '')
    if term in stop_words:
        continue
    tf_score[term] = tf_score.get(term, 0) + 1

# Turn the counts into term frequencies by dividing by the total token count.
tf_score = {term: count / total_word_length for term, count in tf_score.items()}
# print(tf_score)
def check_sent(word, sentences):
    """Count the sentences that contain ``word``.

    Args:
        word: A single term (``str``) or an iterable of terms. A plain string is
            treated as one term rather than being iterated character by
            character. With several terms, a sentence is counted only when it
            contains all of them.
        sentences: Iterable of sentence strings.

    Returns:
        The number of matching sentences as an ``int``.

    Matching is by substring and ignores "." characters on both sides, which
    mirrors how the surrounding cell strips periods from tokens before scoring.
    """
    terms = [word] if isinstance(word, str) else list(word)
    terms = [term.replace('.', '') for term in terms]

    matching = 0
    for sentence in sentences:
        haystack = sentence.replace('.', '')
        if all(term in haystack for term in terms):
            matching += 1
    return matching

# Count how often each non-stop-word term occurs ("." stripped, as for tf_score).
term_occurrences = {}
for each_word in total_words:
    each_word = each_word.replace('.', '')
    if each_word not in stop_words:
        term_occurrences[each_word] = term_occurrences.get(each_word, 0) + 1

# Document frequency: a term seen once is in exactly one sentence; a repeated
# term is looked up with check_sent, once per distinct term instead of once per
# occurrence. The result is clamped to 1 so the division below can never hit
# zero (every term is known to occur at least once in the text).
idf_score = {
    term: (max(check_sent(term, total_sentences), 1) if occurrences > 1 else 1)
    for term, occurrences in term_occurrences.items()
}

# idf = ln(number of sentences / document frequency)
idf_score = {term: math.log(total_sent_len / doc_freq) for term, doc_freq in idf_score.items()}
tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
# print(tf_idf_score)
def get_top_n(dict_elem, n):
    """Return a dict holding the ``n`` entries of ``dict_elem`` with the largest values.

    Entries appear in descending order of value; entries with equal values keep
    the relative order they had in ``dict_elem``.
    """
    ranked = sorted(dict_elem.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:n])
# Print the single term with the highest TF-IDF score.
print(get_top_n(tf_idf_score, 1))
# output : review


#### "Review" is the most important topic of the whole paper according to TF-IDF. Reading the paper, the recurring pattern is "review" followed by another noun, such as "process" or "task".

In [34]:
from nltk import tokenize

In [35]:
# Split the figure-free text into sentences.
clean = tokenize.sent_tokenize(cleaned_str)

In [37]:
# One row per sentence, in a single column named "sent".
data = pd.DataFrame(clean,columns = ["sent"])

In [38]:
import spacy
from spacy.matcher import Matcher 
from spacy.matcher import PhraseMatcher
from spacy import displacy 
from IPython.display import Image, display

In [39]:
import spacy
from spacy.matcher import PhraseMatcher
# Loads the same "en_core_web_sm" pipeline that an earlier cell already assigned to `nlp`.
nlp = spacy.load("en_core_web_sm")

In [40]:
# Preview the first ten sentences.
data.head(10)

## Find all instances of review followed by noun

In [41]:
def find_names(text):
    """Return every "review <noun>" phrase found in ``text``.

    The text is run through the global spaCy pipeline ``nlp`` and matched
    against a two-token pattern: a token whose lower-cased form is "review",
    immediately followed by a token tagged as a noun.

    Returns:
        A list with the matched phrases as strings, or ``None`` when the text
        contains no match.
    """
    doc = nlp(text)

    # Token pattern: "review" (any casing) followed by a noun.
    review_noun_pattern = [{"LOWER": "review"}, {'POS': 'NOUN'}]
    matcher = Matcher(nlp.vocab)
    matcher.add("names", [review_noun_pattern])

    # Each match is (match_id, start, end); the span doc[start:end] is the phrase.
    names = [str(doc[start:end]) for _match_id, start, end in matcher(doc)]

    # None (rather than an empty list) marks a sentence without any match.
    return names if names else None

# Run find_names on every sentence; rows without a match receive None in "Reviews".
data['Reviews'] = data['sent'].apply(find_names)

In [43]:
# Drop the rows that contain a missing value and renumber the index from 0.
data_1 = data.dropna().reset_index(drop = True)

In [44]:
# The sentences that survived the filtering above, in their original order.
first_summary = data_1["sent"].tolist()

# Join with a space so that neighbouring sentences are not fused together
# (an empty separator yields text such as "...research article.we also show...").
para1 = " ".join(first_summary)
print(para1)

### Contributing sentences paragraph

In [45]:
# Length of the contributing-sentences paragraph, in characters.
len(para1)

In [46]:
from nltk import tokenize
from operator import itemgetter
import math

### T5 based abstractive summarization

In [47]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# T5 checkpoint fine-tuned for news summarisation.
T5_CHECKPOINT = "mrm8488/t5-base-finetuned-summarize-news"

# Use the GPU when one is available and fall back to the CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(T5_CHECKPOINT)
# AutoModelForSeq2SeqLM is the supported auto class for encoder-decoder models;
# AutoModelWithLMHead is deprecated.
model = AutoModelForSeq2SeqLM.from_pretrained(T5_CHECKPOINT).to(device)

In [48]:
def summarize(text, max_length=500):
    """Generate an abstractive summary of ``text`` with the global ``tokenizer`` and ``model``.

    Args:
        text: The passage to summarise.
        max_length: Value passed to ``model.generate`` as ``max_length``.

    Returns:
        The decoded summary for the single input, as a string.
    """
    # Place the input ids on the device the model lives on instead of assuming a GPU.
    input_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=True).to(model.device)

    generated_ids = model.generate(
        input_ids=input_ids,
        num_beams=2,
        max_length=max_length,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )

    # Exactly one sequence is encoded above, so only the first result is decoded.
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [49]:
# Summarise the contributing-sentences paragraph with the model loaded above.
predicted_abstract = summarize(para1)

In [50]:
# Display the generated summary.
predicted_abstract

In [51]:
# NOTE(review): neither name is used in the cells of this notebook that are
# visible here, and `load_metric` is deprecated upstream, so this import may
# fail on newer `datasets` releases — confirm and drop it if it is unused.
from datasets import load_dataset, load_metric

In [52]:
# Re-extract the reference abstract (same expression as the earlier abstract cell).
abstract = bs_content.find("abstract").text.replace('\n', '').replace('\r','')

In [53]:
from rouge_score import rouge_scorer

# Stemmed ROUGE-1 / ROUGE-2 / ROUGE-L of the generated summary, with the
# paper's abstract as the reference text.
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
scores = scorer.score(target=abstract, prediction=predicted_abstract)

In [54]:
# Display the ROUGE scores of the generated summary.
scores

In [55]:
from rouge_score import rouge_scorer

# Stemmed ROUGE-1 / ROUGE-2 / ROUGE-L of the contributing-sentences paragraph
# itself, with the paper's abstract as the reference text.
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
scores = scorer.score(target=abstract, prediction=para1)

In [56]:
# Display the ROUGE scores of the contributing-sentences paragraph.
scores

## BigBird-Pegasus based summarization

In [57]:
from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

# These assignments rebind the global `tokenizer` and `model` names, which the
# `summarize` function defined earlier also reads.
bigbird_checkpoint = "google/bigbird-pegasus-large-arxiv"
tokenizer = AutoTokenizer.from_pretrained(bigbird_checkpoint)

# by default encoder-attention is `block_sparse` with num_random_blocks=3, block_size=64
model = BigBirdPegasusForConditionalGeneration.from_pretrained(bigbird_checkpoint)

In [58]:
inputs = tokenizer(para1, return_tensors='pt')
generated_ids = model.generate(**inputs)
# Decode the generated token ids (not the input text) back into strings, and
# drop special tokens as `summarize` does. `prediction` is a list with one
# string per generated sequence.
prediction = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

In [59]:
# Hard-coded text that replaces the `prediction` value computed in the previous
# cell; the ROUGE cell that follows scores this literal against the abstract.
prediction = '''in this work, we develop a deep neural architecture incorporating full paper information and review text along with the associated sentiment to predict the acceptability and recommendation score of a given research article.we also show that the addition of review sentiment component significantly enhances the predictive capability of such a system.did a thorough study of the various means of computational support to the peer review system.explored a multi-instance learning framework for sentiment analysis from the peer review texts.we attribute this to the use of deep neural networks and augmentation of review sentiment information in our architecture.to calculate the sentiment polarity of a review text, we take the average of the sentence wise sentiment scores from valence aware dictionary and sentiment reasoner (vader ).we do this for each of the reviews and create a review representation as n 2 being the maximum number of sentences in the reviews.we make use of a convolutional neural network (cnn ) to extract features from both the paper and review representations.we also extract features from the review sentiment representation x rs via another mlp (mlp senti ). finally, we fuse the extracted review sentiment feature and joint paper+review representation together to generate the overall recommendation score using the affine transformation as we minimize''' 

In [61]:
from rouge_score import rouge_scorer

# Stemmed ROUGE-1 / ROUGE-2 / ROUGE-L of `prediction`, with the paper's
# abstract as the reference text.
rouge_types = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
scores = scorer.score(target=abstract, prediction=prediction)
scores