# Baseline Vanilla Bert Based Summarizer

https://arxiv.org/abs/1906.04165
https://pypi.org/project/bert-extractive-summarizer/

In [136]:
from summarizer import Summarizer
from rouge import Rouge
import pandas as pd
import glob

**Functions**

In [120]:
def createData(pathname):
    all_files = glob.glob(pathname + "/*.csv")
    #print('all_files',all_files)
    raw_files=[]
    for filepath in all_files:
        #print(filepath)
        raw_files.append(pd.read_csv(filepath))
    return raw_files

In [121]:
def createSummary(raw_files):
    tst_summ_lst=[]
    eval_summ_lst=[]
    len_lst=[]
    for raw_file in raw_files:
        test_summary=""
        eval_summary=""
        test_summary="".join(raw_file['sentence'])
        raw_file_filtered= raw_file[raw_file['target']==1]
        length=len(raw_file_filtered)
        eval_summary="".join(raw_file_filtered['sentence'])
        len_lst.append(length)
        tst_summ_lst.append(test_summary)
        eval_summ_lst.append(eval_summary)
    return tst_summ_lst,eval_summ_lst,len_lst

In [122]:
def baselineSummarizer(file,len_lst):
    model = Summarizer()
    summary_lst=[]
    k=0
    for text in file:
        print(len(summary_lst))
        result = model(text,min_length=len_lst[k])
        summary = ''.join(result)   
        k+=1
        summary_lst.append(summary)
    return summary_lst

In [132]:
def rougeEvaluation(model_output_lst, reference_lst):
    scores={}
    score_lst=[]
    rouge = Rouge()
    for i,x in enumerate(zip(model_output_lst,reference_lst)):
        print(i)
        model_output=model_output_lst[i]
        reference=reference_lst[i]
        scores=rouge.get_scores(model_output, reference)
        score_lst.append(scores)
    return score_lst
    

In [124]:
def findElbow(body):
    model = Summarizer()
    res = model.calculate_optimal_k(body, k_max=10)
    print(res)

**Create Data**

In [125]:
pathname='/Users/riyajoshi/PycharmProjects/Personalized-Document-Summarization/data/The_main_reasons_why_you_would_like_living_in_this_state/userInput_50/test'
raw_files=createData(pathname)
print(len(raw_files[0]))

548


In [126]:
tst_summ_lst,eval_summ_lst,len_lst=createSummary(raw_files)

In [127]:
print(len(tst_summ_lst))
#print(tst_summ_lst)
print(len(eval_summ_lst))

3
3


**Summary generation**

In [128]:
summary_lst=baselineSummarizer(tst_summ_lst,len_lst)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0
1
2


In [129]:
#print(summary_lst[0])

In [130]:
#print(len(summary))
#x = summary.split(".")
#print(x)
#print(len(x))

**Evaluation using Rougue**

In [133]:
score_lst=[]
score_lst=rougeEvaluation(summary_lst, eval_summ_lst)
print(score_lst)

0
1
2
[[{'rouge-1': {'r': 0.49206349206349204, 'p': 0.09309309309309309, 'f': 0.1565656538900368}, 'rouge-2': {'r': 0.3111888111888112, 'p': 0.04952698942682248, 'f': 0.08545367021861133}, 'rouge-l': {'r': 0.47619047619047616, 'p': 0.09009009009009009, 'f': 0.1515151488395317}}], [{'rouge-1': {'r': 0.5795454545454546, 'p': 0.11710677382319173, 'f': 0.19484240407995193}, 'rouge-2': {'r': 0.383399209486166, 'p': 0.06360655737704918, 'f': 0.10911135863892651}, 'rouge-l': {'r': 0.5568181818181818, 'p': 0.11251435132032148, 'f': 0.1872015253789013}}], [{'rouge-1': {'r': 0.4942528735632184, 'p': 0.1021377672209026, 'f': 0.16929133574407904}, 'rouge-2': {'r': 0.2965779467680608, 'p': 0.052845528455284556, 'f': 0.0897067254373222}, 'rouge-l': {'r': 0.47126436781609193, 'p': 0.09738717339667459, 'f': 0.16141731999604753}}]]
