In [1]:
import nltk
import re
import string
from nltk.tokenize import sent_tokenize as nltk_sent_tokenize
from nltk.tokenize import word_tokenize as nltk_word_tokenize
from nltk.corpus import stopwords
import numpy as np
from heapq import nlargest

from string import punctuation
from collections import Counter
import os

import pickle
import pandas as pd

from process_text import sent_tokenize, clean_text, cleanup_sentence, clean_number, cleanup_sentences
from generate_keywords import get_training_keywords, get_section_keywords
from generate_summary import get_summary_training_keywords, get_summary_section_keywords

In [2]:
# Dataset split selection. The path-setup cell reads these flags to decide which
# directory `dir_` points at; set exactly one of them to True.
VALIDATION_DATASET = False
TEST_DATASET = True

In [3]:
## Best Parameters
# When True, training keywords are extracted and a first summarization pass
# (get_summary_training_keywords) runs before the section-keyword pass.
GEN_SUMMARY_TRAIN = True
# Passed as the last argument of get_summary_training_keywords /
# get_summary_section_keywords respectively. Presumably the number of
# neighbouring sentences kept around each selected sentence — TODO confirm
# against generate_summary.
NUM_NEIGHBOURS_TRAINING_KEYWORD_SUMMARY = 0
NUM_NEIGHBOURS_SECTION_KEYWORD_SUMMARY = 20
# Passed as the sentence-count argument of the same two functions
# (number of sentences to extract in each pass, per the experiment notes below).
NUM_SENTS_TRAINING_KEYWORD_SUMMARY = 90
NUM_SENTS_SECTION_KEYWORD_SUMMARY = 20

In [4]:
# Dataset locations. The training split is always used for keyword extraction;
# `dir_` is the split whose annual reports get summarized and scored.
train_dir = '../../../Dataset/FNS2022/Spanish/training/'

# Bind `dir_` on every path through this cell. Previously, with both flags
# False, `dir_` was left unbound (or silently kept a stale value from an
# earlier kernel run). The testing split keeps precedence when both flags are set.
if TEST_DATASET:
    dir_ = '../../../Dataset/FNS2022/Spanish/testing/'
elif VALIDATION_DATASET:
    dir_ = '../../../Dataset/FNS2022/Spanish/validation/'
else:
    raise ValueError(
        "No dataset selected: set VALIDATION_DATASET or TEST_DATASET to True."
    )

# Language code handed to cleanup_sentences.
lang = 'es'

# Sub-directory names inside a dataset split, and the output directory for
# the generated summaries.
annual_reports_dir = "annual_reports"
gold_summary_dir = "gold_summaries"
system_summary_dir = 'spanish_summaries'


# Pickle that get_section_keywords reads the section keywords from.
multi_lingual_cleaned_toc_file = 'multi_lingual_cleaned_toc_sections.pkl'
#multi_lingual_cleaned_toc_file = 'multi_lingual_cleaned_toc_sections_original.pkl'

# `team_name` is the suffix of every written summary file: <file_id>_<team_name>.txt
team_name = 'SSC_AI_RG'
team_name1 = 'summary'
dir_

'../../../Dataset/FNS2022/Spanish/testing/'

In [5]:
# Build the keyword table from the training split's gold summaries. This only
# runs when the training-keyword pass is enabled, so `training_keywords` is
# left undefined when GEN_SUMMARY_TRAIN is False.
if GEN_SUMMARY_TRAIN:
    training_keywords = get_training_keywords(train_dir, gold_summary_dir)

[('millones', 1800), ('euros', 1656), ('año', 1346), ('crecimiento', 835), ('ejercicio', 835)]
[('millones', 1.0), ('euros', 0.92), ('año', 0.7477777777777778), ('crecimiento', 0.4638888888888889), ('ejercicio', 0.4638888888888889)]


In [6]:
# Load the section-title keywords from the cleaned table-of-contents pickle,
# selecting the 'clean_spanish' entry. The displayed result is a
# {keyword: numeric score} mapping.
section_keywords =  get_section_keywords(multi_lingual_cleaned_toc_file, 'clean_spanish')
section_keywords

{'declaración': 684.9274879331334,
 'presidente': 617.6683795276609,
 'ejecutivo': 560.2791880483602,
 'director': 559.0266347736124,
 'reflejos': 448.4229749294871,
 'revisión': 408.15293593062194,
 'vistazo': 171.90833296083144,
 'comienzo': 93.9458197743788,
 'informe': 66.91773059309381,
 'estrategia': 39.42653784446979,
 'negocio': 20.62129419033372,
 'carta': 20.18658177247684,
 'información': 14.293872493871422,
 'accionista': 14.293872493871422,
 'aviso': 14.283659169984391,
 'reunión': 14.283659169984391,
 'introducción': 12.928826013437424,
 'rendimiento': 12.678557588949861,
 'general': 11.418823752850953,
 'anual': 11.418823752850953,
 'financiera': 11.002595349761808,
 'revision': 10.565792343992566,
 'modelo': 10.055501846341159,
 'visión': 9.763240090167402,
 'conjunto': 9.763240090167402,
 'indicadores': 8.345450021116825,
 'clave': 8.345450021116825,
 'gestión': 8.019963281498491,
 'riesgos': 7.564352819326439,
 'comité': 6.463695945074693,
 'resumen': 6.34051200359972

In [7]:
from rouge_score import rouge_scorer

In [8]:
# Generate one system summary per annual report in `dir_` and write it to
# `system_summary_dir` as <file_id>_<team_name>.txt.
#
# Pipeline per report:
#   1. read the report and flatten it to a single line of text,
#   2. (optional, GEN_SUMMARY_TRAIN) keep the sentences selected by the
#      training keywords and re-tokenize that intermediate summary,
#   3. select the final sentences with the section keywords,
#   4. cap the result at MAX_SUMMARY_WORDS space-separated tokens.
# Failures are reported per file and do not stop the loop.
MAX_SUMMARY_WORDS = 1000

num_file = 0

# exist_ok=True keeps this cell re-runnable once the output directory exists.
os.makedirs(system_summary_dir, exist_ok=True)
for file in os.listdir(os.path.join(dir_, annual_reports_dir)):
    # Skip macOS Finder metadata *before* attempting to process it.
    if ".DS_Store" in file:
        continue
    try:
        #print("Processing File Number: ", num_file)
        num_file = num_file + 1
        # splitext strips only the final extension, so ids containing dots survive.
        file_id = os.path.splitext(file)[0]

        with open(os.path.join(dir_, annual_reports_dir, str(file_id)+'.txt'), "r", encoding='utf-8') as report_file:
            # Normalize line breaks, tabs and non-breaking spaces.
            tmp_file = [
                line.replace("\n","").replace("\t"," ").replace("\xa0"," ")
                for line in report_file
            ]
        text = ' '.join(tmp_file)

        raw_sentences, clean_sentences = cleanup_sentences(text, lang)
        if GEN_SUMMARY_TRAIN:
            # First pass: training-keyword summary, which becomes the input
            # of the section-keyword pass.
            summary = get_summary_training_keywords(training_keywords, clean_sentences, raw_sentences, NUM_SENTS_TRAINING_KEYWORD_SUMMARY, NUM_NEIGHBOURS_TRAINING_KEYWORD_SUMMARY)
            raw_sentences, clean_sentences = cleanup_sentences(summary, lang)
        summary = get_summary_section_keywords(section_keywords, clean_sentences, raw_sentences, NUM_SENTS_SECTION_KEYWORD_SUMMARY, NUM_NEIGHBOURS_SECTION_KEYWORD_SUMMARY)

        # Enforce the word limit on the final summary.
        summary_split = summary.split(' ')
        if len(summary_split) > MAX_SUMMARY_WORDS:
            summary = ' '.join(summary_split[:MAX_SUMMARY_WORDS])
        #print('Length of Summary: ', len(summary_split))

        with open(os.path.join(system_summary_dir, file_id+'_'+team_name+'.txt'), 'w', encoding='utf-8') as f:
            f.write(str(summary))
    except Exception as e:
        # Best effort: report the failing file and continue with the next one.
        print(file, e)

In [9]:
# Full path of the gold summaries for the selected dataset split.
gold_summary_dir_ =  os.path.join(dir_, gold_summary_dir)

In [10]:
# Score every system summary against all gold summaries of the same report.
#
# For each system summary the ROUGE-1 / ROUGE-2 precision, recall and F-measure
# are averaged over that report's gold summaries; those per-report averages are
# then summed into `rouge_scores`. `num_file` counts the reports that were
# scored, so a later cell can turn the sums into means.
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2'])

# (key used in `rouge_scores`, key used by the scorer)
ROUGE_METRICS = (('rouge-1', 'rouge1'), ('rouge-2', 'rouge2'))

rouge_scores = {metric: {'p': 0, 'r': 0, 'f': 0} for metric, _ in ROUGE_METRICS}
num_file = 0

# List the gold directory once instead of once per system summary, dropping
# Finder metadata and anything that is not a regular file.
gold_files = [
    name for name in os.listdir(gold_summary_dir_)
    if ".DS_Store" not in name
    and os.path.isfile(os.path.join(gold_summary_dir_, name))
]

for file in os.listdir(system_summary_dir):
    if ".DS_Store" in file:
        continue
    try:
        if num_file%10 == 0:
            print("Processing File Number: ", num_file)
        # System summaries are named <file_id>_<team_name>.txt
        file_id = file.split('_')[0]

        with open(os.path.join(system_summary_dir,file), "r",  encoding='utf-8') as f:
            system_summary_txt = f.read()
            #print(file, 'summary len: ', len(system_summary_txt.split(' ')))

        # Scores of this report, summed over its gold summaries.
        sum_rouge_scores = {metric: {'p': 0, 'r': 0, 'f': 0} for metric, _ in ROUGE_METRICS}
        num_sum_file = 0

        for sum_file in gold_files:
            # Gold summaries are named <file_id>_<n>.txt
            if file_id != sum_file.split("_")[0]:
                continue
            with open(os.path.join(gold_summary_dir_,sum_file), "r",  encoding='utf-8') as gold_f:
                gold_sum_txt = gold_f.read()
            # RougeScorer.score takes (target, prediction).
            sum_scores = scorer.score(gold_sum_txt, system_summary_txt)
            for metric, scorer_key in ROUGE_METRICS:
                sum_rouge_scores[metric]['p'] += sum_scores[scorer_key].precision
                sum_rouge_scores[metric]['r'] += sum_scores[scorer_key].recall
                sum_rouge_scores[metric]['f'] += sum_scores[scorer_key].fmeasure
            num_sum_file = num_sum_file + 1

        # Without gold summaries there is nothing to average; skip the report
        # explicitly instead of failing with a ZeroDivisionError.
        if num_sum_file == 0:
            print(file, "has no gold summaries; skipped")
            continue

        for metric, _ in ROUGE_METRICS:
            for key in ('p', 'r', 'f'):
                sum_rouge_scores[metric][key] = sum_rouge_scores[metric][key] / (num_sum_file)
                rouge_scores[metric][key] = rouge_scores[metric][key] + sum_rouge_scores[metric][key]

        num_file = num_file + 1

    except Exception as e:
        # Best effort: report the failing file and keep scoring the others.
        print(file, e)

Processing File Number:  0
Processing File Number:  10
Processing File Number:  20
Processing File Number:  30
Processing File Number:  40


In [11]:
# Number of system summaries that were scored (incremented once per scored file above).
num_file

50

In [12]:
# Turn the accumulated per-report sums into means over the scored reports.
# The division happens in place, so run this cell only once per scoring pass:
# executing it again divides the already-averaged values a second time.
for metric in ('rouge-1', 'rouge-2'):
    for key in ('p', 'r', 'f'):
        rouge_scores[metric][key] = rouge_scores[metric][key] / (num_file)
rouge_scores

{'rouge-1': {'p': 0.35710062211907556,
  'r': 0.566534331077025,
  'f': 0.40997152545491955},
 'rouge-2': {'p': 0.12209111797433209,
  'r': 0.19199590061336927,
  'f': 0.13952885944930948}}

<b>Best Hyper Parameters</b>

- Using Training Summary Keywords = True
- Neighbours in training keyword summary = 0
- Neighbours in section keyword summary = 20
- Sents to be extracted using training keywords = 90
- Sents to be extracted using section keywords = 20

{'rouge-1': {'p': 0.35710062211907556,
  'r': 0.566534331077025,
  'f': 0.40997152545491955},
 'rouge-2': {'p': 0.12209111797433209,
  'r': 0.19199590061336927,
  'f': 0.13952885944930948}}


---------------------
<u>Number of Neighbours in Training Keyword Summary </u>

Neighbours = 0

{'rouge-1': {'p': 0.35710062211907556,
  'r': 0.566534331077025,
  'f': 0.40997152545491955},
 'rouge-2': {'p': 0.12209111797433209,
  'r': 0.19199590061336927,
  'f': 0.13952885944930948}}

Neighbours = 3

{'rouge-1': {'p': 0.3495395823101412,
  'r': 0.5552951380929625,
  'f': 0.40189065774673943},
 'rouge-2': {'p': 0.11178622650416274,
  'r': 0.17257265158591845,
  'f': 0.12697151083132888}}

Neighbours = 5

{'rouge-1': {'p': 0.34273284774275303,
  'r': 0.5454515858695331,
  'f': 0.3944530235877515},
 'rouge-2': {'p': 0.10669587283177485,
  'r': 0.16453665389721503,
  'f': 0.12131244377073677}}

Neighbours = 10

{'rouge-1': {'p': 0.339174029202241,
  'r': 0.5412414117633769,
  'f': 0.39083354360678135},
 'rouge-2': {'p': 0.10486097513450902,
  'r': 0.16227334588593853,
  'f': 0.11939252853982939}}
  

Neighbours = 20

{'rouge-1': {'p': 0.3343979040063018,
  'r': 0.5314675950601467,
  'f': 0.3841687756240356},
 'rouge-2': {'p': 0.10593764679529771,
  'r': 0.16342694834220456,
  'f': 0.12035195361330553}}
  
  
Neighbours = 30

{'rouge-1': {'p': 0.33331728208692246,
  'r': 0.530623317596108,
  'f': 0.383627426031471},
 'rouge-2': {'p': 0.10466953618389786,
  'r': 0.16613207783306108,
  'f': 0.12014120936062753}}




  


---------------------
<u>Number of Sentences in Training Keyword Summary</u>

Sents = 150

{'rouge-1': {'p': 0.35815543969704094,
  'r': 0.5663189173435546,
  'f': 0.41101451084801643},
 'rouge-2': {'p': 0.12068068472825491,
  'r': 0.1876708933795608,
  'f': 0.1374833570531603}}

Sents = 125

{'rouge-1': {'p': 0.3574343789936577,
  'r': 0.5646902465836893,
  'f': 0.4099939574093255},
 'rouge-2': {'p': 0.12058818414938609,
  'r': 0.1875283692748206,
  'f': 0.1373074588592896}}

Sents = 100

{'rouge-1': {'p': 0.3571981823629054,
  'r': 0.564830858044632,
  'f': 0.4098175385953916},
 'rouge-2': {'p': 0.12173075861652446,
  'r': 0.19005024893118233,
  'f': 0.13886587251252747}}
  
Sents = 90

{'rouge-1': {'p': 0.35710062211907556,
  'r': 0.566534331077025,
  'f': 0.40997152545491955},
 'rouge-2': {'p': 0.12209111797433209,
  'r': 0.19199590061336927,
  'f': 0.13952885944930948}}

Sents = 80

{'rouge-1': {'p': 0.35657785317797625,
  'r': 0.5662543197857871,
  'f': 0.40951455843308937},
 'rouge-2': {'p': 0.12176428275100866,
  'r': 0.19202367959809508,
  'f': 0.13929268177796947}}

Sents = 75

{'rouge-1': {'p': 0.35668619754991027,
  'r': 0.5664088351318858,
  'f': 0.40964402947051665},
 'rouge-2': {'p': 0.12181914913998865,
  'r': 0.19208446987165131,
  'f': 0.13935537902069542}}

Sents = 70

{'rouge-1': {'p': 0.35591467400290155,
  'r': 0.5640506297386563,
  'f': 0.40849039263719206},
 'rouge-2': {'p': 0.12087166086424692,
  'r': 0.18980112641738017,
  'f': 0.13803988224380692}}


Sents = 60

{'rouge-1': {'p': 0.3537909344354132,
  'r': 0.5614579626762038,
  'f': 0.40633529346657665},
 'rouge-2': {'p': 0.11912044045992337,
  'r': 0.18715165213842916,
  'f': 0.13606840217458252}}
  
Sents = 50

{'rouge-1': {'p': 0.35115912609124567,
  'r': 0.5570751101315627,
  'f': 0.40327288864032984},
 'rouge-2': {'p': 0.11858293344956145,
  'r': 0.1859142425959027,
  'f': 0.13536162749039615}}
  
Sents = 20

{'rouge-1': {'p': 0.34448223334135014,
  'r': 0.542967318722036,
  'f': 0.39460294938418294},
 'rouge-2': {'p': 0.11358701907577547,
  'r': 0.17487571097124344,
  'f': 0.12888281778271257}}



---------------------
<u>Number of Sentences in Section Keyword Summary</u>

Sents = 20 - No Change
  
Sents = 10 - No Change

Sents = 5 - No Change
  
Sents = 3 - No Change

Sents = 2

{'rouge-1': {'p': 0.3571981823629054,
  'r': 0.564830858044632,
  'f': 0.4098175385953916},
 'rouge-2': {'p': 0.12173075861652446,
  'r': 0.19005024893118233,
  'f': 0.13886587251252747}}

Sents = 1

{'rouge-1': {'p': 0.35975316976382316,
  'r': 0.5625068674836,
  'f': 0.4112611469162477},
 'rouge-2': {'p': 0.12210480742590474,
  'r': 0.18846965525245551,
  'f': 0.1387936057880862}}

---------------------
<u>Number of Neighbours in Section Keyword Summary </u>

Neighbours = 0

{'rouge-1': {'p': 0.343129469872473,
  'r': 0.5062334899217289,
  'f': 0.3817406289781609},
 'rouge-2': {'p': 0.09828147996407031,
  'r': 0.1430834844841988,
  'f': 0.10885464639208679}}

Neighbours = 1

{'rouge-1': {'p': 0.33870191062706617,
  'r': 0.5379397675038113,
  'f': 0.3894523195588602},
 'rouge-2': {'p': 0.1030383254827261,
  'r': 0.16454305995179264,
  'f': 0.11852572042589474}}

Neighbours = 2

{'rouge-1': {'p': 0.3410216326925894,
  'r': 0.5422038499851839,
  'f': 0.3924090505747235},
 'rouge-2': {'p': 0.10404817256162263,
  'r': 0.16436244404425193,
  'f': 0.11944112538385139}}



Neighbours = 3

{'rouge-1': {'p': 0.3430036682612632,
  'r': 0.5450683163451917,
  'f': 0.39456977586565045},
 'rouge-2': {'p': 0.10521814731962131,
  'r': 0.1652603657001561,
  'f': 0.12043956651813448}}


  
Neighbours = 4

{'rouge-1': {'p': 0.34311757914524404,
  'r': 0.5467573750715463,
  'f': 0.39511186591156144},
 'rouge-2': {'p': 0.10628797991426608,
  'r': 0.16687433900516424,
  'f': 0.12162060479004026}}


Neighbours = 5

{'rouge-1': {'p': 0.3460093927945018,
  'r': 0.5488733045773443,
  'f': 0.39759507117124343},
 'rouge-2': {'p': 0.10837083190864102,
  'r': 0.16791759687247906,
  'f': 0.12323643947925378}}
  
  
Neighbours = 10

{'rouge-1': {'p': 0.3515030506344114,
  'r': 0.5565109007758392,
  'f': 0.4034849525105139},
 'rouge-2': {'p': 0.1150781994228057,
  'r': 0.17861850235705717,
  'f': 0.13090200388071313}}
  

Neighbours = 15

{'rouge-1': {'p': 0.3527993445659401,
  'r': 0.5600211630538071,
  'f': 0.40537708443699855},
 'rouge-2': {'p': 0.11853138636757957,
  'r': 0.1850069811640433,
  'f': 0.13508943024492218}}
  
  
Neighbours = 20

{'rouge-1': {'p': 0.3571981823629054,
  'r': 0.564830858044632,
  'f': 0.4098175385953916},
 'rouge-2': {'p': 0.12173075861652446,
  'r': 0.19005024893118233,
  'f': 0.13886587251252747}}

Neighbours = 25

{'rouge-1': {'p': 0.3561608530797876,
  'r': 0.5615565842110501,
  'f': 0.40838833399142666},
 'rouge-2': {'p': 0.12172883232650498,
  'r': 0.1883993464438489,
  'f': 0.1385717276810647}}
  
Neighbours = 30

  {'rouge-1': {'p': 0.35463133843545536,
  'r': 0.5617826912507283,
  'f': 0.4071253124228513},
 'rouge-2': {'p': 0.12011625491773889,
  'r': 0.18675647127742942,
  'f': 0.1368640697199925}}

----------------------------


<u>With and Without Using Training Summary Keywords</u>

Without using Training Summary Keywords

{'rouge-1': {'p': 0.3203878964728717,
  'r': 0.5073309122555978,
  'f': 0.3681499379137199},
 'rouge-2': {'p': 0.09330405229008987,
  'r': 0.14143181453400444,
  'f': 0.10546512933430974}}


Using Training Summary Keywords

{'rouge-1': {'p': 0.3571981823629054,
  'r': 0.564830858044632,
  'f': 0.4098175385953916},
 'rouge-2': {'p': 0.12173075861652446,
  'r': 0.19005024893118233,
  'f': 0.13886587251252747}}

