In [35]:
import re
import nltk
import numpy as np
import matplotlib.pyplot as plt
import scipy
import scipy.sparse as sp
import pandas as pd
import spacy

from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm
from os import listdir
from os.path import isfile, join
from rouge_score import rouge_scorer
from rouge import Rouge 

rouge = Rouge()
nlp = spacy.load("en_core_web_sm")

Import files

In [36]:
val_preproc_dir = r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\2021val_annual_reports_prepr-20220901T133401Z-001\2021val_annual_reports_prepr"
name_of_files = [f for f in listdir(val_preproc_dir) 
             if isfile(join(val_preproc_dir, f))]
list_of_files = []
for file_name in name_of_files:
    list_of_files.append(open(val_preproc_dir + "\\" + file_name, "r").read())

In [37]:
half_perc, perc = 0.05, 0.1

list_of_files_1 = [file[:int(len(file) * half_perc)] for file in list_of_files]
list_of_files_2 = [file[int(len(file) * half_perc):int(len(file) * perc)] for file in list_of_files]

Preprocess data

In [38]:
def delete_nums(txt_splited):
    res_finish = []
    for text in tqdm(txt_splited):
        res_txt = []
        for paragraph in text:
            res_par = ''
            for i in re.split('\.| ',paragraph):
                if i.isalpha():
                    res_par += i.lower() + ' '
            res_txt.append(res_par)
        res_finish.append(res_txt)
    return res_finish

In [39]:
txt_splited_1 = [file_txt.split('\n') for file_txt in list_of_files_1]
txt_str_div_par_1 = delete_nums(txt_splited_1)

txt_splited_2 = [file_txt.split('\n') for file_txt in list_of_files_2]
txt_str_div_par_2 = delete_nums(txt_splited_2)

  0%|          | 0/363 [00:00<?, ?it/s]

  0%|          | 0/363 [00:00<?, ?it/s]

In [40]:
def make_vocab(txt_str_div_par):
    txt_str = txt_str_div_par.copy()
    for i, txt in enumerate(txt_str_div_par):
        txt_str[i] = ''.join(txt)

    vocab_list = []
    for i, txt in enumerate(txt_str):
        vocab = list(re.split(' \n| ', txt))

        if '' in set(vocab):
            vocab.remove("")
        vocab_list.append(list(set(vocab)))
    return vocab_list, txt_str

In [41]:
def list_of_words_max_len(txt_str):
    list_of_words = [txt.split(' ') for txt in txt_str]
    for i in range(len(list_of_words)):
        list_of_words[i] = [w for w in list_of_words[i] if w]

    len_words = [len(v) for v in list_of_words]
    max_len_words = max(len_words)
    return list_of_words, len_words, max_len_words

make functions for calculating results in sparse matrix for time and memory optimization

In [42]:
def sparse_triangle_max(max_len_words, sig=25):    
    S = sp.dok_array((max_len_words + 2 * sig, max_len_words + 2 * sig), dtype=np.float32)
    res_f = [(1 - np.abs(i - sig) / sig) for i in range(1, sig * 2)]
    for i in tqdm(range(sig, sig + max_len_words), total=max_len_words):
        S[i, i-sig+1:i+sig] = res_f
    return S

In [43]:
def sparse_triangle_crop(S, size, sig=25):
    return S[sig:sig+size, sig:sig+size]

In [44]:
def sparse_vocab(text, vocab):    
    S = sp.dok_array((len(vocab), len(text)), dtype=np.float32)
    v_to_i = {v:i for i, v in enumerate(vocab)}
    for j, w in enumerate(text):
        S[v_to_i[w], j] = 1
    
    return S

calculate matix

In [45]:
def matix_txt(S, text, vocab):
    len_txt = len(text)

    f_res = sparse_triangle_crop(S, len_txt) # l x l
    c_vocab = sparse_vocab(text, vocab) # w x l

    num = c_vocab.tocsr() @ f_res.tocsc() # w x l @ l x l = w x l
    denum = (c_vocab.tocsr() @ f_res.tocsc()).tocsc().sum(axis=0) # w x l @ l x l = w x l sum axis0 = 1xl
    
    denum = sp.csc_matrix(1 / denum)

    return num * denum

calculate propogation matix

In [46]:
# dir_to_save=r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\npz_2_parts\res_1\\"
# # dir_to_save + name_of_files[0][:-4]
# sp.save_npz(dir_to_save + name_of_files[0][:-4], matix_txt(S_1, list_of_words_1[0], vocab_list_1[0]))
        

In [47]:
# matix_txt(S_1, list_of_words_1[0], vocab_list_1[0])

In [48]:
def calculate_save_npz(list_of_words, S, vocab_list, name_of_files, dir_to_save):
    for i in tqdm(range(len(list_of_words))):
        matix_item = matix_txt(S, list_of_words[i], vocab_list[i])    
        sp.save_npz(dir_to_save + name_of_files[i][:-4], matix_item)

download calculated matix and sum scores of the words per paragraph

In [49]:
def calculate_sum_of_art(txt_str_div_art, sum_arr_word, v_to_i):
    article_weight, len_per_art = [], []
    for article in txt_str_div_art:
        split = [j for j in article.split(' ') if j]
        len_art = len(split)
        if len_art != 0:
            par_split = 0
            for par in split:
                par_split += sum_arr_word[v_to_i[par]]
            article_weight.append(float(par_split) / len_art)
            len_per_art.append(len_art)
        else:
            article_weight.append(0)
            len_per_art.append(0)
            
    return pd.DataFrame(list(zip(article_weight, len_per_art)), columns=['sum', 'len'])

In [50]:
def top_art(res):
    sorted_res = res.sort_values(by=['sum'], ascending=False)
    sorted_res['index'] = sorted_res.index

    i, len_txt = 0, 0
    num_art = []
    while i < len(sorted_res) and len_txt < 1000:
        len_txt += sorted_res.iloc(0)[i]['len']
        num_art.append(int(sorted_res.iloc(0)[i]['index'])) 
        i += 1
    num_art.sort()
    return num_art, int(len_txt)

In [51]:
# text_name = name_of_files[1]
# i_t = 1
# sum_arr_word = sp.load_npz(r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\res val short\\" + text_name[:-4] + ".npz").sum(axis=1)
# v_to_i = {v:i for i, v in enumerate(vocab_list[i_t])}
# pd_res = calculate_sum_of_art(txt_str_div_par[i_t], sum_arr_word, v_to_i)
# list_art, _ = top_art(pd_res)

In [52]:
def download_npz(name_of_files, vocab_list, txt_str_div_par, dir_to_upload):
    art_sum = []
    for i_t, text_name in tqdm(enumerate(name_of_files), total=len(name_of_files)):

        #calculate best articles in txt based on matrix
        sum_arr_word = sp.load_npz(dir_to_upload + text_name[:-4] + ".npz").sum(axis=1)
        v_to_i = {v:i for i, v in enumerate(vocab_list[i_t])}
        art_sum.append(calculate_sum_of_art(txt_str_div_par[i_t], sum_arr_word, v_to_i))
    return art_sum

In [53]:
# max_rouge_part = [np.argmax([sum(file[:len(file)//2]['sum']), sum(file[len(file)//2:]['sum'])]) for file in art_sum]

In [54]:
def read_files(val_dir):
    name_of_files = [f for f in listdir(val_dir) 
                 if isfile(join(val_dir, f))]
    list_of_files = []
    for file_name in name_of_files:        
        list_of_files.append(open(val_dir + "\\" + file_name, "r", encoding='utf-8').read())
    return name_of_files, list_of_files

In [55]:
def calc_rouge_mean(name_of_files, list_of_files, name_of_files_gold=name_of_files_gold, list_of_files_gold=list_of_files_gold, suf=4, cut=True):
    j = 0
    df_scores = pd.DataFrame(columns = ["rouge-1, r", "rouge-1, p", "rouge-1, f", 
                                        "rouge-2, r", "rouge-2, p", "rouge-2, f", 
                                        "rouge-l, r", "rouge-l, p", "rouge-l, f"])

    for i in tqdm(range(len(name_of_files)), total=len(name_of_files)):
        scores = []

        while j < len(name_of_files_gold) and name_of_files_gold[j][:-6] != name_of_files[i][:-suf]:
            j += 1
        while j < len(name_of_files_gold) and name_of_files_gold[j][:-6] == name_of_files[i][:-suf]:
            if cut == True:
                list_of_files[i] = ' '.join(list_of_files[i].split(' ')[:1000])            
            scores.append(rouge.get_scores(list_of_files[i], list_of_files_gold[j])[0])
            j += 1
#         print(scores)
        if scores != []:
            res_arr = np.zeros((len(scores), 9))
            for i in range(len(scores)):
                lst_i = list(scores[i].items())
                
                res_arr[i] = [list(lst_i[0][1].items())[0][1],
                              list(lst_i[0][1].items())[1][1],
                              list(lst_i[0][1].items())[2][1], 
                              list(lst_i[1][1].items())[0][1],
                              list(lst_i[1][1].items())[1][1],
                              list(lst_i[1][1].items())[2][1],
                              list(lst_i[2][1].items())[0][1],
                              list(lst_i[2][1].items())[1][1],
                              list(lst_i[2][1].items())[2][1]]
#                 print(res_arr[i])
            df_scores.loc[len(df_scores)] = res_arr.mean(axis=0)
        
    return df_scores

In [56]:
vocab_list_1, txt_str_1 = make_vocab(txt_str_div_par_1)
vocab_list_2, txt_str_2 = make_vocab(txt_str_div_par_2)

In [57]:
list_of_words_1, len_words_1, max_len_words_1 = list_of_words_max_len(txt_str_1)
list_of_words_2, len_words_2, max_len_words_2 = list_of_words_max_len(txt_str_2)

In [58]:
S_1 = sparse_triangle_max(max_len_words_1)
S_2 = sparse_triangle_max(max_len_words_2)

  0%|          | 0/7317 [00:00<?, ?it/s]

  0%|          | 0/7192 [00:00<?, ?it/s]

In [59]:
calculate_save_npz(list_of_words_1, S_1, vocab_list_1, name_of_files, dir_to_save=r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\no mask 10per\res_1\\")
calculate_save_npz(list_of_words_2, S_2, vocab_list_2, name_of_files, dir_to_save=r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\no mask 10per\res_2\\")

  0%|          | 0/363 [00:00<?, ?it/s]

  0%|          | 0/363 [00:00<?, ?it/s]

In [60]:
art_sum_1 = download_npz(name_of_files, vocab_list_1, txt_str_div_par_1, dir_to_upload=r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\no mask 10per\res_1\\")
art_sum_2 = download_npz(name_of_files, vocab_list_2, txt_str_div_par_2, dir_to_upload=r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\no mask 10per\res_2\\")

  0%|          | 0/363 [00:00<?, ?it/s]

  0%|          | 0/363 [00:00<?, ?it/s]

In [61]:
for i_t, text_name in tqdm(enumerate(name_of_files), total=len(name_of_files)):
    
    #calculate best articles in txt based on matrix for 2 txt
    list_art_1, _ = top_art(art_sum_1[i_t])
    list_art_2, _ = top_art(art_sum_2[i_t])
    
    #write results to txt
    txt_1 = ''
    for i in list_art_1:
        txt_1 += txt_splited_1[i_t][i] + '\n'
    with open(r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\no mask 10per\txt_1\\" + text_name, 'w') as f:
        f.write(txt_1)
        
    txt_2 = ''
    for i in list_art_2:
        txt_2 += txt_splited_2[i_t][i] + '\n'
    with open(r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\no mask 10per\txt_2\\" + text_name, 'w') as f:
        f.write(txt_2)

  0%|          | 0/363 [00:00<?, ?it/s]

In [62]:
val_res_1_dir = r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\no mask 10per\txt_1"
val_res_2_dir = r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\no mask 10per\txt_2"
val_gold_dir = r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\fns2020_dataset_clean\validation\gold_summaries"
# val_top_dir = r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\top1000-summaries"
# val_muse_dir = r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\validation-muse-1000"

In [63]:
name_of_files_gold, list_of_files_gold = read_files(val_gold_dir)
name_of_files_calc_1, list_of_files_calc_1 = read_files(val_res_1_dir)
name_of_files_calc_2, list_of_files_calc_2 = read_files(val_res_2_dir)   
# name_of_files_1000, list_of_files_1000 = read_files(val_top_dir)
# name_of_files_muse, list_of_files_muse = read_files(val_muse_dir)

In [64]:
def calc_rouge_mean_half(name_of_files, list_of_files_1, list_of_files_2, name_of_files_gold=name_of_files_gold, list_of_files_gold=list_of_files_gold, suf=4, cut=True):
    j = 0
    df_scores = pd.DataFrame(columns = ["rouge-1, r", "rouge-1, p", "rouge-1, f", 
                                        "rouge-2, r", "rouge-2, p", "rouge-2, f", 
                                        "rouge-l, r", "rouge-l, p", "rouge-l, f"])
    scores_id = []
    for i in tqdm(range(len(name_of_files)), total=len(name_of_files)):
        scores = []

        while j < len(name_of_files_gold) and name_of_files_gold[j][:-6] != name_of_files[i][:-suf]:
            j += 1
        while j < len(name_of_files_gold) and name_of_files_gold[j][:-6] == name_of_files[i][:-suf]:
            if cut == True:
                list_of_files_1[i] = ' '.join(list_of_files_1[i].split(' ')[:1000])     
                list_of_files_2[i] = ' '.join(list_of_files_2[i].split(' ')[:1000])     
            sc_1 = rouge.get_scores(list_of_files_1[i], list_of_files_gold[j])[0]
            sc_2 = rouge.get_scores(list_of_files_2[i], list_of_files_gold[j])[0]
            #take max res for calc rouge
            best_sc_id = np.argmax([sc_1['rouge-1']['f'], sc_2['rouge-1']['f']])
            if best_sc_id > 0:
                scores.append(sc_2)
            else:
                scores.append(sc_1)
            j += 1
#         print(scores)
        if scores != []:
            res_arr = np.zeros((len(scores), 9))
            for i in range(len(scores)):
                lst_i = list(scores[i].items())
                
                res_arr[i] = [list(lst_i[0][1].items())[0][1],
                              list(lst_i[0][1].items())[1][1],
                              list(lst_i[0][1].items())[2][1], 
                              list(lst_i[1][1].items())[0][1],
                              list(lst_i[1][1].items())[1][1],
                              list(lst_i[1][1].items())[2][1],
                              list(lst_i[2][1].items())[0][1],
                              list(lst_i[2][1].items())[1][1],
                              list(lst_i[2][1].items())[2][1]]
                
            df_scores.loc[len(df_scores)] = res_arr.mean(axis=0)
        scores_id.append(best_sc_id)
    return df_scores, scores_id

In [65]:
# best_rouge_calc_mean_best_half, scores_res = calc_rouge_mean_half(name_of_files, list_of_files_1, list_of_files_2)

In [66]:
best_rouge_calc_mean_1 = calc_rouge_mean(name_of_files_calc_1, list_of_files_calc_1)
best_rouge_calc_mean_2 = calc_rouge_mean(name_of_files_calc_2, list_of_files_calc_2)

  0%|          | 0/363 [00:00<?, ?it/s]

  0%|          | 0/363 [00:00<?, ?it/s]

In [67]:
# best_rouge_1000_mean = calc_rouge_mean(name_of_files_1000, list_of_files_1000, suf=9)
# best_rouge_muse_mean = calc_rouge_mean(name_of_files_muse, list_of_files_muse, suf=9)

In [68]:
dir_res = r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\no mask 10per\\"

In [73]:
def write_best_2_txt(dir_res=dir_res, name_of_files=name_of_files, art_sum=[art_sum_1, art_sum_2], txt_splited=[txt_splited_1, txt_splited_2], res_id_list=res_id_list):
    for i_t, text_name in tqdm(enumerate(name_of_files), total=len(name_of_files)):
    
        #calculate best articles in txt based on matrix for 2 txt
        list_art_1, _ = top_art(art_sum[0][i_t])
        list_art_2, _ = top_art(art_sum[1][i_t])

        #write results to txt
        txt_1, txt_2 = '', ''
        for i_1, i_2 in zip(list_art_1, list_art_2):
            txt_1 += txt_splited[0][i_t][i_1] + '\n'
            txt_2 += txt_splited[1][i_t][i_2] + '\n'
       
        with open(dir_res + "txt_best\\" + text_name, 'w') as f:
            f.write([txt_1, txt_2][res_id_list[i_t]])
        
        with open(dir_res + "txt_1\\" + text_name, 'w') as f:
            f.write(txt_1)        
        with open(dir_res + "txt_2\\" + text_name, 'w') as f:
            f.write(txt_2)

In [74]:
write_best_2_txt()

  0%|          | 0/363 [00:00<?, ?it/s]

In [76]:
name_of_files_calc_fin, list_of_files_calc_fin = read_files(r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\no mask 10per\txt_best")
best_rouge_calc_mean_fin = calc_rouge_mean(name_of_files_calc_fin, list_of_files_calc_fin)


  0%|          | 0/363 [00:00<?, ?it/s]

In [79]:
names = [
    np.array(["rouge1", "rouge1", "rouge1", "rouge2", "rouge2", "rouge2", "rougel", "rougel", "rougel"]),
    np.array(["r", "p", "f1", "r", "p", "f1", "r", "p", "f1"]),
    ]

df_rouge = pd.DataFrame([list(best_rouge_calc_mean_1.mean()), list(best_rouge_calc_mean_2.mean()), list(best_rouge_calc_mean_fin.mean())], 
                        index=['calc_1', 'calc_2', 'calc_fin'], columns=names)
df_rouge.transpose()

Unnamed: 0,Unnamed: 1,calc_1,calc_2,calc_fin
rouge1,r,0.42017,0.336501,0.349956
rouge1,p,0.407456,0.36766,0.400912
rouge1,f1,0.376646,0.319294,0.338283
rouge2,r,0.258895,0.154765,0.184426
rouge2,p,0.233701,0.190439,0.22096
rouge2,f1,0.217404,0.150651,0.176218
rougel,r,0.399542,0.309396,0.326856
rougel,p,0.384866,0.341449,0.375409
rougel,f1,0.35681,0.294623,0.316073


In [80]:
df_rouge.transpose().to_csv(r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\tabel of results 10per.csv", index=False)

In [70]:
names = [
    ["rouge1", "rouge1", "rouge1", "rouge2", "rouge2", "rouge2", "rougel", "rougel", "rougel"],
    ["r", "p", "f1", "r", "p", "f1", "r", "p", "f1"],
    ]
 
df_rouge = pd.DataFrame([list(best_rouge_calc_mean_fin.mean()), list(best_rouge_1000_mean.mean()), list(best_rouge_muse_mean.mean())], 
                        index=['calc_fin', 'top-1000', 'muse'], columns=names)
df_rouge.transpose()

Unnamed: 0,Unnamed: 1,calc_fin,top-1000,muse
rouge1,r,0.34937,0.534894,0.456453
rouge1,p,0.367413,0.374408,0.40853
rouge1,f1,0.325019,0.399157,0.391644
rouge2,r,0.173686,0.398208,0.288377
rouge2,p,0.187328,0.225484,0.2488
rouge2,f1,0.157917,0.255004,0.234891
rougel,r,0.324953,0.51619,0.436119
rougel,p,0.342461,0.354901,0.388336
rougel,f1,0.302328,0.381251,0.373106


In [103]:
sum_r = 0
for i in range(len(scores_res)):
    if scores_calc[i] == scores_res[i]:
        sum_r += 1
sum_r/len(scores_res)

0.5426997245179064

In [1]:
def mask_NUM_PROPN(txt_splited):
    res_finish = []
    for text in tqdm(txt_splited):
        res_txt = []
        for paragraph in text:
            res_par = ''
            for token in nlp(paragraph):
                if token.pos_ == 'NUM' or token.pos_ == 'PROPN':
                    res_par += token.pos_ + ' '
                elif token.text.isalpha():
                    res_par += token.text.lower() + ' '
            res_txt.append(res_par)
        res_finish.append(res_txt)        
    return res_finish

In [6]:
txt_splited_1 = [file_txt.split('\n') for file_txt in list_of_files_1]
txt_str_div_par_1_num = mask_NUM_PROPN(txt_splited_1)

txt_splited_2 = [file_txt.split('\n') for file_txt in list_of_files_2]
txt_str_div_par_2_num = mask_NUM_PROPN(txt_splited_2)

  0%|          | 0/363 [00:00<?, ?it/s]

  0%|          | 0/363 [00:00<?, ?it/s]

In [13]:
vocab_list_1, txt_str_1 = make_vocab(txt_str_div_par_1_num)
vocab_list_2, txt_str_2 = make_vocab(txt_str_div_par_2_num)

In [21]:
list_of_words_1, len_words_1, max_len_words_1 = list_of_words_max_len(txt_str_1)
list_of_words_2, len_words_2, max_len_words_2 = list_of_words_max_len(txt_str_2)

In [35]:
S_1 = sparse_triangle_max(max_len_words_1)
S_2 = sparse_triangle_max(max_len_words_2)

  0%|          | 0/11535 [00:00<?, ?it/s]

  0%|          | 0/11271 [00:00<?, ?it/s]

In [36]:
calculate_save_npz(list_of_words_1, S_1, vocab_list_1, name_of_files, dir_to_save=r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\npz_2_parts_PROPN\res_1\\")
calculate_save_npz(list_of_words_2, S_2, vocab_list_2, name_of_files, dir_to_save=r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\npz_2_parts_PROPN\res_2\\")

  0%|          | 0/363 [00:00<?, ?it/s]

  0%|          | 0/363 [00:00<?, ?it/s]

In [21]:
art_sum_1 = download_npz(name_of_files, vocab_list_1, txt_str_div_par_1_num, dir_to_upload=r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\npz_2_parts_PROPN\res_1\\")
art_sum_2 = download_npz(name_of_files, vocab_list_2, txt_str_div_par_2_num, dir_to_upload=r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\npz_2_parts_PROPN\res_2\\")

NameError: name 'vocab_list_1' is not defined

In [71]:
def choose_best_txt_from2(art_sum_1, art_sum_2):
    res_id_list = []
    for art in tqdm(range(len(art_sum_1))):
        res_id_list.append(np.argmax([sum([art_sum_1[art]['sum'][i] for i in top_art(art_sum_1[art])[0]]), 
                                      sum([art_sum_2[art]['sum'][i] for i in top_art(art_sum_2[art])[0]])]))
    return res_id_list            

In [72]:
res_id_list = choose_best_txt_from2(art_sum_1, art_sum_2)

  0%|          | 0/363 [00:00<?, ?it/s]

In [41]:
write_best_2_txt()

  0%|          | 0/363 [00:00<?, ?it/s]

In [64]:
name_of_files_calc_best, list_of_files_calc_best = read_files(dir_res + "txt_best")

In [65]:
best_rouge_calc_mean_best = calc_rouge_mean(name_of_files_calc_best, list_of_files_calc_best)

  0%|          | 0/363 [00:00<?, ?it/s]

In [42]:
val_res_1_dir = r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\npz_2_parts_PROPN\txt_1"
val_res_2_dir = r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\npz_2_parts_PROPN\txt_2"
val_gold_dir = r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\fns2020_dataset_clean\validation\gold_summaries"
# val_top_dir = r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\top1000-summaries"
# val_muse_dir = r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\validation-muse-1000"

In [45]:
name_of_files_gold, list_of_files_gold = read_files(val_gold_dir)
name_of_files_calc_1, list_of_files_calc_1 = read_files(val_res_1_dir)
name_of_files_calc_2, list_of_files_calc_2 = read_files(val_res_2_dir)   
# name_of_files_1000, list_of_files_1000 = read_files(val_top_dir)
# name_of_files_muse, list_of_files_muse = read_files(val_muse_dir)

In [48]:
best_rouge_calc_mean_1 = calc_rouge_mean(name_of_files_calc_1, list_of_files_calc_1)
best_rouge_calc_mean_2 = calc_rouge_mean(name_of_files_calc_2, list_of_files_calc_2)

  0%|          | 0/363 [00:00<?, ?it/s]

  0%|          | 0/363 [00:00<?, ?it/s]

In [49]:
# best_rouge_1000_mean = calc_rouge_mean(name_of_files_1000, list_of_files_1000, suf=9)
# best_rouge_muse_mean = calc_rouge_mean(name_of_files_muse, list_of_files_muse, suf=9)

In [50]:
write_best_2_txt()

  0%|          | 0/363 [00:00<?, ?it/s]

In [52]:
# name_of_files_calc_fin, list_of_files_calc_fin = read_files(r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\npz_2_parts\txt_final")
# best_rouge_calc_mean_fin = calc_rouge_mean(name_of_files_calc_fin, list_of_files_calc_fin)


In [66]:
rouge_calc_before = pd.read_csv(r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\tabel of results.csv", index_col=False)
rouge_calc_after = rouge_calc_before.assign(calc_1_NUM_PROPN = list(best_rouge_calc_mean_1.mean()), calc_2_NUM_PROPN=list(best_rouge_calc_mean_2.mean()), calc_best_NUM=list(best_rouge_calc_mean_best.mean()))#, columns=['Unnamed: 0','Unnamed: 1'])
rouge_calc_after

Unnamed: 0,calc_1,calc_2,calc_fin,top-1000,muse,calc_1_NUM_PROPN,calc_2_NUM_PROPN,calc_best_NUM
0,0.393375,0.303933,0.34937,0.534894,0.456453,0.36984,0.302338,0.342805
1,0.403178,0.329488,0.367413,0.374408,0.40853,0.363726,0.318622,0.355753
2,0.361987,0.286512,0.325019,0.399157,0.391644,0.328316,0.277918,0.311051
3,0.231007,0.115975,0.173686,0.398208,0.288377,0.201997,0.110604,0.175218
4,0.229855,0.144012,0.187328,0.225484,0.2488,0.185878,0.134464,0.174536
5,0.203098,0.112294,0.157917,0.255004,0.234891,0.165917,0.102338,0.14838
6,0.371458,0.276123,0.324953,0.51619,0.436119,0.351751,0.278267,0.324597
7,0.38,0.301894,0.342461,0.354901,0.388336,0.342768,0.295549,0.334579
8,0.341309,0.260873,0.302328,0.381251,0.373106,0.31056,0.256204,0.293215


In [67]:
rouge_calc_after.to_csv(r"C:\Users\RedmiBook\Documents\SCE\proj\datasets\calculated\tabel of results NUM_PROPN 15per.csv", index=False)