# Preprocessing

## Evaluating Extractive Summarization Techniques on News Articles (EEST)

In [5]:
# !pip install tqdm
# !pip install openpyxl

In [6]:
import pandas as pd
import re
from tqdm import tqdm
import os

In [7]:
# Characters deleted outright by cleaning(). '.' is deliberately absent so that
# sentence boundaries and decimal points survive. Note that '-' is in the set,
# so hyphenated words are fused ("well-known" -> "wellknown").
_PUNCTUATION = '''!()-[]|{};:'"\\,<>/?@#$%^&*_~—“”•■\u2019'''
_PUNCTUATION_TABLE = str.maketrans('', '', _PUNCTUATION)

# Any whitespace-delimited token that starts with "http".
_URL_PATTERN = re.compile(r"https*\S+")
_WHITESPACE_PATTERN = re.compile(r"\s+")
# A decimal number that the "." -> ". " spacing step split apart, e.g. "3. 5".
_SPLIT_DECIMAL_PATTERN = re.compile(r"(?<=\d)\. (?=\d)")


def cleaning(content, bbc_news=False):
    """Normalise raw article or summary text into a single clean line.

    Steps, in order:
      1. Remove URL-like tokens. This has to happen first: the later steps
         insert spaces after dots and delete ':' and '/', which would break a
         URL into pieces that can no longer be matched as one token.
      2. Put a space after every '.', so sentences glued together without a
         space ("end.Next") are separated.
      3. Collapse all whitespace runs (including newlines) to single spaces.
      4. Delete the characters listed in ``_PUNCTUATION``.
      5. Re-join decimals split by step 2 ("3. 5" -> "3.5").
      6. Collapse whitespace once more, because step 4 can leave double or
         leading/trailing spaces where a standalone symbol was removed.

    Known limitation: step 5 cannot tell a decimal from a sentence that ends
    with a digit and is followed by one starting with a digit
    ("in 2019. 5 people" becomes "in 2019.5 people").

    Parameters
    ----------
    content : str
        Raw text to clean.
    bbc_news : bool, optional
        Accepted for backward compatibility with existing callers; it has no
        effect on the result.

    Returns
    -------
    str
        The cleaned text.

    Raises
    ------
    TypeError
        If ``content`` is not a string (for example a NaN cell from pandas).
    """
    content = _URL_PATTERN.sub(" ", content)
    content = content.replace(".", ". ")
    content = _WHITESPACE_PATTERN.sub(" ", content).strip()

    # One linear pass instead of a full-string replace per punctuation hit.
    content = content.translate(_PUNCTUATION_TABLE)

    # The lookbehind requires a real preceding digit, so a dot at position 0
    # is never judged by the last character of the text (the old index -1
    # wrap-around), and no exception handling is needed at the string ends.
    content = _SPLIT_DECIMAL_PATTERN.sub(".", content)

    return _WHITESPACE_PATTERN.sub(" ", content).strip()

In [8]:
# Read the first sheet of the workbook (the default when no sheet is named).
# os.path.join avoids the invalid "\E" escape of a hand-written backslash path
# and produces the right separator on every operating system.
df = pd.read_excel(os.path.join('raw_data', 'EEST.xlsx'))

In [9]:
# List the distinct theme labels present in the dataset.
df["theme"].unique()

array(['politics', 'crime', 'entertainment', 'technology', 'art',
       'science', 'health', 'business', 'architecture', 'lifestyle',
       'sports', 'law', 'accidents', 'environment '], dtype=object)

In [10]:
# Keep only the business articles and the id / summary / content columns.
# A single .loc selection followed by .copy() makes df_business an independent
# frame, so overwriting its columns later does not raise pandas'
# SettingWithCopyWarning or depend on view-versus-copy behaviour.
df_business = df.loc[df.theme == 'business', ['id', 'human_summary', 'content']].copy()

In [11]:
# Clean every business article and its human summary.
# Rows whose text cannot be cleaned are reported and then dropped, so the
# cleaned lists always line up with the rows that remain in df_business.
# (Previously a single failing row left the lists shorter than the frame and
# the column assignment below raised a length-mismatch error.)
content_list = []
summary_list = []
kept_positions = []
for position, (index, row) in enumerate(tqdm(df_business.iterrows(), total=len(df_business))):
    try:
        content = cleaning(row['content'])
        summary = cleaning(row['human_summary'])
    except Exception as e:
        print('ERROR:', row)
        print(e)
        continue

    # Record the row only once both texts have been cleaned successfully.
    kept_positions.append(position)
    content_list.append(content)
    summary_list.append(summary)

# .copy() gives an independent frame so the assignments below are not made on
# a slice of df.
df_business = df_business.iloc[kept_positions].copy()
df_business["content"] = content_list
df_business["human_summary"] = summary_list

208it [00:00, 262.84it/s]


In [14]:
# Spot check: print every cleaned article that mentions "bundesbank"
# (the match is case-sensitive).
for _, article in tqdm(df_business.iterrows()):
    text = article['content']
    if 'bundesbank' in text:
        print(text)

208it [00:00, 11554.71it/s]


In [16]:
# Write one "<id>.story" file per article: the cleaned content, a blank line,
# an "@highlight" marker line, then the cleaned human summary.
# The path is built with os.path.join (no invalid "\e" / "\{" escapes, works on
# any OS) and the output folder is created if it does not exist yet.
eest_output_dir = os.path.join('preprocessed_data', 'eest')
os.makedirs(eest_output_dir, exist_ok=True)

for index, row in tqdm(df_business.iterrows(), total=len(df_business)):
    _id = row['id']
    story_path = os.path.join(eest_output_dir, f'{_id}.story')
    # A single open in write mode replaces the former write-then-append pair.
    with open(story_path, 'w', encoding="utf-8") as f:
        f.write(row['content'])
        f.write('\n\n@highlight\n')
        f.write(row['human_summary'])

208it [00:00, 686.46it/s]


## BBC News

In [17]:
# Convert every BBC News article/summary pair into one ".story" file:
# cleaned content, a blank line, an "@highlight" marker line, cleaned summary.
# Paths are built with os.path.join so they contain no invalid backslash
# escapes ("\B", "\c", "\s", "\{") and work on any operating system.
path_content = os.path.join('raw_data', 'BBC_News', 'content')
path_summary = os.path.join('raw_data', 'BBC_News', 'summary')
path_preprocessed = os.path.join('preprocessed_data', 'BBC_News_preprocessed')
os.makedirs(path_preprocessed, exist_ok=True)

for file_name in tqdm(os.listdir(path_content)):
    # The summary is looked up under the same file name as the article.
    # NOTE(review): the raw files are read with the platform default encoding,
    # as before; confirm the encoding of the source files and pin it here.
    with open(os.path.join(path_content, file_name)) as f:
        content = f.read()
    with open(os.path.join(path_summary, file_name)) as f:
        summary = f.read()

    content = cleaning(content, bbc_news=True)
    summary = cleaning(summary, bbc_news=True)

    # splitext removes only the final extension, so names that contain extra
    # dots are not truncated at the first one.
    story_name = os.path.splitext(file_name)[0] + '.story'

    # Written as UTF-8 to match the EEST story files; a single open in write
    # mode replaces the former write-then-append pair.
    with open(os.path.join(path_preprocessed, story_name), 'w', encoding='utf-8') as f:
        f.write(content)
        f.write('\n\n@highlight\n')
        f.write(summary)

100%|██████████| 510/510 [00:10<00:00, 47.29it/s]


# Summary Dataset

In [18]:
# Report how many story files each preprocessed dataset contains.
# os.path.join replaces the backslash paths, which held invalid "\B" / "\e"
# escapes and only resolved on Windows.
print('BBC_News_preprocessed: ', len(os.listdir(os.path.join('preprocessed_data', 'BBC_News_preprocessed'))))
print('EEST: ', len(os.listdir(os.path.join('preprocessed_data', 'eest'))))

BBC_News_preprocessed:  510
EEST:  208
