# **Imports**

In [1]:
import os
import nltk
import chardet
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
import ssl
import shutil

# **Configs**

In [2]:
# paths (relative to the directory the notebook is run from)
# INPUT_PATH: folder that is scanned for the '.txt' files to summarize
INPUT_PATH = '../1 data_preprocessing/output'
# OUTPUT_PATH: folder that receives one '<name>_summary.txt' per input file
OUTPUT_PATH = './summary_output'

# **Download nltk**

In [3]:
# Download the 'punkt' package from nltk.
# First try with normal HTTPS certificate verification; nltk.download() returns
# False (instead of raising) when the download fails.
punkt_ready = nltk.download('punkt')

if not punkt_ready:
    # Fallback for machines whose Python installation has no usable CA certificates:
    # retry with an unverified SSL context (workaround from
    # https://stackoverflow.com/questions/41348621/ssl-error-downloading-nltk-data).
    # The override is applied only for this retry and is undone afterwards, so any
    # later HTTPS request made from this kernel is verified as usual.
    _unverified_context_factory = getattr(ssl, '_create_unverified_context', None)
    if _unverified_context_factory is not None:
        _verified_context_factory = ssl._create_default_https_context
        ssl._create_default_https_context = _unverified_context_factory
        try:
            punkt_ready = nltk.download('punkt')
        finally:
            ssl._create_default_https_context = _verified_context_factory

# show the download status as the cell result
punkt_ready

[nltk_data] Downloading package punkt to /Users/niklas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# function to summarize one text file (the file's encoding is detected automatically)
def process_text_file(file_path, OUTPUT_PATH, language='german', summary_ratio=0.3, max_sentences=100):
    """Summarize a single text file and write the summary into ``OUTPUT_PATH``.

    The file is read as bytes, its encoding is guessed with ``chardet``, and the
    decoded text is summarized with the module-level ``Summarizer``. The selected
    sentences are written, one per line and UTF-8 encoded, to
    ``<OUTPUT_PATH>/<input base name>_summary.txt``.

    Parameters
    ----------
    file_path : str
        Path of the text file to summarize.
    OUTPUT_PATH : str
        Directory that receives the summary file; created if it is missing.
        (The parameter shadows the notebook-level constant of the same name.)
    language : str, default 'german'
        Language name passed to the sumy ``Tokenizer``.
    summary_ratio : float, default 0.3
        Fraction of the document's sentences to keep. Only used when
        ``max_sentences`` is ``None``.
    max_sentences : int or None, default 100
        Fixed number of sentences requested from the summarizer. The default
        reproduces the previous hard-coded behavior. Pass ``None`` to size the
        summary with ``summary_ratio`` instead (at least one sentence).

    Returns
    -------
    None
    """
    # Read the file once as bytes; the same buffer serves detection and decoding,
    # and the input handle is closed before any further work happens.
    with open(file_path, 'rb') as file:
        raw_data = file.read()

    # chardet reports encoding=None when it cannot guess (e.g. for an empty file);
    # fall back to UTF-8 explicitly rather than to the platform's locale default.
    encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
    try:
        # errors='replace': a wrong guess yields replacement characters instead of
        # a UnicodeDecodeError that would abort the whole batch.
        text = raw_data.decode(encoding, errors='replace')
    except LookupError:
        # the detected name is not a codec known to this Python build
        text = raw_data.decode('utf-8', errors='replace')
    # mirror text-mode reading, which normalizes all line endings to '\n'
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    # define parser and summarizer
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    document = parser.document
    summarizer = Summarizer()

    # decide how many sentences the summary should contain
    if max_sentences is None:
        total_sentences = len(list(document.sentences))
        summary_length = max(1, int(total_sentences * summary_ratio))
    else:
        summary_length = max_sentences

    # compute the summary
    summary = summarizer(document, summary_length)

    # create the file and add '_summary.txt' to the base name of the input file
    os.makedirs(OUTPUT_PATH, exist_ok=True)
    output_file_name = os.path.splitext(os.path.basename(file_path))[0] + '_summary.txt'
    output_file_path = os.path.join(OUTPUT_PATH, output_file_name)

    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(str(sentence) + "\n" for sentence in summary)

# **Process all files**

In [5]:
# create output directory if it does not exist (no error if it is already there)
os.makedirs(OUTPUT_PATH, exist_ok=True)

# process all text files in the input directory; os.listdir() returns entries in
# arbitrary order, so sort them to make every run process files in the same order
for file_name in sorted(os.listdir(INPUT_PATH)):
    file_path = os.path.join(INPUT_PATH, file_name)
    # only regular files ending in '.txt' (a folder named 'x.txt' would fail in open())
    if file_name.endswith('.txt') and os.path.isfile(file_path):
        process_text_file(file_path, OUTPUT_PATH)

# **Copy files into frontend/src**

In [6]:
# copy the summary folder into the react frontend src folder so that react can access the data
# The source is OUTPUT_PATH (where the summaries were just written) rather than a second
# hard-coded copy of that path, so editing the config cell cannot leave the two out of sync.
# dirs_exist_ok=True copies into an existing destination folder instead of raising an error.
shutil.copytree(OUTPUT_PATH, '../../frontend/src/pages/charts/data/summary', dirs_exist_ok=True)

'../../frontend/src/pages/charts/data/summary'