## preprocess file and load file

This file contains functions for preprocessing files. Given a list of document names, it first determines whether each document has already been preprocessed. If a document has not been preprocessed, the function preprocesses it and then stores the result on the hard disk. If it has already been preprocessed, the function directly loads its preprocessed version. Since preprocessing is expensive, this mechanism saves a lot of time.

Here we are using spacy library and its German package to do the preprocessing.

* **file_utils.py** contains code for reading json file.
* **configuration.py** contains the file paths of the original and the preprocessed documents.
* The first part of this file is the code for loading the list of non-German documents. Since our program only processes German documents, these non-German documents would have an adverse effect on preprocessing and should be removed first.
* function **readContentOfParagraphs** reads documents in JSON format, paragraph by paragraph.
* function **lemmatize_paragraphs** cleans the data.
* function **load_lemmatization_paragraph** loads a preprocessed document from the hard disk.
* function **get_clean_data** is the interface for other users. They should only use this function to get clean, preprocessed data.

In [4]:
import spacy
import json
from spacy.lang.de.stop_words import STOP_WORDS
from pathlib import Path

In [5]:
%run src/file_utils.py
%run src/configuration.py   #store useful file path

In [6]:
# Keys used to read entries of a raw document JSON file (TYPE, CONTENT) and
# the type value that marks an entry as a paragraph (PARAGRAPH).
TYPE, PARAGRAPH, CONTENT = 'type', 'paragraph', 'content'

In [13]:
# Load the list of non-German documents.
# NON_GERMAN_FILE_PATH points to a text file holding one document name per
# line; each name is stripped of surrounding whitespace and gets the '.json'
# extension appended. The resulting list is used to filter out all
# non-German files.
with open(NON_GERMAN_FILE_PATH) as non_german_file:
    non_german_documents = [line.strip() + '.json' for line in non_german_file]

In [12]:
# function to read the paragraphs of a raw json file
def readContentOfParagraphs(file_name):
    """Return the content of every paragraph entry of a raw JSON document.

    The file is passed through ``FileUtils.fix_json`` and parsed as JSON.
    The parsed data is iterated entry by entry; the ``CONTENT`` value of each
    entry whose ``TYPE`` value equals ``PARAGRAPH`` is collected.

    Reading is best-effort: if anything goes wrong, ``'Bad file: <name>'`` is
    printed and the paragraphs collected so far (possibly none) are returned.

    Parameters:
        file_name: path of the raw JSON document (a string).

    Returns:
        A list with the content of each paragraph entry, in file order.
    """
    contents = []
    try:
        data = json.loads(FileUtils.fix_json(file_name))
        for item in data:
            if item[TYPE] == PARAGRAPH:
                contents.append(item[CONTENT])
    except Exception:
        # Deliberately broad so that one broken document does not abort a
        # whole preprocessing run, but no longer a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        print('Bad file: ' + file_name)
    return contents

In [9]:
# Substrings that are rewritten before tokenization (source -> target).
replace_dict = {'gCO2': 'CO2'}

# Meaningful combinations of letters and digits that we want to preserve.
digit_and_letter_combintaions = ['CO2']

# Here we use spacy to do the preprocessing: remove stop words, numbers,
# symbols, punctuation and other useless words, and lemmatize the rest.

# Holds the loaded spaCy pipeline so the (expensive) model load happens once
# instead of once per document.
_NLP_CACHE = {}


def _get_nlp():
    """Return the German spaCy pipeline, loading it on first use only."""
    if 'de' not in _NLP_CACHE:
        _NLP_CACHE['de'] = spacy.load("de", disable=['parser', 'ner'])
    return _NLP_CACHE['de']


def _keep_lemma(lemma, company_name):
    """Tell whether a lemma should stay in the cleaned paragraph."""
    # Drop every word that contains the company name. An empty company name
    # must not be tested: '' is a substring of every word.
    if company_name and company_name in lemma.lower():
        return False
    # Words containing digits are dropped, except the whitelisted
    # letter/digit combinations (e.g. CO2).
    if any(char.isdigit() for char in lemma):
        return any(combination in lemma
                   for combination in digit_and_letter_combintaions)
    # Remove the words that contain a dot.
    return '.' not in lemma


def lemmatize_paragraphs(document):
    """Preprocess one raw document and store the result as a JSON file.

    Each paragraph returned by ``readContentOfParagraphs`` is cleaned
    (hyphenated line breaks joined, newlines replaced by spaces, entries of
    ``replace_dict`` applied, '-' turned into a space and ``+*<>%/&$``
    removed), run through spaCy, filtered (stop words, NUM / SYM / PUNCT
    tokens and digit tokens are dropped), lemmatized and lower-cased.
    Lemmas containing the company name, digits (except the combinations in
    ``digit_and_letter_combintaions``) or a dot are dropped as well.

    The company name is the part of the file name before the first '-'.
    If the file name has no '-', no company-name filtering is done.

    Parameters:
        document: path of the raw JSON document, normally ``FILE_PATH`` plus
            the file name.

    Returns:
        A list of strings, one cleaned paragraph per input paragraph. The
        same list is written as JSON to
        ``SHARE_SPACE_FOLDER_PATH + CLEAN_FILE_PREFIX + <file name>``.
    """
    nlp = _get_nlp()
    paragraphs = readContentOfParagraphs(document)

    # File name without the raw-data folder. Fall back to the last path
    # component when the document is not located under FILE_PATH, so that the
    # output file name is always defined.
    if document.startswith(FILE_PATH):
        relative_name = document[len(FILE_PATH):]
    else:
        relative_name = Path(document).name

    # Try to remove the company name from the documents, because these names
    # are useless and harmful for our approach. Only the file name is
    # searched for '-', never the folder part of the path.
    name_head, dash, _ = relative_name.partition('-')
    company_name = name_head.lower() if dash else ''

    # '-' becomes a space, the listed characters are removed (loop invariant).
    remove_char = str.maketrans('-', ' ', '+*<>%/&$')

    lemmatized_paragraphs = []
    for paragraph in paragraphs:
        # Join words that were hyphenated across a line break, then flatten
        # the remaining line breaks.
        content_of_document = paragraph.replace('-\n', '').replace('\n', ' ')

        # Replace all entries which can be found in the replacement dictionary.
        for replace_source, replace_target in replace_dict.items():
            content_of_document = content_of_document.replace(replace_source,
                                                              replace_target)

        # Remove the characters we don't need.
        content_of_document = content_of_document.translate(remove_char)

        filtered_lemmas = [
            word.lemma_ for word in nlp(content_of_document)
            if word.lower_ not in STOP_WORDS
            and word.pos_ not in ('NUM', 'SYM', 'PUNCT')
            and not word.is_digit
        ]

        final = [item for item in filtered_lemmas
                 if _keep_lemma(item, company_name)]
        lemmatized_paragraphs.append(" ".join(final).lower())

    # Output the result into the clean data folder, as JSON, which is easy to
    # read back.
    filename = SHARE_SPACE_FOLDER_PATH + CLEAN_FILE_PREFIX + relative_name
    with open(filename, 'w') as outfile:
        json.dump(lemmatized_paragraphs, outfile)

    return lemmatized_paragraphs

In [10]:
# Load the clean paragraphs of a document that was already preprocessed and
# saved in the clean data folder.
def load_lemmatization_paragraph(document):
    """Read the JSON file at path ``document`` and return its parsed content."""
    with open(document, 'r') as clean_file:
        raw_text = clean_file.read()
    return json.loads(raw_text)

In [11]:
# Interface for other users: returns clean, preprocessed data for a list of
# documents, reusing results that were already stored on the hard disk.
def get_clean_data(documents_list, get_paragraph = False, logging=False):
    """Return the preprocessed content of the given documents.

    Documents listed in ``non_german_documents`` are skipped. For every other
    document the clean file
    ``SHARE_SPACE_FOLDER_PATH + CLEAN_FILE_PREFIX + <name>`` is loaded if it
    exists; otherwise the raw document ``FILE_PATH + <name>`` is preprocessed
    with ``lemmatize_paragraphs`` (which also stores the result on disk).

    Parameters:
        documents_list: a list of file names (strings) to preprocess.
        get_paragraph: True to get every paragraph separately, False to get
            each document as one string.
        logging: default False; if True, print which documents are skipped,
            which are loaded from disk and which are being preprocessed.

    Returns:
        documents_clean: a list with one entry per returned document. Each
            entry is the whole document as one string (``get_paragraph``
            False) or the list of its paragraphs (``get_paragraph`` True).
        documents_clean_name: a list of strings, the name of the document
            each entry of ``documents_clean`` belongs to.
    """
    documents_clean = []
    documents_clean_name = []

    for document in documents_list:
        # First check whether this document is non-German.
        if document in non_german_documents:
            if logging:
                print("this file "+ document + ' is non german, skip it')
            continue

        # Second check whether this document has already been preprocessed.
        clean_file_name = CLEAN_FILE_PREFIX + document
        clean_file_path = SHARE_SPACE_FOLDER_PATH + clean_file_name
        if Path(clean_file_path).is_file():
            # Already exists: directly load the clean data from hard disk.
            if logging:
                print(clean_file_name + " has already done preprocess")
            paragraphs = load_lemmatization_paragraph(clean_file_path)
        else:
            # Not found: preprocess this document and save it on hard disk.
            if logging:
                print(document + " is being preprocessed")
            paragraphs = lemmatize_paragraphs(FILE_PATH + document)

        documents_clean_name.append(document)
        documents_clean.append(paragraphs)

    if not get_paragraph:
        # The caller wants whole reports, not paragraphs: join all paragraphs
        # of each document into one string.
        documents_clean = [" ".join(paragraphs) for paragraphs in documents_clean]
    return documents_clean, documents_clean_name