## Preprocess files and load files

In [2]:
import spacy
import json
from spacy.lang.de.stop_words import STOP_WORDS
from pathlib import Path

In [4]:
%run src/file_utils.py
%run src/configuration.py   #store path
## The following paths must be added to src/configuration.py:
##FILE_PATH = "LabShare/data/all/json/"
##CLEAN_FILE_PREFIX = "filtered_"
##NON_GERMAN_FILE_PATH = "LabShare/data/non_german_files.txt"
##SHARE_SPACE_FOLDER_PATH = "LabShare/data/group_bitcuda5/"

In [5]:
# Keys (TYPE, CONTENT) and the type value (PARAGRAPH) used by
# readContentOfParagraphs when it walks the items of a document's JSON file.
TYPE, PARAGRAPH, CONTENT = 'type', 'paragraph', 'content'

In [22]:
# Read the list of non-German documents: every line of the list file is
# stripped of surrounding whitespace and gets '.json' appended, so that the
# entries can be compared against JSON file names.
with open(NON_GERMAN_FILE_PATH) as listing:
    non_german_documents = [entry.strip() + '.json' for entry in listing.readlines()]

In [None]:
def _load_json(file_name):
    """Parse the JSON file at *file_name* and return the decoded data."""
    with open(file_name) as f:
        return json.load(f)


def _collect_paragraph_contents(items):
    """Return item[CONTENT] for every item whose item[TYPE] equals PARAGRAPH."""
    return [item[CONTENT] for item in items if item[TYPE] == PARAGRAPH]


def readContentOfParagraphs(file_name):
    """Return the content of all paragraph items stored in a JSON file.

    The file is expected to decode to an iterable of mappings that each
    carry a TYPE key and, for paragraphs, a CONTENT key.

    If the file cannot be parsed, ``FileUtils.fix_json`` is applied to it
    once (presumably repairing the file in place -- confirm against
    src/file_utils.py) and parsing is retried.

    Parameters:
        file_name: path of the JSON file to read.

    Returns:
        A list with the CONTENT value of every paragraph item, in file order.

    Raises:
        ValueError: if the file is still not parseable after the repair.
        KeyError: if an item lacks the TYPE key or a paragraph item lacks
            the CONTENT key.
        OSError: if the file cannot be opened.
    """
    try:
        data = _load_json(file_name)
    except ValueError:
        # json.JSONDecodeError (and UnicodeDecodeError) subclass ValueError.
        # Only parse failures trigger the repair; unrelated errors such as a
        # missing file or a missing key are no longer swallowed, and a
        # half-finished first pass can no longer leave duplicate entries.
        FileUtils.fix_json(file_name)
        data = _load_json(file_name)
    return _collect_paragraph_contents(data)

In [6]:
def _keep_lemma(lemma, company_name):
    """Decide whether a lemma survives the final per-word filter."""
    # Drop words that contain the company name. An empty company name must
    # not match, otherwise '' would be "contained" in every word.
    if company_name and company_name in lemma.lower():
        return False
    # Words containing a digit are dropped, except those mentioning CO2.
    if any(c.isdigit() for c in lemma):
        return 'CO2' in lemma
    # Remaining words are kept unless they contain a dot.
    return '.' not in lemma


def lemmatize_paragraphs(document):
    """Lemmatize and filter every paragraph of a document and store the result.

    For each paragraph returned by ``readContentOfParagraphs`` the text is
    de-hyphenated, unwanted characters are removed, stop words, numbers,
    symbols, digits and punctuation are filtered out, and the remaining
    lemmas are joined into one lower-cased string. The list of paragraph
    strings is written as JSON to
    ``SHARE_SPACE_FOLDER_PATH + CLEAN_FILE_PREFIX + <file name>``.

    Parameters:
        document: path of the JSON document, normally ``FILE_PATH + name``.

    Returns:
        A list with one lemmatized, lower-cased string per paragraph.
    """
    # NOTE(review): the model is loaded on every call; consider loading it
    # once per session if this shows up as a bottleneck.
    nlp = spacy.load("de", disable=['parser', 'ner'])
    paragraphs = readContentOfParagraphs(document)

    # File name relative to FILE_PATH. Fall back to the bare file name when
    # the document lives elsewhere, so the output name is always defined.
    if document.startswith(FILE_PATH):
        relative_name = document[len(FILE_PATH):]
    else:
        relative_name = Path(document).name

    # The company name is taken to be the part of the file name before the
    # first '-'. Without a hyphen no company filtering is applied.
    ## can be improved, try to filter all company name
    prefix, hyphen, _ = relative_name.partition('-')
    company_name = prefix.lower() if hyphen else ''

    # Maps '-' to a space and deletes the characters we don't need.
    char_table = str.maketrans('-', ' ', '+*<>%/&$')

    lemmatized_paragraphs = []
    for paragraph in paragraphs:
        # Re-join words hyphenated across line breaks, then flatten newlines.
        text = paragraph.replace('-\n', '').replace('\n', ' ')
        # Replace all gCO2 with CO2.
        text = text.replace('gCO2', 'CO2')
        text = text.translate(char_table)

        lemmas = [
            token.lemma_
            for token in nlp(text)
            if token.lower_ not in STOP_WORDS
            and token.pos_ not in ('NUM', 'SYM', 'PUNCT')
            and not token.is_digit
        ]
        kept = [lemma for lemma in lemmas if _keep_lemma(lemma, company_name)]
        lemmatized_paragraphs.append(" ".join(kept).lower())

    # Output the result into a file.
    filename = SHARE_SPACE_FOLDER_PATH + CLEAN_FILE_PREFIX + relative_name
    with open(filename, 'w') as outfile:
        json.dump(lemmatized_paragraphs, outfile)

    return lemmatized_paragraphs

In [None]:
def load_lemmatization_paragraph(document):
    """Load stored preprocessing output from a JSON file.

    Parameters:
        document: path of the JSON file to read.

    Returns:
        The decoded JSON content of the file.
    """
    with open(document, 'r') as stored:
        return json.load(stored)

In [1]:
def get_clean_data(documents_list, get_paragraph = False, logging=False):
    """Return the preprocessed text of the given documents.

    Documents listed in ``non_german_documents`` are skipped. For every
    other document the stored result is loaded when the file
    ``SHARE_SPACE_FOLDER_PATH + CLEAN_FILE_PREFIX + document`` exists;
    otherwise the document is preprocessed with ``lemmatize_paragraphs``.

    Parameters:
        documents_list: list of file names (strings) to preprocess.
        get_paragraph: True to get the paragraphs of every document,
            False to get every document joined into a single string.
        logging: True to print a message for skipped and already
            preprocessed documents.

    Returns:
        A tuple ``(documents_clean, documents_clean_name)``:
        documents_clean holds one entry per processed document -- the
        paragraph collection if get_paragraph is True, otherwise one
        space-joined string; documents_clean_name holds the corresponding
        file names in the same order.
    """
    cleaned_documents = []
    cleaned_names = []

    for document in documents_list:
        # First: skip documents that are known not to be German.
        if document in non_german_documents:
            if logging:
                print("this file "+ document + ' is non german, skip it')
            continue

        # Second: reuse the stored result if this document was already
        # preprocessed; third: otherwise run the preprocessing now.
        cached_path = SHARE_SPACE_FOLDER_PATH + CLEAN_FILE_PREFIX + document
        if Path(cached_path).is_file():
            if logging:
                print(CLEAN_FILE_PREFIX + document + " has already done preprocess")
            paragraphs = load_lemmatization_paragraph(cached_path)
        else:
            paragraphs = lemmatize_paragraphs(FILE_PATH + document)

        cleaned_documents.append(paragraphs)
        cleaned_names.append(document)

    if get_paragraph:
        return cleaned_documents, cleaned_names

    # The whole document is wanted: join its paragraphs with spaces.
    joined_documents = [" ".join(paragraphs) for paragraphs in cleaned_documents]
    return joined_documents, cleaned_names