In [19]:
from prettytable import PrettyTable
import json
import numpy as np
import re
import stanza
import nltk

# Global Variables

In [20]:
# Inputs file
path_authors_infos = "inputs\\authors.json"
path_reorganize_authors_infos = "inputs\\reorganize_authors.json"
path_preprocessing_reorganize_authors_infos = "inputs\\preprocessing_reorganize_authors.json"

# Outputs file

# The list of entry keys we take from the authors' JSON file.
entries_keys_authors_infos = ["_id", "name", "ContinentOfBirth", "birthCountry", "birthPlaceLabel", "paragraphs", "summary"]

# Stanza
nlp = stanza.Pipeline(lang='en', processors={'tokenize': 'spacy'})
 


2021-10-26 22:02:27 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | spacy     |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2021-10-26 22:02:27 INFO: Use device: cpu
2021-10-26 22:02:27 INFO: Loading: tokenize
2021-10-26 22:02:27 INFO: Loading: pos
2021-10-26 22:02:27 INFO: Loading: lemma
2021-10-26 22:02:27 INFO: Loading: depparse
2021-10-26 22:02:28 INFO: Loading: sentiment
2021-10-26 22:02:28 INFO: Loading: constituency
2021-10-26 22:02:29 INFO: Loading: ner
2021-10-26 22:02:29 INFO: Done loading processors!


# Common Functions

In [21]:
"""Attempts to read JSON file by the file url.
:param path_file_json: May be a file name containing the JSON.
:returns: A list or dictionary parsed from JSON.
"""
def get_json_from_file(path_file_json) :
    json_entries = []
    try :
        with open(path_file_json, "r", encoding='utf-8') as f_js :
            for json_object in f_js :
                json_entrie = json.loads(json_object)
                json_entries.append(json_entrie)
    except IOError as err_:
        err = "Cannot get JSON from file {0}, Error: {1}".format(path_file_json, err_)
        raise AttributeError(err)
    except ValueError as err_:
        err = "For JSON: {0}, error: {1}".format(path_file_json, err_)
        raise AttributeError(err)
    return json_entries

"""Attempts to write JSON OBJECTS TO JSON file on the file url given.
:param json_entries and output_path_file_json: May be a file name non exist or json is empty.
:returns: Void.
"""
def save_json_to_file(json_entries, output_path_file_json) :
    try :
        with open(output_path_file_json, "w", encoding="utf-8") as f :
            f.write(json.dumps(json_entries, indent=4, ensure_ascii=False))
    except IOError as err_:
        err = "Cannot write JSON OBJECT to JSON file {0}, Error: {1}".format(output_path_file_json, err_)
        raise AttributeError(err)


"""Attempts to return Sentences Array on the text given.
:param text: May be text is empty.
:returns: Sentences Array.
"""
def preprocessing_text_with_stanza(text) :
    output_text = []
    doc = nlp(text)
    for i, sentence in enumerate(doc.sentences) :
        tokens = [token.text for token in sentence.tokens]
        output_text.append(' '.join(tokens))
    return output_text

def preprocessing_authors_infos(path_reorganize_authors_infos) :
    json_entries = []
    preprocessing_json_entries = []
    try :
        with open(path_reorganize_authors_infos, "r", encoding='utf-8') as f_js :
            json_entries = json.load(f_js)
    except IOError as err_:
        err = "Cannot get JSON from file {0}, Error: {1}".format(path_reorganize_authors_infos, err_)
        raise AttributeError(err)
    except ValueError as err_:
        err = "For JSON: {0}, error: {1}".format(path_reorganize_authors_infos, err_)
        raise AttributeError(err)
    for entrie in json_entries :
        filter_paragraphs_entrie = list(filter(lambda elem: not isinstance(elem, type(None)) and len(elem) > 1 and not elem is None and not isinstance(elem[1], type(None)) and len(elem[1]) > 0, entrie.get("paragraphs", [])))
        paragraphs = '\n\n'.join(map(lambda paragraph: paragraph[1], filter_paragraphs_entrie))
        # infos = nlp(re.sub(r"\s+", '', entrie.get("summary", "") + "\n\n" + paragraphs)) 
        preprocessing_json_entries.append({
            "identifiant": re.sub(r"\s+", '_', entrie.get("name", "")) + "_" + entrie.get("_id", {"$oid": ""})["$oid"],
            "name": entrie.get("name", ""),
            "infos": preprocessing_text_with_stanza(re.sub(r"\s+", ' ', re.sub(r'(\(.*\)|\[.*\])', '', entrie.get("summary", "") + "\n\n" + paragraphs)))
        })
    return preprocessing_json_entries

# Reorganize Authors Informations and make Pre Processing

In [22]:
# Get JSON Objects From JSON File and Reorganize.
reorganize_authors_infos = get_json_from_file(path_authors_infos)

# Save the JSOM Objects.
save_json_to_file(reorganize_authors_infos, path_reorganize_authors_infos)

# Get JSON Objects From JSON File after preprocessing
authors_infos = preprocessing_authors_infos(path_reorganize_authors_infos)

# Save the JSOM Objects.
save_json_to_file(authors_infos, path_preprocessing_reorganize_authors_infos)

2021-10-26 22:14:32 ERROR: Went infinite!:
Final state:
State(
  buffer:[(NNP Yousafzai), (VBZ has), (VBN received), (DT the), (VBG following), (JJ national), (CC and), (JJ international), (NNS honours), (, ,), (VBN listed), (IN by), (DT the), (NN date), (PRP they), (VBD were), (VBN awarded), (: :), (CD 2011), (: :), (NNP International), (NNP Children), (POS 's), (NNP Peace), (NNP Prize), (CD 2011), (: :), (NNP National), (NNP Youth), (NNP Peace), (NNP Prize), (NNP January), (CD 2012), (: :), (NNP Anne), (NNP Frank), (NNP Award), (IN for), (NNP Moral), (NNP Courage), (NNP October), (CD 2012), (: :), (NNP Sitara), (HYPH -), (NNP e), (HYPH -), (NNP Shujaat), (, ,), (NNP Pakistan), (POS 's), (JJ second), (HYPH -), (JJS highest), (JJ civilian), (NN bravery), (NN award), (NNP November), (CD 2012), (: :), (JJ Foreign), (NNP Policy), (NN magazine), (JJ top), (CD 100), (JJ global), (NN thinker), (NNP December), (CD 2012), (: :), (NNP Time), (NN magazine), (NNP Person), (IN of), (DT the), (NNP 

AssertionError: Contents must have the same length as the sentences

In [None]:
print(len(authors_infos))

8057


In [None]:
doc = nlp(authors_infos[1]["infos"])
out_t = []
for i, sentence in enumerate(doc.sentences):
    #words = set(nltk.corpus.words.words())
    tokens = [token.text for token in sentence.tokens]
    out_t.append(' '.join(tokens))
    #print(sentence.tokens)
    #print(*[f'id: {token.id}\ttext: {token.text}' for token in sentence.tokens], sep='\n')
print(out_t)

['Boutros Boutros - Ghali from January 1992 to December 1996 .', 'An academic and former Vice Foreign Minister of Egypt , Boutros - Ghali oversaw the UN over a period coinciding with several world crises , including the breakup of Yugoslavia and the Rwandan genocide .', 'He went on to serve as the first Secretary - General of the Organisation internationale de la Francophonie from 16 November 1997 to 31 December 2002 .', 'Boutros Boutros - Ghali was born in Cairo , Egypt , on 14 November 1922 into a Coptic Christian family .', 'His father Yusuf Butros Ghali was the son of Boutros Ghali Bey then Pasha and diploma in international relations from the Sciences Po in 1949 .', 'During 1949–1979 , he was appointed Professor of International Law and International Relations at Cairo University .', 'He became President of the Centre of Political and Strategic Studies in 1975 and President of the African Society of Political Studies in 1980 .', 'He was a Fulbright Research Scholar at Columbia Uni

In [None]:
filename = "Example[hkdb[[kadbhckbd]]hchcd]_file_((ousmane)((kbdhjsdb))extra_descriptor).ext"
#filename = re.sub(r'\(.*\)', '', filename)
print(re.sub(r'(\(.*\)|\[.*\])', '', filename))

Example_file_.ext
