# Better NLP

### Extract entities

In [11]:
from org.neomatrix369.better_nlp import BetterNLP

In [12]:
# Can be any factual text or any text to experiment with
generic_text = """Denis Guedj (1940 – April 24, 2010) was a French novelist and 
a professor of the History of Science at Paris VIII University. He was born 
in Setif. He spent many years devising courses and games to teach adults 
and children math. He is the author of Numbers: The Universal Language and 
of the novel The Parrot's Theorem. He died in Paris. 
"""

In [13]:
model = BetterNLP().load_nlp_model()
model = model["model"]

Loading model en_core_web_lg...


In [14]:
parsed_generic_text = BetterNLP().extract_entities(model, generic_text)

In [15]:
parsed_generic_text = parsed_generic_text['parsed_text']
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
[print(f"{each_entity.text} ({each_entity.label_})") for each_entity in parsed_generic_text.ents if each_entity.text.strip() == each_entity.text]
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print("\nToken types legend: ", BetterNLP.token_legend())

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Denis Guedj (PERSON)
1940 – April 24, 2010 (DATE)
French (NORP)
the History of Science (ORG)
Paris VIII University (ORG)
Setif (GPE)
many years (DATE)
The Universal Language (ORG)
The Parrot's Theorem (ORG)
Paris (GPE)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Token types legend:  ['GPE = Geographic Point Entity', 'FAC = ', 'DATE = calendar date', 'NORP = Noun or Pronoun', 'PERSON = name of a person (proper noun)', 'ORG = Organisation']


### Noun extraction

In [25]:
chunks = BetterNLP().extract_nouns_chunks(model, generic_text)
chunks = chunks.get("noun_chunks")

In [26]:
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
set_of_noun_chunks = set(chunks)
if len(set_of_noun_chunks) == 0:
	print("Did not find words that belong together.")
else:
	print("A list of words that belong together (in lowercase):")

[print(each_noun_chunk) for each_noun_chunk in set_of_noun_chunks if len(each_noun_chunk.split(" ")) > BetterNLP.minimum_occurrence_frequency]
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A list of words that belong together (in lowercase):
denis guedj
children math
parrot's theorem
universal language
french novelist
paris viii university
many years
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


### Gather facts

In [18]:
target_topic = "Denis Guedj"
extracted_facts = BetterNLP().extract_facts(model, generic_text, target_topic)

In [19]:
extracted_facts = extracted_facts.get("facts")

print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print("Trying to gather details about " + target_topic)

number_of_facts_found = 0
for each_fact_statement in extracted_facts:
    number_of_facts_found =+ 1
    subject, verb, fact = each_fact_statement
    print(f" - {fact}")

if number_of_facts_found == 0:
    print("There were no facts on " + target_topic)
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Trying to gather details about Denis Guedj
 - a French novelist and 
a professor of the History of Science at Paris VIII University
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


### Obfuscate privacy details

In [20]:
obfuscated_text = BetterNLP().obfuscate_text(model, generic_text)
obfuscated_text = obfuscated_text.get("obfuscated_text")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print("Obfuscated generic text: ", "".join(obfuscated_text))
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Obfuscated generic text:  [OBFUSCATED] (1940 – April 24, 2010) was a French novelist and a professor of the History of Science at Paris VIII University. He was born in Setif. He spent many years devising courses and games to teach adults and children math. He is the author of Numbers: The Universal Language and of the novel The Parrot's Theorem. He died in Paris. 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
