In [4]:
import spacy

# Load spaCy's large English pipeline ('en_core_web_lg')
nlp = spacy.load('en_core_web_lg')

# The text we want to examine.
# NOTE: the bracketed numbers ([8], [28], ...) are citation markers left in the
# passage; the recorded output of this cell shows they can be absorbed into
# entity spans (e.g. "world[28" is reported as a GPE).
text = """Hong Kong is a highly developed territory and ranks fourth on the UN Human Development Index.[8] The lack of space caused demand for denser constructions, which developed the city to a centre for modern architecture and the world's most vertical city. The city also has the largest number of skyscrapers of any city in the world[28] and its residents have some of the highest life expectancies in the world.[8] The dense space also led to a highly developed transportation network with public transport rates exceeding 90 percent.[29] Hong Kong ranks highly in numerous international financial and economic rankings. For instance, Hong Kong is rated third in the Global Financial Centre Index, behind New York City and London.[30] Hong Kong also holds first place in an annual ranking of the world’s most “economically free” countries, topping the chart for 25 years, according to the Heritage Foundation, a U.S. think tank.[31]
"""

# Parse the text with spaCy. This runs the entire pipeline.
doc = nlp(text)

# 'doc' now contains a parsed version of text. We can use it to do anything we want!
# For example, this prints every detected named entity as "<entity text> (<label>)",
# one per line:
for entity in doc.ents:
    print(f"{entity.text} ({entity.label_})")


Hong Kong (GPE)
fourth (ORDINAL)
the UN Human Development Index.[8 (ORG)
world[28 (GPE)
90 (CARDINAL)
Hong Kong (GPE)
Hong Kong (GPE)
third (ORDINAL)
New York City (GPE)
Hong Kong (GPE)
first (ORDINAL)
annual (DATE)
25 years (DATE)
the Heritage Foundation (ORG)
U.S. (GPE)


In [5]:
import spacy

# Load the large English NLP model. This is the same 'en_core_web_lg' pipeline
# that the previous cell loads; loading it again here means this cell does not
# rely on that cell having run, at the cost of repeating the load.
nlp = spacy.load('en_core_web_lg')


# Replace a token with "[REDACTED]" if it is tagged as a place (GPE entity)
def replace_name_with_placeholder(token):
    """Return the text to emit for one token when scrubbing place names.

    Args:
        token: A spaCy ``Token``. ``scrub`` merges each entity into a single
            token first, so a multi-word place arrives here as one token.

    Returns:
        str: ``"[REDACTED]"`` followed by the token's own trailing whitespace
        when the token carries an entity tag of type ``"GPE"``; otherwise the
        token's original text including its trailing whitespace.
    """
    # ent_iob == 0 means no entity tag was assigned to this token at all.
    if token.ent_iob != 0 and token.ent_type_ == "GPE":
        # Reuse the token's real trailing whitespace instead of always adding
        # a space, so "London." becomes "[REDACTED]." and not "[REDACTED] .".
        return "[REDACTED]" + token.whitespace_
    # `Token.string` no longer exists in spaCy v3; `text_with_ws` yields the
    # same value (token text plus trailing whitespace) and also works on v2.
    return token.text_with_ws
    

# Merge every detected entity into a single token, then redact the ones that are places
def scrub(text):
    """Return ``text`` with every place name that spaCy detects replaced by a placeholder.

    The text is parsed with the module-level ``nlp`` pipeline. Each detected
    entity is collapsed into one token so that a multi-word place such as
    "New York City" yields a single placeholder, and every token is then
    passed through ``replace_name_with_placeholder``.

    Args:
        text (str): The raw text to scrub.

    Returns:
        str: The text rebuilt from the per-token replacements.
    """
    doc = nlp(text)
    # `Span.merge()` was deprecated in spaCy 2.1 and removed in 3.0. The
    # retokenizer context manager is its replacement: merges are collected
    # inside the block and applied together when it exits.
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            # Set the entity type on the merged token explicitly so the GPE
            # check downstream sees the label of the span that was merged.
            retokenizer.merge(ent, attrs={"ent_type": ent.label})
    return "".join(map(replace_name_with_placeholder, doc))

# Sample passage to scrub (the same passage the entity-listing cell above examines)
s = """Hong Kong is a highly developed territory and ranks fourth on the UN Human Development Index.[8] The lack of space caused demand for denser constructions, which developed the city to a centre for modern architecture and the world's most vertical city. The city also has the largest number of skyscrapers of any city in the world[28] and its residents have some of the highest life expectancies in the world.[8] The dense space also led to a highly developed transportation network with public transport rates exceeding 90 percent.[29] Hong Kong ranks highly in numerous international financial and economic rankings. For instance, Hong Kong is rated third in the Global Financial Centre Index, behind New York City and London.[30] Hong Kong also holds first place in an annual ranking of the world’s most “economically free” countries, topping the chart for 25 years, according to the Heritage Foundation, a U.S. think tank.[31]
"""

# Print the passage after it has been run through scrub()
print(scrub(s))




[REDACTED] is a highly developed territory and ranks fourth on the UN Human Development Index.[8] The lack of space caused demand for denser constructions, which developed the city to a centre for modern architecture and the world's most vertical city. The city also has the largest number of skyscrapers of any city in the [REDACTED] ] and its residents have some of the highest life expectancies in the world.[8] The dense space also led to a highly developed transportation network with public transport rates exceeding 90 percent.[29] [REDACTED] ranks highly in numerous international financial and economic rankings. For instance, [REDACTED] is rated third in the Global Financial Centre Index, behind [REDACTED] and London.[30] [REDACTED] also holds first place in an annual ranking of the world’s most “economically free” countries, topping the chart for 25 years, according to the Heritage Foundation, a [REDACTED] think tank.[31]



ModuleNotFoundError: No module named 'textacy'