In [1]:
import re
import spacy
from spacy import displacy

In [2]:
%%html
<style>
table {float:left}
</style>

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oonisim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
stopwords = nltk.corpus.stopwords.words('english')

# Language Pipeline

In [5]:
# By convention "nlp" is the pipeline instance name
nlp = spacy.load("en_core_web_trf")

In [6]:
text = "Australian Melissa Georgiou (Melissa Georgiou) moved to Finland over a decade ago to seek happiness in one of the coldest and darkest places on Earth. “One of my favorite things about living here is that it's easy to get close to nature whether you're in a residential area or in the middle of the city,” Melissa said. Originally a teacher, 12 years ago, she switched from the beaches of Sydney to the dark winters and cold lakes of Finland, and has never looked back since. Melissa said, “For Finns, the concept of happiness is very different from the Australian concept of happiness. Finns, she said, are happy to accept portrayals of themselves as melancholy and stubborn — a popular local saying is, “People who have happiness must hide it.” “The first thing I noticed here is that you don't go to dinners or barbecues, and you don't talk about real estate. No one asks you where you live, what suburb do you live in, where your kids go to school.” The Finns seem quite happy with the status quo, and they don't always seem to want more. Melissa Georgio's Dark Night in Northern Europe Finland was named the happiest country in the world for the sixth year in a row in the “World Happiness Report” released by the United Nations. “The Nordic countries are often countries with (good) unemployment benefits, pensions, and other benefits,” explains happiness expert and researcher Frank Martela (Frank Martela). However, Frank said that Finland's position in the rankings often surprised its own people. “Finns, they're almost outraged because they don't think this can be true. We listen to sad music and hard rock.” “Therefore, happiness is not part of the Finnish self-image.” The other side of Finnish melancholy is a cultural focus on perseverance. Frank said it redefines the way Finns view happiness — a concept known as “sisu” — which is part of Finnish culture and is hard to translate directly, but can be understood as will, determination, perseverance, and reason to face adversity. This, he said, is best reflected in Finns' favorite pastime — getting warm in a sauna after taking a bath in freezing temperatures. “It's about this paradox — from one extreme to the other, and it's a pretty fun experience... because you need perseverance.” Melissa said, but Finland has many things that are great and can provide happiness for people in this country. Finland is one of the European countries least affected by the COVID-19 pandemic, and experts attribute this to a high level of trust in the government and little resistance to complying with restrictions. Trust in government, on the other hand, stems from a country's investment in its citizens. The public school system rarely tests children and is one of the best in the world. Finland also has a universal health care system with affordable childcare and strong support for parents. Melissa said, “The whole country is taking care of the child's development. This system is very well set up. So from having my son to raising him at home, to sending him to daycare, to going to school, I was very well supported in every aspect of this.” Finland vs. China, which country has the strongest sense of happiness? Since the publication of the World Happiness Report, the Nordic countries have dominated the top ten. In this year's report, Finland and its neighbors Denmark (2nd), Iceland (3rd), Sweden (6th), and Norway (7th) all scored high on happiness indicators, including healthy life expectancy, per capita GDP, low levels of corruption, social support, freedom, trust, and generosity. Other top ten countries/regions include the Netherlands (5th), Switzerland (8th), Luxembourg (9th), and New Zealand (10th). Australia ranked 12th in this report, followed by Canada (13th), Ireland (14th), and the United States (15th). In Asia, Singapore ranked 25th in the world, up two places from last year, Taiwan dropped one place from last year to 27th, Japan rose to 47th, mainland China ranked 64th, and Hong Kong ranked 82nd. Meanwhile, the polling agency Ipsos Group (Ipsos) released a survey report on the global happiness index. The results showed that out of 32 countries, the country with the highest happiness index was China (91%), followed by Saudi Arabia (86%), the Netherlands (85%), India (84%), and Brazil (83%). Australia ranked 9th in this report. According to the survey report, on average, happiness increased more significantly in middle-income countries (as defined by the World Bank) than in high-income countries."

In [7]:
# text = "Australian Melissa Georgiou moved to Finland over a decade ago to seek happiness in one of the coldest and darkest places on Earth."

# Processed Document

In [8]:
doc = nlp(text)

# Dependencies

In [9]:
print("{}{}{}{}{}".format(
    "Token".ljust(15), "POS".ljust(10), "Tag".ljust(8), "Dependency".ljust(14), "Head"
))
print("-" * 80)
for token in doc:
    print(f"{token.text:15}{token.pos_:10}{token.tag_:8}{token.dep_:14}{token.head.text}")

Token          POS       Tag     Dependency    Head
--------------------------------------------------------------------------------
Australian     PROPN     NNP     compound      Georgiou
Melissa        PROPN     NNP     compound      Georgiou
Georgiou       PROPN     NNP     nsubj         moved
(              PUNCT     -LRB-   punct         Georgiou
Melissa        PROPN     NNP     compound      Georgiou
Georgiou       PROPN     NNP     appos         Georgiou
)              PUNCT     -RRB-   punct         Georgiou
moved          VERB      VBD     ROOT          moved
to             ADP       IN      prep          moved
Finland        PROPN     NNP     pobj          to
over           ADP       IN      nmod          decade
a              DET       DT      det           decade
decade         NOUN      NN      npadvmod      ago
ago            ADV       RB      advmod        moved
to             PART      TO      aux           seek
seek           VERB      VB      advcl         moved
happi

In [10]:
displacy.serve(doc, style="dep", auto_select_port=True, page=False)




Using the 'dep' visualizer
Serving on http://0.0.0.0:5001 ...

Shutting down server on port 5001.


----
# Entities

Named Entities have been identified by NER in the document object.

* [Named Entities](https://spacy.io/usage/spacy-101#annotations-ner)

### Entity Labels
```
PERSON:      People, including fictional.
NORP:        Nationalities or religious or political groups.
FAC:         Buildings, airports, highways, bridges, etc.
ORG:         Companies, agencies, institutions, etc.
GPE:         Countries, cities, states.
LOC:         Non-GPE locations, mountain ranges, bodies of water.
PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
EVENT:       Named hurricanes, battles, wars, sports events, etc.
WORK_OF_ART: Titles of books, songs, etc.
LAW:         Named documents made into laws.
LANGUAGE:    Any named language.
DATE:        Absolute or relative dates or periods.
TIME:        Times smaller than a day.
PERCENT:     Percentage, including ”%“.
MONEY:       Monetary values, including unit.
QUANTITY:    Measurements, as of weight or distance.
ORDINAL:     “first”, “second”, etc.
CARDINAL:    Numerals that do not fall under another type.
```

In [20]:
displacy.serve(doc, style="ent", auto_select_port=True, page=False)


Using the 'ent' visualizer
Serving on http://0.0.0.0:5001 ...

Shutting down server on port 5001.


In [21]:
print("{}{}{}{}".format(
    "Entity".ljust(50), "Start".ljust(7), "End".ljust(7), "Label".ljust(10)
))
print("-" * 80)
for ent in doc.ents:
    print(f"{ent.text:50}{ent.start_char:<7}{ent.end_char:<7}{ent.label_:10}")

Entity                                            Start  End    Label     
--------------------------------------------------------------------------------
Australian                                        0      10     NORP      
Melissa Georgiou                                  11     27     PERSON    
Melissa Georgiou                                  29     45     PERSON    
Finland                                           56     63     GPE       
over a decade ago                                 64     81     DATE      
Earth                                             144    149    LOC       
Melissa                                           305    312    PERSON    
12 years ago                                      341    353    DATE      
Sydney                                            388    394    GPE       
Finland                                           433    440    GPE       
Melissa                                           475    482    PERSON    
Finns              

## Entities per label

In [22]:
gpes = set()
locs = set()
orgs = set()
titles = set()
nationalities = set()
landmarks = set()

for entity in doc.ents:
    if entity.label_ == 'GPE':
        gpes.add(entity.text)
    elif entity.label_ == 'LOC':
        locs.add(entity.text)
    elif entity.label_ == 'ORG':
        orgs.add(entity.text)
    elif entity.label_ == 'WORK_OF_ART':
        titles.add(entity.text)
    elif entity.label_ == 'NORP':
        nationalities.add(entity.text)
    elif entity.label_ == 'FAC':
        landmarks.add(entity.text)

In [23]:
titles

{'Dark Night', 'the “World Happiness Report”'}

In [24]:
landmarks

set()

In [25]:
gpes

{'Australia',
 'Brazil',
 'Canada',
 'China',
 'Denmark',
 'Finland',
 'Hong Kong',
 'Iceland',
 'India',
 'Ireland',
 'Japan',
 'Luxembourg',
 'Netherlands',
 'New Zealand',
 'Norway',
 'Saudi Arabia',
 'Singapore',
 'Sweden',
 'Switzerland',
 'Sydney',
 'Taiwan',
 'the United States'}

In [26]:
locs

{'Asia', 'Earth', 'Northern Europe'}

In [27]:
nationalities

{'Australian', 'European', 'Finnish', 'Finns', 'Nordic'}

In [28]:
orgs

{'Ipsos',
 'Ipsos Group',
 'the United Nations',
 'the World Bank',
 'the World Happiness Report'}

### Exclude "ORDINAL", "CARDINAL", "PERCENT", "DATE", "NORP"

In [29]:
def clean(text: str):
    return re.sub(r'[“”"!`~%&*()]*', '', text)

In [30]:
def clean_text(text: str, lower: bool=False):
    text = text.lower() if lower else text
    return " ".join([
        clean(word)
        for word in text.split()
        if word not in stopwords
    ])

In [35]:
print("{}{}{}{}".format(
    "Entity".ljust(50), "Start".ljust(7), "End".ljust(7), "Label".ljust(10)
))
print("-" * 80)

words_in_entities = set()
entity_phrases= set()
selected_named_entities = {}

for ent in doc.ents:
    if (ent.text.lower() not in entity_phrases) and (ent.label_ not in ("ORDINAL", "CARDINAL", "PERCENT", "DATE", "NORP")):        
        _entity = clean_text(text=ent.text, lower=True)
        _ = [
            clean_text(word) 
            for word in ent.text.lower().split() 
            if word not in stopwords
        ]
        words_in_entities.update(_entity.split())
        entity_phrases.add(_entity)
        
        named_entity = clean_text(text=ent.text, lower=False)
        selected_named_entities[named_entity] = ent
        print(f"{named_entity:50}{ent.start_char:<7}{ent.end_char:<7}{ent.label_:10}")


Entity                                            Start  End    Label     
--------------------------------------------------------------------------------
Melissa Georgiou                                  11     27     PERSON    
Finland                                           56     63     GPE       
Earth                                             144    149    LOC       
Melissa                                           305    312    PERSON    
Sydney                                            388    394    GPE       
Melissa Georgio's                                 1042   1059   PERSON    
Dark Night                                        1060   1070   WORK_OF_ART
Northern Europe                                   1074   1089   LOC       
World Happiness Report                            1173   1201   WORK_OF_ART
United Nations                                    1214   1232   ORG       
Frank Martela                                     1383   1396   PERSON    
Frank            

In [36]:
set(selected_named_entities)

{'Asia',
 'Australia',
 'Brazil',
 'Canada',
 'China',
 'Dark Night',
 'Denmark',
 'Earth',
 'Finland',
 'Frank',
 'Frank Martela',
 'Hong Kong',
 'Iceland',
 'India',
 'Ipsos',
 'Ipsos Group',
 'Ireland',
 'Japan',
 'Luxembourg',
 'Melissa',
 "Melissa Georgio's",
 'Melissa Georgiou',
 'Netherlands',
 'New Zealand',
 'Northern Europe',
 'Norway',
 'Saudi Arabia',
 'Singapore',
 'Sweden',
 'Switzerland',
 'Sydney',
 'Taiwan',
 'United Nations',
 'United States',
 'World Bank',
 'World Happiness Report'}

---
# Noun Phrases

* [Noun chunks](https://spacy.io/usage/linguistic-features#noun-chunks)

> **Noun chunks** are “base noun phrases” – flat phrases that **have a noun as their head**. You can think of noun chunks as a noun plus the words describing the noun – for example, “the lavish green grass” or “the world’s largest tech fund”. 

In [33]:
for chunk in doc.noun_chunks:
    # if chunk.text.lower() != chunk.root.text.lower():
        print(f"phrase:[{chunk.text:50}] root:[{chunk.root.text:15}] type:[{chunk.root.dep_:6}]")

phrase:[Australian Melissa Georgiou                       ] root:[Georgiou       ] type:[nsubj ]
phrase:[Melissa Georgiou                                  ] root:[Georgiou       ] type:[appos ]
phrase:[Finland                                           ] root:[Finland        ] type:[pobj  ]
phrase:[happiness                                         ] root:[happiness      ] type:[dobj  ]
phrase:[the coldest and darkest places                    ] root:[places         ] type:[pobj  ]
phrase:[Earth                                             ] root:[Earth          ] type:[pobj  ]
phrase:[my favorite things                                ] root:[things         ] type:[pobj  ]
phrase:[it                                                ] root:[it             ] type:[nsubj ]
phrase:[nature                                            ] root:[nature         ] type:[pobj  ]
phrase:[you                                               ] root:[you            ] type:[nsubj ]
phrase:[a residential area    

In [34]:
phrases = doc.noun_chunks
tokens = doc

phrase = next(phrases)
for sentence in doc.sents:
    print(sentence)
    while phrase.root.sent == sentence:
        print(
            f"phrase:[{phrase.text:30}] "\
            f"root:[{phrase.root.text:12}] "\
            f"type:[{phrase.root.dep_:6}] "\
            f"parent:[{phrase.root.head}]" 
        )
        phrase = next(phrases)
    

Australian Melissa Georgiou (Melissa Georgiou) moved to Finland over a decade ago to seek happiness in one of the coldest and darkest places on Earth.
phrase:[Australian Melissa Georgiou   ] root:[Georgiou    ] type:[nsubj ] parent:[moved]
phrase:[Melissa Georgiou              ] root:[Georgiou    ] type:[appos ] parent:[Georgiou]
phrase:[Finland                       ] root:[Finland     ] type:[pobj  ] parent:[to]
phrase:[happiness                     ] root:[happiness   ] type:[dobj  ] parent:[seek]
phrase:[the coldest and darkest places] root:[places      ] type:[pobj  ] parent:[of]
phrase:[Earth                         ] root:[Earth       ] type:[pobj  ] parent:[on]
“One of my favorite things about living here is that it's easy to get close to nature whether you're in a residential area or in the middle of the city,” Melissa said.
phrase:[my favorite things            ] root:[things      ] type:[pobj  ] parent:[of]
phrase:[it                            ] root:[it          ] type:[ns

StopIteration: 

In [None]:
for chunk in doc.noun_chunks:
    print(
        f"{chunk.text:32} " \
        f"tag:{chunk.root.dep_:10} " \
        f"root:{chunk.root.head.text:10} " \
#        sentence:{chunk.root.sent} " \
        f"children:{list(chunk.root.children)}"
    )

In [None]:
print("{}{}{}{}{}{}".format(
    "Token".ljust(15), "Lemma".ljust(15), "POS".ljust(10), "Tag".ljust(8), "Dependency".ljust(14), "StopWord"
))
print("-" * 80)
for token in doc:
    if (not token.is_stop) and len(token.text) > 1:
        print(f"{token.text:15}{token.lemma_:15}{token.pos_:10}{token.tag_:8}{token.dep_:14}{token.is_stop:}")

---

# Key Noun Phrases

Find a phrase that includes an identified named entity.

In [55]:
candidates = set()

for chunk in doc.noun_chunks:
    # --------------------------------------------------------------------------------
    # Skip the phrase if it is the same with its root.
    # --------------------------------------------------------------------------------
    if chunk.text.lower() != chunk.root.text.lower():
        
        # --------------------------------------------------------------------------------
        # Select a noun phrase that may include a named entity.
        # --------------------------------------------------------------------------------
        if set(chunk.text.lower().split()).intersection(words_in_entities):
            phrase = clean_text(text=chunk.text, lower=True)
            
            # --------------------------------------------------------------------------------
            # Select a phrase that a named entity is a part of. For instance, select
            # "happiness expert researcher Frank Martela" which the named entity 
            # "Frank Martela" is a part of.
            # --------------------------------------------------------------------------------
            # if any([set(entity.split()).issubset(phrase) for entity in words_in_entities]):
            if any([(entity.lower() in phrase.lower()) for entity in entity_phrases]):
                key_phrase_candidate = clean_text(text=chunk.text, lower=False)
                if key_phrase_candidate not in candidates:
                    # print(f"phrase:[{key_phrase_candidate:50}] root:[{chunk.root.text:15}] type:[{chunk.root.dep_:6}]")
                    candidates.add(key_phrase_candidate)

# --------------------------------------------------------------------------------
# Remove duplicates that are already in selected_named_entities
# --------------------------------------------------------------------------------
key_phrases = {
    candidate for candidate in candidates
    if candidate not in selected_named_entities
}
key_phrases

{'Australian Melissa Georgiou',
 "Melissa Georgio's Dark Night",
 'happiness expert researcher Frank Martela',
 'mainland China'}

In [53]:
set(selected_named_entities)

{'Asia',
 'Australia',
 'Brazil',
 'Canada',
 'China',
 'Dark Night',
 'Denmark',
 'Earth',
 'Finland',
 'Frank',
 'Frank Martela',
 'Hong Kong',
 'Iceland',
 'India',
 'Ipsos',
 'Ipsos Group',
 'Ireland',
 'Japan',
 'Luxembourg',
 'Melissa',
 "Melissa Georgio's",
 'Melissa Georgiou',
 'Netherlands',
 'New Zealand',
 'Northern Europe',
 'Norway',
 'Saudi Arabia',
 'Singapore',
 'Sweden',
 'Switzerland',
 'Sydney',
 'Taiwan',
 'United Nations',
 'United States',
 'World Bank',
 'World Happiness Report'}

---
# Keyword Extraction

In [56]:
import textacy

In [57]:
document = textacy.make_spacy_doc(text, lang="en_core_web_sm")

In [58]:
[kps for kps, weights in textacy.extract.keyterms.sgrank(doc=document, ngrams=[1,2,3,4,5], topn=20)]

['World Happiness Report',
 'Melissa Georgiou',
 'Finland',
 'happiness',
 'Frank Martela',
 'australian',
 'Finns',
 'thing',
 'nordic country',
 'year',
 'concept',
 'cold',
 'favorite',
 'place',
 'happy',
 'melancholy',
 'middle',
 'school',
 'world',
 'finnish']

In [59]:
[kps for kps, weights in textacy.extract.keyterms.textrank(doc=document, window_size=20, topn=20)]

['high happiness index',
 'global happiness index',
 'happiness expert',
 'happy country',
 'happiness indicator',
 'Northern Europe Finland',
 'nordic country',
 'european country',
 'income country',
 'australian Melissa Georgiou',
 'universal health care system',
 'public school system',
 'World Happiness Report',
 'polling agency Ipsos Group',
 'way Finns',
 'survey report',
 'high level',
 'strong support',
 'researcher Frank Martela',
 'sixth year']

---

# Wordnet



In [76]:
import nltk
from itertools import chain
from nltk.corpus import wordnet

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/oonisim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Stem 

happy -> hapiness


In [77]:
# Just to make it a bit more readable
WN_NOUN = 'n'
WN_VERB = 'v'
WN_ADJECTIVE = 'a'
WN_ADJECTIVE_SATELLITE = 's'
WN_ADVERB = 'r'


def convert(word, from_pos, to_pos):    
    """ Transform words given from/to POS tags """
    synsets = wordnet.synsets(word, pos=from_pos)

    # Word not found
    if not synsets:
        return []

    # Get all lemmas of the word (consider 'a'and 's' equivalent)
    lemmas = []
    for s in synsets:
        for l in s.lemmas():
            if s.name().split('.')[1] == from_pos or from_pos in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE) and s.name().split('.')[1] in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE):
                lemmas += [l]

    # Get related forms
    derivationally_related_forms = [(l, l.derivationally_related_forms()) for l in lemmas]

    # filter only the desired pos (consider 'a' and 's' equivalent)
    related_noun_lemmas = []

    for drf in derivationally_related_forms:
        for l in drf[1]:
            if l.synset().name().split('.')[1] == to_pos or to_pos in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE) and l.synset().name().split('.')[1] in (WN_ADJECTIVE, WN_ADJECTIVE_SATELLITE):
                related_noun_lemmas += [l]

    # Extract the words from the lemmas
    words = [l.name() for l in related_noun_lemmas]
    len_words = len(words)

    # Build the result in the form of a list containing tuples (word, probability)
    result = [(w, float(words.count(w)) / len_words) for w in set(words)]
    result.sort(key=lambda w:-w[1])

    # return all the possibilities sorted by probability
    return result

In [78]:
convert("happy", "a", "n")

[('happiness', 0.6), ('felicitousness', 0.2), ('felicity', 0.2)]

In [115]:
convert("happy", "a", "r")

[]

In [79]:
convert("speedy", "a", "n")

[('speed', 0.375),
 ('speediness', 0.25),
 ('quickness', 0.125),
 ('rapidness', 0.125),
 ('rapidity', 0.125)]

In [80]:
convert("speedy", "a", "r")

[]

In [106]:
def get_synonyms(text: str, pos=None):
    """Get synonym of the POS type. 
    
    "speed" can be verb and noun. To get the synonyms for verb, set pos='v'
    
    Args:
        text: text to find the synonyms
        pos: part of speech tag (n=noun, v=verb, a=adjective, r=adverb)
    Returns: set of synonyms
    """
    text = "_".join(text.split())
    synonyms = wordnet.synsets(text, pos=pos)
    lemmas = set(chain.from_iterable([word.lemma_names() for word in synonyms]))
    return lemmas - {text}

In [113]:
get_synonyms(text="speed", pos="v")

{'accelerate',
 'belt_along',
 'bucket_along',
 'cannonball_along',
 'hasten',
 'hie',
 'hotfoot',
 'hurry',
 'pelt_along',
 'quicken',
 'race',
 'rush',
 'rush_along',
 'speed_up',
 'step_on_it',
 'travel_rapidly',
 'zip'}