## Приклад опрацювання англійської мови

In [1]:
# import all necessary toolkits

import en_core_web_md
from spacy import displacy
# Load the en_core_web_md pipeline once; the resulting `nlp` callable is
# reused by the later cells to parse text.
nlp = en_core_web_md.load()

In [2]:
# read the text

# Decode explicitly so the result does not depend on the platform's default
# locale encoding. NOTE(review): assumes gatsby.txt is stored as UTF-8 --
# adjust the encoding if the file was saved differently.
with open("gatsby.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [3]:
# process the text

corpus = nlp(text)
# len() of the parsed document counts tokens (punctuation marks are tokens
# too), so the message reports tokens rather than words.
"There are {} tokens in the text.".format(len(corpus))

'There are 61185 tokens in the text.'

In [4]:
# show one sentence (index 115) of the parsed text
sample_sentence = list(corpus.sents)[115]
print(sample_sentence)

I looked back at my cousin who began to ask me questions in her low, thrilling voice.


In [5]:
# explore the processed text: print one row per token of the sample sentence
# showing its text, lemma and part-of-speech tag

sentence = list(corpus.sents)[115]
row_format = "{:12}{:12}{:7}"  # three padded columns: 12, 12 and 7 characters wide
for token in sentence:
    columns = (token.text, token.lemma_, token.pos_)
    print(row_format.format(*columns))

I           -PRON-      PRON   
looked      look        VERB   
back        back        ADV    
at          at          ADP    
my          -PRON-      ADJ    
cousin      cousin      NOUN   
who         who         NOUN   
began       begin       VERB   
to          to          PART   
ask         ask         VERB   
me          -PRON-      PRON   
questions   question    NOUN   
in          in          ADP    
her         -PRON-      ADJ    
low         low         ADJ    
,           ,           PUNCT  
thrilling   thrilling   ADJ    
voice       voice       NOUN   
.           .           PUNCT  


In [6]:
# visualize the syntactic tree
# The sentence text is parsed on its own into a separately named Doc, so the
# `sentence` span from the previous cell is not overwritten and this cell can
# be re-run any number of times with the same result.
sentence_doc = nlp(sentence.text)
displacy.render(sentence_doc, style='dep',
                options={"distance": 120}, jupyter=True)

In [7]:
# extract adjective-noun collocations from the corpus

def extract_adj_noun(corpus, adj=None, noun=None):
    """Collect "adjective noun" pairs from a parsed text.

    A token is reported when it is tagged ``ADJ``, is attached to its head
    with the ``amod`` dependency label, and that head is tagged ``NOUN``.

    Args:
        corpus: iterable of tokens exposing ``text``, ``lemma_``, ``pos_``,
            ``dep_`` and ``head``.
        adj: if given, keep only adjectives whose lemma equals this value.
        noun: if given, keep only pairs whose head noun lemma equals this value.

    Returns:
        A list of ``"<adjective text> <noun text>"`` strings in corpus order,
        duplicates included. The list is empty when neither ``adj`` nor
        ``noun`` is supplied.
    """
    # No filter at all: nothing is extracted.
    if not (adj or noun):
        return []

    def is_wanted(candidate):
        # The token itself must be an adjectival modifier.
        if candidate.pos_ != "ADJ" or candidate.dep_ != "amod":
            return False
        # Optional filter on the adjective lemma.
        if adj and candidate.lemma_ != adj:
            return False
        governor = candidate.head
        if governor.pos_ != "NOUN":
            return False
        # Optional filter on the noun lemma.
        return not noun or governor.lemma_ == noun

    return [
        "{} {}".format(candidate.text, candidate.head.text)
        for candidate in corpus
        if is_wanted(candidate)
    ]

In [8]:
extract_adj_noun(corpus, adj="nice")

['nice place',
 'nice girl',
 'nice things',
 'nice cardboard',
 'nice restaurant',
 'nice bit',
 'nice ride',
 'nice one',
 'nice girl',
 'nice girl']

In [9]:
extract_adj_noun(corpus, adj="great")

['great deal',
 'great uncle',
 'great idea',
 'great bursts',
 'great barnyard',
 'great pack',
 'great specimen',
 'great flock',
 'great portfolio',
 'greater degree',
 'great table',
 'great number',
 'great doors',
 'great capitalists',
 'great relief',
 'great bridge',
 'great sportswoman',
 'great advantage',
 'great distance',
 'great figures',
 'great expression',
 'great rooms',
 'great things',
 'great rooms',
 'great man']

In [10]:
extract_adj_noun(corpus, noun="woman")

['Finnish woman',
 'young women',
 'young women',
 'young women',
 'romantic women',
 'pretty woman',
 'massive woman',
 'drunken woman']

In [11]:
extract_adj_noun(corpus, noun="girl")

['other girl',
 'slender girl',
 'breasted girl',
 'little girl',
 'good girl',
 'nice girl',
 'slender girl',
 'worldly girl',
 'confident girls',
 'other girl',
 'first girl',
 'young girls',
 'single girls',
 'rowdy girl',
 'little girl',
 'other girl',
 'certain girl',
 'young girls',
 'older girls',
 'young girl',
 'mad girl',
 'little girl',
 'radiant girl',
 'young girl',
 'low girl',
 'vulgar girl',
 'little girl',
 'golden girl',
 'poor girl',
 'first girl',
 'nice girl',
 'nice girl']

In [12]:
extract_adj_noun(corpus, noun="man")

['wild men',
 'unknown men',
 'young men',
 'single man',
 'young man',
 'rounded man',
 'sturdy man',
 'haired man',
 'blonde man',
 'spiritless man',
 'old man',
 'grey man',
 'old man',
 'pale man',
 'feminine man',
 'single man',
 'yellow men',
 'stout man',
 'aged man',
 'old men',
 'young men',
 'wayward men',
 'sober men',
 'clever men',
 'shrewd men',
 'young men',
 'dead man',
 'wrong man',
 'young men',
 'smart man',
 'same man',
 'footed men',
 'sighted men',
 'young men',
 'eyed man',
 'dishevelled man',
 'elderly man',
 'embarrassed man',
 'young man',
 'florid man',
 'dumb man',
 'first man',
 'single men',
 'many men',
 'penniless man',
 'young man',
 'garrulous man',
 'several men',
 'other men',
 'other man',
 'mad man',
 'solemn man',
 'old man',
 'helpless man',
 'mad man',
 'young man',
 'great man',
 'young men',
 'young man',
 'young man',
 'old man',
 'eyed man',
 'solemn men']