<a href="https://colab.research.google.com/github/maciejlehmann/amw-wdum/blob/main/Lab5/Lab5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import re
import spacy
from spacy.tokenizer import Tokenizer
from spacy import displacy
from spacy.matcher import Matcher
import textacy

nlp = spacy.load("en_core_web_sm")

In [11]:
# Czytanie ze stringa

introduction_text = ('This is just an example sentence for a task.')
introduction_doc = nlp(introduction_text)

print ([token.text for token in introduction_doc])

['This', 'is', 'just', 'an', 'example', 'sentence', 'for', 'a', 'task', '.']


In [13]:
# Czytanie z pliku

file_name = 'introduction.txt'
introduction_file_text = open(file_name).read()
introduction_file_doc = nlp(introduction_file_text)

print ([token.text for token in introduction_file_doc])

['My', 'name', 'is', 'Maciej', '.', 'I', 'am', 'studying', 'computer', 'science', 'at', 'the', 'Naval', 'Academy', '.']


In [14]:
# Wykrywanie zdań ze stringa

about_text = ('Maciej Lehmann is a 4th year'
               ' computer science student at the Naval'
               ' Academy in Gdynia. He is in the process'
               ' of preparing his engineering thesis.')
about_doc = nlp(about_text)
sentences = list(about_doc.sents)
len(sentences)

for sentence in sentences:
  print (sentence)

Maciej Lehmann is a 4th year computer science student at the Naval Academy in Gdynia.
He is in the process of preparing his engineering thesis.


In [18]:
# Wykrywanie zdań z zastosowaniem własnego separatora

def set_custom_boundaries(doc):
  for token in doc[:-1]:
    if token.text == '...':
     doc[token.i+1].is_sent_start = True
  return doc

ellipsis_text = ('Maciej Lehmann is ... a 4th year'
               ' computer science student at the Naval'
               ' Academy ... in Gdynia. He is in the process'
               ' of preparing ... his engineering thesis.')

print("Zdania podzielone przy użyciu nowego separatora: ")
custom_nlp = spacy.load('en_core_web_sm')
custom_nlp.add_pipe(set_custom_boundaries, before='parser')
custom_ellipsis_doc = custom_nlp(ellipsis_text)
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)
for sentence in custom_ellipsis_sentences:
  print(sentence)

print("\nZdania z użyciem domyślnego separatora:")
ellipsis_doc = nlp(ellipsis_text)
ellipsis_sentences = list(ellipsis_doc.sents)
for sentence in ellipsis_sentences:
  print(sentence)

Zdania podzielone przy użyciu nowego separatora: 
Maciej Lehmann is ...
a 4th year computer science student at the Naval Academy ...
in Gdynia.
He is in the process of preparing ...
his engineering thesis.

Zdania z użyciem domyślnego separatora:
Maciej Lehmann is ... a 4th year computer science student at the Naval Academy ... in Gdynia.
He is in the process of preparing ...
his engineering thesis.


In [19]:
# Podział na podstawowe jednostki w tekście - tokeny

for token in about_doc:
  print (token, token.idx)

Maciej 0
Lehmann 7
is 15
a 18
4th 20
year 24
computer 29
science 38
student 46
at 54
the 57
Naval 61
Academy 67
in 75
Gdynia 78
. 84
He 86
is 89
in 92
the 95
process 99
of 107
preparing 110
his 120
engineering 124
thesis 136
. 142


In [20]:
# Wskazanie szczegółowych atrybutów tokenów

for token in about_doc:
  print (token, token.idx, token.text_with_ws, token.is_alpha, token.is_punct, token.is_space, token.shape_, token.is_stop)

Maciej 0 Maciej  True False False Xxxxx False
Lehmann 7 Lehmann  True False False Xxxxx False
is 15 is  True False False xx True
a 18 a  True False False x True
4th 20 4th  False False False dxx False
year 24 year  True False False xxxx False
computer 29 computer  True False False xxxx False
science 38 science  True False False xxxx False
student 46 student  True False False xxxx False
at 54 at  True False False xx True
the 57 the  True False False xxx True
Naval 61 Naval  True False False Xxxxx False
Academy 67 Academy  True False False Xxxxx False
in 75 in  True False False xx True
Gdynia 78 Gdynia True False False Xxxxx False
. 84 .  False True False . False
He 86 He  True False False Xx True
is 89 is  True False False xx True
in 92 in  True False False xx True
the 95 the  True False False xxx True
process 99 process  True False False xxxx False
of 107 of  True False False xx True
preparing 110 preparing  True False False xxxx False
his 120 his  True False False xxx True
engineering

In [21]:
# Dostosowanie procesu tokenizacji, aby wykrywał tokeny na niestandardowych znakach

custom_nlp = spacy.load('en_core_web_sm')
prefix_re = spacy.util.compile_prefix_regex(custom_nlp.Defaults.prefixes)
suffix_re = spacy.util.compile_suffix_regex(custom_nlp.Defaults.suffixes)
infix_re = re.compile(r'''[-~]''')
def customize_tokenizer(nlp):
  return Tokenizer(nlp.vocab, prefix_search=prefix_re.search, suffix_search=suffix_re.search, infix_finditer=infix_re.finditer, token_match=None)

custom_nlp.tokenizer = customize_tokenizer(custom_nlp)
custom_tokenizer_about_doc = custom_nlp(about_text)
print([token.text for token in custom_tokenizer_about_doc])

['Maciej', 'Lehmann', 'is', 'a', '4th', 'year', 'computer', 'science', 'student', 'at', 'the', 'Naval', 'Academy', 'in', 'Gdynia', '.', 'He', 'is', 'in', 'the', 'process', 'of', 'preparing', 'his', 'engineering', 'thesis', '.']


In [22]:
# Wypisanie kilku słów stopów

spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
len(spacy_stopwords)

for stop_word in list(spacy_stopwords)[:10]:
  print(stop_word)

yet
thereupon
they
are
up
why
'm
mostly
regarding
almost


In [25]:
# Wypisanie tokenów pomijając słowa stopy

about_text = ('Maciej Lehmann is a 4th year'
               ' computer science student at the Naval'
               ' Academy in Gdynia. He is in the process'
               ' of preparing his engineering thesis.')
about_doc = nlp(about_text)


for token in about_doc:
  if not token.is_stop:
    print (token)

Maciej
Lehmann
4th
year
computer
science
student
Naval
Academy
Gdynia
.
process
preparing
engineering
thesis
.


In [26]:
# Tworzenie listy tokenów bez słów stopów

about_no_stopword_doc = [token for token in about_doc if not token.is_stop]
print (about_no_stopword_doc)

[Maciej, Lehmann, 4th, year, computer, science, student, Naval, Academy, Gdynia, ., process, preparing, engineering, thesis, .]


In [27]:
# Redukcja odmioych form wyrazów

conference_help_text = ('Maciej Lehmann is a 4th year'
               ' computer science student at the Naval'
               ' Academy in Gdynia. He is in the process'
               ' of preparing his engineering thesis.')
conference_help_doc = nlp(conference_help_text)
for token in conference_help_doc:
  print (token, token.lemma_)

Maciej Maciej
Lehmann Lehmann
is be
a a
4th 4th
year year
computer computer
science science
student student
at at
the the
Naval Naval
Academy Academy
in in
Gdynia Gdynia
. .
He -PRON-
is be
in in
the the
process process
of of
preparing prepare
his -PRON-
engineering engineering
thesis thesis
. .


In [35]:
# Sprawdzanie statystyk występowania słów w danym tekście

from collections import Counter
complete_text = ('Avengers: Endgame is the culmination of a decade of blockbuster'
     ' filmmaking, the result of years of work from thousands of people. It is'
     ' designed to be the most blockbuster of all the blockbusters, a movie'
     ' with a dozen subplots colliding, and familiar faces from over 20 other'
     ' movies. It’s really like nothing that Hollywood has produced before,'
     ' existing not just to acknowledge or exploit the fans of this series, but to'
     ' reward their love, patience, and undying adoration. The blunt thing'
     ' you probably want to know most: It’s hard to see serious MCU fans'
     ' walking away from this disappointed. It checks all the boxes, even'
     ' ticking off a few ones that fans won’t expect to be on the list. It’s a'
     ' satisfying end to a chapter of blockbuster history that will be hard to'
     ' top for pure spectacle. In terms of sheer entertainment value, it’s on'
     ' the higher end of the MCU, a film that elevates its most iconic heroes'
     ' to the legendary status they deserve and provides a few legitimate'
     ' thrills along the way.')

complete_doc = nlp(complete_text)

words = [token.text for token in complete_doc if not token.is_stop and not token.is_punct]
word_freq = Counter(words)

common_words = word_freq.most_common(5)
print (common_words)

unique_words = [word for (word, freq) in word_freq.items() if freq == 1]
print (unique_words)

words_all = [token.text for token in complete_doc if not token.is_punct]
word_freq_all = Counter(words_all)

common_words_all = word_freq_all.most_common(5)
print (common_words_all)

[('blockbuster', 3), ('fans', 3), ('hard', 2), ('MCU', 2), ('end', 2)]
['Avengers', 'Endgame', 'culmination', 'decade', 'filmmaking', 'result', 'years', 'work', 'thousands', 'people', 'designed', 'blockbusters', 'movie', 'dozen', 'subplots', 'colliding', 'familiar', 'faces', '20', 'movies', 'like', 'Hollywood', 'produced', 'existing', 'acknowledge', 'exploit', 'series', 'reward', 'love', 'patience', 'undying', 'adoration', 'blunt', 'thing', 'probably', 'want', 'know', 'walking', 'away', 'disappointed', 'checks', 'boxes', 'ticking', 'ones', 'wo', 'expect', 'list', 'satisfying', 'chapter', 'history', 'pure', 'spectacle', 'terms', 'sheer', 'entertainment', 'value', 'higher', 'film', 'elevates', 'iconic', 'heroes', 'legendary', 'status', 'deserve', 'provides', 'legitimate', 'thrills', 'way']
[('the', 11), ('of', 10), ('to', 9), ('a', 8), ('It', 5)]


In [28]:
# Określenie jaką część mowy stanowią dane słowa

for token in about_doc:
  print (token, token.tag_, token.pos_, spacy.explain(token.tag_))

nouns = []
adjectives = []
for token in about_doc:
  if token.pos_ == 'NOUN':
    nouns.append(token)
  if token.pos_ == 'ADJ':
    adjectives.append(token)

print("\n\nNouns: ", nouns)

print("Adjectives: ", adjectives)

Maciej NNP PROPN noun, proper singular
Lehmann NNP PROPN noun, proper singular
is VBZ AUX verb, 3rd person singular present
a DT DET determiner
4th JJ ADJ adjective
year NN NOUN noun, singular or mass
computer NN NOUN noun, singular or mass
science NN NOUN noun, singular or mass
student NN NOUN noun, singular or mass
at IN ADP conjunction, subordinating or preposition
the DT DET determiner
Naval NNP PROPN noun, proper singular
Academy NNP PROPN noun, proper singular
in IN ADP conjunction, subordinating or preposition
Gdynia NNP PROPN noun, proper singular
. . PUNCT punctuation mark, sentence closer
He PRP PRON pronoun, personal
is VBZ AUX verb, 3rd person singular present
in IN ADP conjunction, subordinating or preposition
the DT DET determiner
process NN NOUN noun, singular or mass
of IN ADP conjunction, subordinating or preposition
preparing VBG VERB verb, gerund or present participle
his PRP$ DET pronoun, possessive
engineering NN NOUN noun, singular or mass
thesis NN NOUN noun, sin

In [31]:
about_interest_text = ('He is interested in learning' ' Vue.js frontend framework.')
about_interest_doc = nlp(about_interest_text)
displacy.render(about_interest_doc, style='dep', jupyter=True)

In [36]:
# Tworzenie funkcji, która konwertuje tekst do formy, którą można analizować

def is_token_allowed(token):
  if (not token or not token.string.strip() or
    token.is_stop or token.is_punct):
    return False
  return True

def preprocess_token(token):
  return token.lemma_.strip().lower()

complete_filtered_tokens = [preprocess_token(token)
for token in complete_doc if is_token_allowed(token)]
complete_filtered_tokens

['avenger',
 'endgame',
 'culmination',
 'decade',
 'blockbuster',
 'filmmaking',
 'result',
 'year',
 'work',
 'thousand',
 'people',
 'design',
 'blockbuster',
 'blockbuster',
 'movie',
 'dozen',
 'subplot',
 'colliding',
 'familiar',
 'face',
 '20',
 'movie',
 'like',
 'hollywood',
 'produce',
 'exist',
 'acknowledge',
 'exploit',
 'fan',
 'series',
 'reward',
 'love',
 'patience',
 'undying',
 'adoration',
 'blunt',
 'thing',
 'probably',
 'want',
 'know',
 'hard',
 'mcu',
 'fan',
 'walk',
 'away',
 'disappointed',
 'check',
 'box',
 'tick',
 'one',
 'fan',
 'will',
 'expect',
 'list',
 'satisfying',
 'end',
 'chapter',
 'blockbuster',
 'history',
 'hard',
 'pure',
 'spectacle',
 'term',
 'sheer',
 'entertainment',
 'value',
 'high',
 'end',
 'mcu',
 'film',
 'elevate',
 'iconic',
 'hero',
 'legendary',
 'status',
 'deserve',
 'provide',
 'legitimate',
 'thrill',
 'way']

In [33]:
# Wyciąganie imienia i nazwiska za pomocą dopasowania opartego o wskazane reguły

matcher = Matcher(nlp.vocab)
def extract_full_name(nlp_doc):
  pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
  matcher.add('FULL_NAME', None, pattern)
  matches = matcher(nlp_doc)
  for match_id, start, end in matches:
    span = nlp_doc[start:end]
    return span.text

extract_full_name(about_doc)

'Maciej Lehmann'

In [37]:
# Użycie dopasowania opartego na regułach do wyciągnięcia numeru telefonu

matcher = Matcher(nlp.vocab)
conference_org_text = ('There is a developer conference'
     'happening on 21 July 2019 in London. It is titled'
     ' "Applications of Natural Language Processing".'
     ' There is a helpline number available'
     ' at (123) 456-789')

def extract_phone_number(nlp_doc):
  pattern = [{'ORTH': '('}, {'SHAPE': 'ddd'},
                {'ORTH': ')'}, {'SHAPE': 'ddd'},
                {'ORTH': '-', 'OP': '?'},
                {'SHAPE': 'ddd'}]
  matcher.add('PHONE_NUMBER', None, pattern)
  matches = matcher(nlp_doc)
  for match_id, start, end in matches:
    span = nlp_doc[start:end]
    return span.text

conference_org_doc = nlp(conference_org_text)
extract_phone_number(conference_org_doc)

'(123) 456-789'

In [38]:
piano_text = 'Maciej is watching movies'
piano_doc = nlp(piano_text)
for token in piano_doc:
  print (token.text, token.tag_, token.head.text, token.dep_)

displacy.render(piano_doc, style='dep', jupyter=True)

Maciej NNP watching nsubj
is VBZ watching aux
watching VBG watching ROOT
movies NNS watching dobj


In [None]:
# Drzewo posiada informacje o strukturze zdań i gramatyce i można je przeglądać na różne sposoby, aby wyodrębnić relacje

one_line_about_text = ('Gus Proto is a Python developer'
' currently working for a London-based Fintech company')
one_line_about_doc = nlp(one_line_about_text)
# Extract children of `developer`
print([token.text for token in one_line_about_doc[5].children])

# Extract previous neighboring node of `developer`
print (one_line_about_doc[5].nbor(-1))

# Extract next neighboring node of `developer`
print (one_line_about_doc[5].nbor())

# Extract all tokens on the left of `developer`
print([token.text for token in one_line_about_doc[5].lefts])

# Extract tokens on the right of `developer`
print([token.text for token in one_line_about_doc[5].rights])

# Print subtree of `developer`
print (list(one_line_about_doc[5].subtree))

['a', 'Python', 'working']
Python
currently
['a', 'Python']
['working']
[a, Python, developer, currently, working, for, a, London, -, based, Fintech, company]


In [None]:
# Tworzenie funkcji, która tworzy stringa na podstawie drzewa

def flatten_tree(tree):
  return ''.join([token.text_with_ws for token in list(tree)]).strip()

# Print flattened subtree of `developer`
print (flatten_tree(one_line_about_doc[5].subtree))

a Python developer currently working for a London-based Fintech company


In [None]:
# Wykrywanie fraz rzeczownikowych w tekście

conference_text = ('There is a developer conference' ' happening on 21 July 2019 in London.')
conference_doc = nlp(conference_text)
# Extract Noun Phrases
for chunk in conference_doc.noun_chunks:
  print (chunk)

a developer conference
21 July
London


In [None]:
# Wykrywanie fraz czaskownikowych w tekście

about_talk_text = ('The talk will introduce reader about Use'
                    ' cases of Natural Language Processing in'
                    ' Fintech')
pattern = r'(<VERB>?<ADV>*<VERB>+)'
about_talk_doc = textacy.make_spacy_doc(about_talk_text, lang='en_core_web_sm')
verb_phrases = textacy.extract.pos_regex_matches(about_talk_doc, pattern)
# Print all Verb Phrase
for chunk in verb_phrases:
  print(chunk.text)

# Extract Noun Phrase to explain what nouns are involved
for chunk in about_talk_doc.noun_chunks:
  print (chunk)

will introduce
The talk
reader
Use cases
Natural Language Processing
Fintech


  action="once",


In [None]:
# Znajdowanie nazwanych obiektów w tekście

piano_class_text = ('Great Piano Academy is situated'
     ' in Mayfair or the City of London and has'
     ' world-class piano instructors.')
piano_class_doc = nlp(piano_class_text)
for ent in piano_class_doc.ents:
  print(ent.text, ent.start_char, ent.end_char, ent.label_, spacy.explain(ent.label_))

displacy.render(piano_class_doc, style='ent', jupyter=True)

Great Piano Academy 0 19 ORG Companies, agencies, institutions, etc.
Mayfair 35 42 GPE Countries, cities, states
the City of London 46 64 GPE Countries, cities, states


In [None]:
# Ukrywanie imion w tekście w oparciu o nazwane obiekty

survey_text = ('Out of 5 people surveyed, James Robert,'
                ' Julie Fuller and Benjamin Brooks like'
                ' apples. Kelly Cox and Matthew Evans'
                ' like oranges.')

def replace_person_names(token):
  if token.ent_iob != 0 and token.ent_type_ == 'PERSON':
    return '[CONFIDENTIAL] '
  return token.string

def redact_names(nlp_doc):
  for ent in nlp_doc.ents:
    ent.merge()
  tokens = map(replace_person_names, nlp_doc)
  return ''.join(tokens)

survey_doc = nlp(survey_text)
redact_names(survey_doc)

'Out of 5 people surveyed, [CONFIDENTIAL] , [CONFIDENTIAL] and [CONFIDENTIAL] like apples. [CONFIDENTIAL] and [CONFIDENTIAL] like oranges.'