# Demo1: Discrete Text Representation

## Bag-of-Words (BOW)

### Importing Required Libraries

In [2]:
import spacy                                                  # Import spaCy library
from spacy.lang.en import English                             # Import the English language class
nlp = spacy.load("en_core_web_sm")                            # Load the small English pipeline
import collections
from typing import Dict, List, Tuple                          # Import generic types used in type hints

### BOW Representation

In [15]:
def text2bow(words: List[str], dictionary: Dict[str, int]) -> List[Tuple[int, int]]:
    """Convert a list of tokens into a bag-of-words list of (word_id, count) pairs.

    Words that are not yet in ``dictionary`` are added to it in place; each new
    word receives the dictionary's current size as its integer ID.

    Args:
        words: Tokens of the text, in order.
        dictionary: Mapping from word to integer ID. Mutated when new words are seen.

    Returns:
        A list of (word_id, count) tuples, ordered by the first occurrence of
        each ID in ``words``.
    """
    id_counts = collections.Counter()
    for token in words:
        if token not in dictionary:
            dictionary[token] = len(dictionary)   # new word: ID = current dictionary size
        id_counts[dictionary[token]] += 1
    return list(id_counts.items())



sample_text = 'today is Monday and Rainy day today on Monday'                           # Input text
# Show the whitespace-split tokens. This has to come after sample_text is
# assigned; printing first raises NameError on a fresh kernel.
print(sample_text.split())
dictionary = {}                                                              # Start with an empty word -> ID mapping
# text2bow fills `dictionary` in place and returns (word_id, count) pairs
print('\nBOW Representation: \n', text2bow(sample_text.split(), dictionary))
print("\n")
print(dictionary)                                                            # Word -> ID mapping built by the call above

['today', 'is', 'Monday', 'and', 'Rainy', 'day', 'today', 'on', 'Monday']

BOW Representation: 
 [(0, 2), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1)]


{'today': 0, 'is': 1, 'Monday': 2, 'and': 3, 'Rainy': 4, 'day': 5, 'on': 6}


### Interpretation:
### (0, 2): The word with ID 0 in the dictionary occurs 2 times in the input sentence
### Example: 'today' has ID 0 and frequency 2

### Dictionary Values

In [16]:
# Show the input sentence once more
print('Input Text:\n', sample_text)
# Show the word -> ID mapping that was built for it
print('\nDictionary: \n', dictionary)

Input Text:
 today is Monday and Rainy day today on Monday

Dictionary: 
 {'today': 0, 'is': 1, 'Monday': 2, 'and': 3, 'Rainy': 4, 'day': 5, 'on': 6}


### IDs follow the order of first appearance: 'today' has ID 0, 'is' has ID 1, and so on

## Example 2

In [17]:
def text2bow(words: List[str], dictionary: Dict[str, int]) -> List[Tuple[int, int]]:
    """Build a bag-of-words representation of ``words``.

    Unknown words are registered in ``dictionary`` (in place) with the
    dictionary's current size as their ID, and occurrences are counted per ID.

    Args:
        words: Tokens of the text, in order.
        dictionary: Word -> integer ID mapping; extended when new words appear.

    Returns:
        (word_id, count) tuples in order of each ID's first occurrence.
    """
    counts_by_id = {}
    for token in words:
        # setdefault returns the existing ID, or stores and returns the current size
        token_id = dictionary.setdefault(token, len(dictionary))
        counts_by_id[token_id] = counts_by_id.get(token_id, 0) + 1
    return list(counts_by_id.items())



# Input text
sample_text = 'process large amount text optimize step post focus count number occurrence word text'
# Start from an empty word -> ID mapping; text2bow extends it in place
dictionary = {}
# (word_id, count) pairs for the input text
print('\nBOW Representation: \n', text2bow(sample_text.split(), dictionary))
# The raw input, for comparison with the pairs above
print('Input Text:\n', sample_text)
# The word -> ID mapping produced while building the BOW
print('\nDictionary: \n', dictionary)


BOW Representation: 
 [(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]
Input Text:
 process large amount text optimize step post focus count number occurrence word text

Dictionary: 
 {'process': 0, 'large': 1, 'amount': 2, 'text': 3, 'optimize': 4, 'step': 5, 'post': 6, 'focus': 7, 'count': 8, 'number': 9, 'occurrence': 10, 'word': 11}


### What are your observations for this example?

# Demo2: Word Vectors and Semantic Similarity

# 1. Word Vector Representation
## a) Representation for Single Word

In [18]:
# Process a single word and inspect the vector exposed on the resulting Doc.
# NOTE(review): if `nlp` is still the en_core_web_sm pipeline loaded at the top,
# spaCy's small packages ship without static word vectors, so this is presumably
# derived from the pipeline's context tensor -- confirm against the spaCy docs.
doc = nlp('Mango')
# Shape of the vector array
print('Vector Length:\n', doc.vector.shape)
# The vector values themselves
print('Word Vector Representation:\n', doc.vector)

Vector Length:
 (96,)
Word Vector Representation:
 [ 3.8194752e-01  2.0621240e-01 -1.9165128e-01 -7.4691767e-01
  1.1863470e-02 -3.6899722e-01 -1.9101667e-01  4.1357988e-01
  7.3490846e-01 -6.8910700e-01 -1.0190321e+00  4.8589745e-01
  1.0669317e+00 -6.6930652e-03  9.1499284e-02 -2.2813612e-01
 -8.2159829e-01 -3.1203431e-01 -7.1994603e-01 -1.8531923e-01
 -8.0108643e-05 -2.9838240e-01 -1.0633576e+00  4.3431371e-01
  5.4322565e-01  3.6899900e-01 -1.3740352e-01  5.4435045e-01
 -2.6267877e-01 -1.0797143e+00  1.4821892e+00 -6.4936471e-01
 -7.8164339e-01 -9.6260095e-01 -6.4607427e-02 -7.5819835e-02
 -2.9920429e-01 -7.5433105e-02 -2.9082629e-01 -4.9716616e-01
 -1.3747739e+00 -9.3444777e-01 -2.0836490e-01 -8.0806613e-02
 -4.3597981e-01  2.6771492e-01 -5.2945018e-03 -1.0419630e+00
  1.3329253e+00  6.0584283e-01 -7.5366354e-01 -2.4943669e-01
 -9.5081687e-02  1.9147819e-01  4.3883514e-01  1.0505683e+00
  1.4141293e+00  8.0665970e-01 -3.6420840e-01  1.6820957e-01
 -1.2254603e+00 -3.0482271e-01  1.

## b) Representation for Sentence

In [19]:
# Input sentence
doc = nlp('The infrastructure of our school is wonderful.')
# Print the shape and values of the vector of every token in the sentence
for token in doc:
    token_vector = token.vector
    print('Vector Length:\n', token_vector.shape)
    print('Word Vector Representation:\n', token_vector)

Vector Length:
 (96,)
Word Vector Representation:
 [-0.08385818 -1.0672662  -0.965654    1.206026    0.28913158 -0.17783883
  0.45566997  0.6252638  -0.16610631 -0.6601055   0.12031695 -0.04118265
  0.13128175  0.40687072 -0.85072976  0.8772753  -1.1745211  -0.6571657
  1.1783938   0.60096866 -0.8293002   0.73897886 -1.4559261  -1.2420754
 -0.10491681 -0.02193156  0.9765005  -0.84779406 -0.83398616 -0.12860048
 -0.9319618   0.4529175  -1.2058035  -0.09523329 -0.4876092   1.2158086
 -1.5960569  -0.44387788  1.1071453  -1.342288    1.4305793  -0.24217686
 -0.4314505  -0.09381703 -0.98731637  0.35084686  2.0341878   1.1884524
  1.0561584  -1.1218573  -1.3778015  -0.8246187  -0.40175927  0.5922643
  1.4155642   2.054421    1.0763749  -0.3054471   0.23147038 -0.16627765
  1.8739094  -0.70183986 -1.0464184   1.3253887   0.6395492   0.740419
  0.10558176 -1.047605   -0.363693   -0.42682174  1.9331903   0.00430372
 -0.36634806  1.2523555  -0.63966334  0.4602051   0.79135525 -0.7855885
  0.9027

# 2. Semantic Similarity

### Download the following Models 
#### python -m spacy download en_core_web_lg
#### python -m spacy download en_core_web_md

In [26]:
# Shell escape: download the en_core_web_lg package, which a later cell loads
# with spacy.load("en_core_web_lg").
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.3.0/en_core_web_lg-3.3.0-py3-none-any.whl (400.7 MB)
     -------------------------------------- 400.7/400.7 MB 1.4 MB/s eta 0:00:00
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.3.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [24]:
# Shell escape: download the en_core_web_md package.
# NOTE(review): no visible cell loads en_core_web_md -- confirm it is still needed.
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
     ---------------------------------------- 33.5/33.5 MB 6.2 MB/s eta 0:00:00
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.3.0
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [28]:
# Switch to the en_core_web_lg pipeline for the vocabulary and similarity checks
nlp = spacy.load("en_core_web_lg")
# Input text: four ordinary words plus one made-up string
doc = nlp("dog cat car afskfsd apple")

# For every token report its text, whether it has a vector,
# and whether it is out-of-vocabulary
for token in doc:
    print('Text=', token.text, ', Vector=', token.has_vector, ', OOV=', token.is_oov)

Text= dog , Vector= True , OOV= False
Text= cat , Vector= True , OOV= False
Text= car , Vector= True , OOV= False
Text= afskfsd , Vector= False , OOV= True
Text= apple , Vector= True , OOV= False


#### Text: The original token text.
#### Vector: Does the token have a vector representation?
#### OOV: Out-of-vocabulary

## Check the similarity of words

In [30]:
# Compare every token with every token of the document (including itself)
for token1 in doc:
    for token2 in doc:
        score = token1.similarity(token2)
        print(token1.text, token2.text, score)

dog dog 1.0
dog cat 0.8016854524612427
dog car 0.35629159212112427
dog afskfsd 0.0
dog apple 0.2633902430534363
cat dog 0.8016854524612427
cat cat 1.0
cat car 0.3190753161907196
cat afskfsd 0.0
cat apple 0.2821384370326996
car dog 0.35629159212112427
car cat 0.3190753161907196
car car 1.0
car afskfsd 0.0
car apple 0.2174709141254425
afskfsd dog 0.0
afskfsd cat 0.0
afskfsd car 0.0
afskfsd afskfsd 1.0
afskfsd apple 0.0
apple dog 0.2633902430534363
apple cat 0.2821384370326996
apple car 0.2174709141254425
apple afskfsd 0.0
apple apple 1.0


  print(token1.text, token2.text, token1.similarity(token2))      # check similarity of token 1 with token 2


# Demo3: Rule-based Matching

# 1. Token-based Matching
### Pattern 1

In [37]:
from spacy.matcher import Matcher                                             # Import matcher
nlp = spacy.load("en_core_web_sm")                                            # Load model
matcher = Matcher(nlp.vocab)
# Three-token pattern: a token whose lowercase form is "hello", then one
# punctuation token, then a token whose lowercase form is "world"
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])                                          # Register the pattern under match ID "HelloWorld" (no callback)
doc = nlp("Hello, world! Hello world!")                                       # Input text
# NOTE(review): this pattern requires a punctuation token between the two words,
# so on the text above it should match 'Hello, world'. The saved cell output
# shows 'Hello world', which looks like it came from a run with the two-token
# pattern -- re-run to confirm.
matches = matcher(doc)
for _match_id, start, end in matches:                                         # Each match is (match_id, start, end)
    print(doc[start:end].text)                                                # Text of the matched span

Hello world


### Pattern 2

In [32]:
from spacy.matcher import Matcher                                             # Import matcher
nlp = spacy.load("en_core_web_sm")                                            # Load model
matcher = Matcher(nlp.vocab)
# Two-token pattern: a token whose lowercase form is "hello" directly followed
# by a token whose lowercase form is "world" (no punctuation in between)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])                                          # Register the pattern under match ID "HelloWorld" (no callback)
doc = nlp("Hello, world! Hello world!")                                       # Input text
matches = matcher(doc)

for _match_id, start, end in matches:                                         # Each match is (match_id, start, end)
    print(doc[start:end].text)                                                # Text of the matched span

Hello world


# 2. Efficient Phrase Matching

In [33]:
from spacy.matcher import PhraseMatcher
# Load model
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
# Phrases to look for in the text
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Build the pattern Docs with nlp.make_doc only, to speed things up
patterns = list(map(nlp.make_doc, terms))
matcher.add("TerminologyList", patterns)
# Input text
doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
# Print the text of every matched span
for match_id, start, end in matches:
    print(doc[start:end].text)

Angela Merkel
Barack Obama
Washington, D.C.


# Case-Insensitive Match Patterns

In [34]:
from spacy.matcher import PhraseMatcher
# attr="LOWER" makes the matcher compare lowercase token text, so case is ignored
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# Pattern Docs for the names to find
patterns = list(map(nlp.make_doc, ["Angela Merkel", "Barack Obama"]))
matcher.add("Names", patterns)
# Input text written with different casing than the patterns
doc = nlp("angela merkel and us president barack Obama")
for match_id, start, end in matcher(doc):
    print("Matched based on lowercase token text:", doc[start:end])

Matched based on lowercase token text: angela merkel
Matched based on lowercase token text: barack Obama


# 3. Rule-based Entity Recognition

In [35]:
# Rule-based entity recognition: EntityRuler patterns that carry a label and an ID
from spacy.lang.en import English

# New English pipeline object; the entity ruler is the only component added here
nlp = English()
ruler = nlp.add_pipe("entity_ruler")
# Each rule has an entity label, a pattern (exact string or list of token
# attributes) and an ID. "San Francisco" and "San Fran" share one ID.
patterns = [
    {"label": "ORG", "pattern": "Apple", "id": "apple"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"},
]
ruler.add_patterns(patterns)

# Full city name: print (text, label, ID) for every entity found
doc1 = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])

# Abbreviated city name: the recorded output shows the same ID as for doc1
doc2 = nlp("Apple is opening its first big office in San Fran.")
print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])

# None of the rules above mention the names in the next two sentences;
# the recorded output for both is an empty list
doc3 = nlp("Dr. Alex Smith chaired first board meeting of Acme Corp Inc.")
print([(ent.text, ent.label_) for ent in doc3.ents])

doc = nlp("Alex Smith worked at Acme Corp Inc.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Apple', 'ORG', 'apple'), ('San Francisco', 'GPE', 'san-francisco')]
[('Apple', 'ORG', 'apple'), ('San Fran', 'GPE', 'san-francisco')]
[]
[]
