In [1]:
import spacy                                         
from spacy.lang.en import English                            
nlp = spacy.load("en_core_web_sm")              
import collections
from typing import Dict, List, Tuple  
from spacy.matcher import Matcher                                             

# TASK 1

In [2]:
# Review 1: This movie is very scary and long.
# Review 2: This movie is not vrey scary and is slow.
# Review 3: This movie is spooky and good.   

In [3]:
def text2bow(words: List[str], dictionary: Dict[str, int]) -> List[Tuple[int, int]]:
    word_frequences = collections.defaultdict(int)
    
    for word in words:
        if word not in dictionary:                                                
            dictionary[word] = len(dictionary)
        word_frequences[dictionary[word]] += 1
    return list(word_frequences.items())


sample_text1 = "Review 1: This movie is very scary and long."
print(sample_text1.split())                  
dictionary1 = {}                                                          
print('\nBOW Representation: \n', text2bow(sample_text1.split(), dictionary1)) 
print("\n")
print(dictionary1)

print("___________________________________________________________________________________________\n")

sample_text2 = "Review 2: This movie is not vrey scary and is slow." 
print(sample_text2.split()) 
dictionary2 = {}                                                        
print('\nBOW Representation: \n', text2bow(sample_text2.split(), dictionary2)) 
print("\n")
print(dictionary2)

print("___________________________________________________________________________________________\n")

sample_text3 = "Review 3: This movie is spooky and good."
print(sample_text3.split()) 
dictionary3 = {}                                                         
print('\nBOW Representation: \n', text2bow(sample_text3.split(), dictionary3)) 
print("\n")
print(dictionary3)

['Review', '1:', 'This', 'movie', 'is', 'very', 'scary', 'and', 'long.']

BOW Representation: 
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]


{'Review': 0, '1:': 1, 'This': 2, 'movie': 3, 'is': 4, 'very': 5, 'scary': 6, 'and': 7, 'long.': 8}
___________________________________________________________________________________________

['Review', '2:', 'This', 'movie', 'is', 'not', 'vrey', 'scary', 'and', 'is', 'slow.']

BOW Representation: 
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1)]


{'Review': 0, '2:': 1, 'This': 2, 'movie': 3, 'is': 4, 'not': 5, 'vrey': 6, 'scary': 7, 'and': 8, 'slow.': 9}
___________________________________________________________________________________________

['Review', '3:', 'This', 'movie', 'is', 'spooky', 'and', 'good.']

BOW Representation: 
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]


{'Review': 0, '3:': 1, 'This': 2, 'movie': 3, 'is': 4, 'spooky': 5, 'and': 6, 'good.'

# TASK 2

In [4]:
matcher = Matcher(nlp.vocab)
pattern1 = [{"LOWER": "hey"}, {"LOWER": "siri"}]
pattern2 = [{"LOWER": "hey"}, {"IS_PUNCT": True}, {"LOWER": "siri"}]
matcher.add("HelloWorld", [pattern1,pattern2])
doc = nlp("Hey, Siri! Hey siri!")                               
matches = matcher(doc)

for match_id, start, end in matches:                                  
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(span.text)

Hey, Siri
Hey siri


# TASK 3

In [5]:
doc = nlp("apple orange pikkstn German")

for token in doc:
    print('Text=',token.text,
          ', Vector=',token.has_vector,
          ', OOV=', token.is_oov)  

Text= apple , Vector= True , OOV= True
Text= orange , Vector= True , OOV= True
Text= pikkstn , Vector= True , OOV= True
Text= German , Vector= True , OOV= True


# TASK 4

In [6]:
from spacy.matcher import PhraseMatcher   

In [7]:
matcher = PhraseMatcher(nlp.vocab)

terms = ["ROTTEN mangoes", "sweet oranges"]

patterns = [nlp.make_doc(text) for text in terms]  

matcher.add("TerminologyList", patterns)

doc = nlp("Do not put rotten mangoes and sweet oranges together.")

matches = matcher(doc)

for match_id, start, end in matches:                                               
    span = doc[start:end]
    print(span.text)                                                       

sweet oranges


# TASK 5

In [8]:
doc = nlp(u"I prefer the morning flight through Denmark.")

for token in doc:                      
    print('Word Vector Representation:\n',token.vector)  
    print('Vector Length:\n',token.vector.shape) 

Word Vector Representation:
 [ 0.06806803 -1.2016169  -1.5652864  -0.40728194  1.6453729  -0.1759868
  1.6268498  -0.34248835  0.6206384   0.6909156  -0.19549385 -0.1040764
  1.2592496   0.380324   -1.2539439   0.76539445 -1.0354416  -0.12940559
  0.64465    -0.7193433   0.9052192  -0.06999022  0.24625885 -0.96396774
  2.9613056   0.54644954 -1.4664526  -0.11041537 -1.4427226   0.11520356
 -0.6636156  -0.00677587 -1.7327354   0.25221124 -0.50659573 -1.6204643
 -0.72535056 -0.00526944 -0.4279209   0.5101794   0.991848    1.6598321
 -0.35380322  0.86431944 -0.20077458  0.17857635 -0.13458827 -0.5561398
  0.53677666 -0.6357974   0.16459122  0.0145467   1.9127849  -0.20415738
  1.6507477   3.125585   -0.43996757  1.5095301   0.5535733   0.07370305
  0.6603886  -0.9397799   0.5003335  -0.96051013 -0.8717852   0.8672014
 -1.0910957  -0.13232425  0.6587483  -1.1100581  -0.21392867  0.01556435
 -0.75802076 -0.8030315  -0.70256245  0.47329092 -0.9076105  -0.4801259
  1.1674395  -0.23986194 -0.5

# TASK 6

In [9]:
doc=nlp("rotten sweet")

for i in doc:
    print("text-",i.text,"vector-",i.has_vector,",OOV",i.is_oov)

text- rotten vector- True ,OOV True
text- sweet vector- True ,OOV True


##### Similarities...

In [12]:
doc=nlp("Mangoes Oranges Sweet")

for i in doc:
    for k in doc:
        for l in doc:
            print(i.text,k.text,l.text,i.similarity(k))

Mangoes Mangoes Mangoes 1.0
Mangoes Mangoes Oranges 1.0
Mangoes Mangoes Sweet 1.0
Mangoes Oranges Mangoes 0.5993388295173645
Mangoes Oranges Oranges 0.5993388295173645
Mangoes Oranges Sweet 0.5993388295173645
Mangoes Sweet Mangoes 0.245577871799469
Mangoes Sweet Oranges 0.245577871799469
Mangoes Sweet Sweet 0.245577871799469
Oranges Mangoes Mangoes 0.5993388295173645
Oranges Mangoes Oranges 0.5993388295173645
Oranges Mangoes Sweet 0.5993388295173645
Oranges Oranges Mangoes 1.0
Oranges Oranges Oranges 1.0
Oranges Oranges Sweet 1.0
Oranges Sweet Mangoes 0.275787353515625
Oranges Sweet Oranges 0.275787353515625
Oranges Sweet Sweet 0.275787353515625
Sweet Mangoes Mangoes 0.245577871799469
Sweet Mangoes Oranges 0.245577871799469
Sweet Mangoes Sweet 0.245577871799469
Sweet Oranges Mangoes 0.275787353515625
Sweet Oranges Oranges 0.275787353515625
Sweet Oranges Sweet 0.275787353515625
Sweet Sweet Mangoes 1.0
Sweet Sweet Oranges 1.0
Sweet Sweet Sweet 1.0


  print(i.text,k.text,l.text,i.similarity(k))


In [16]:
doc1=nlp("Mangoes")
doc2=nlp("Oranges")

for i in doc1:
    for x in doc2:
        print(i.text,k.text,i.similarity(k))

Mangoes Sweet 0.3060813546180725


  print(i.text,k.text,i.similarity(k))


In [17]:
doc3=nlp("sweet oranges")

for i in doc3:
    for x in doc3:
        print(i.text,k.text,i.similarity(x))

sweet Sweet 1.0
sweet Sweet 0.03877781331539154
oranges Sweet 0.03877781331539154
oranges Sweet 1.0


  print(i.text,k.text,i.similarity(x))


1. I prefer the morning flight through Denmark.
2. The infrastructure of our school is wonderful.
3. 
4.  Ben slapped the mosquito on his arm quite suddenly