# **Chapter 2**

# Large-scale data analysis with spaCy

## Data Structures : Vocab, Lexeme and String Store

In [1]:
# Instantiate the English language class and run one sentence through it;
# the following cells look the word 'Coffee' up in nlp.vocab.
from spacy.lang.en import English

nlp = English()

text = 'I Love Coffee'
doc = nlp(text)

In [2]:
# Vocab
# Indexing the vocab's string store with a string yields that string's hash.
coffee_hash = nlp.vocab.strings['Coffee']

print('Hash Value:', coffee_hash)

Hash Value: 3474706295102377020


In [3]:
# String store
# Reverse direction: indexing the string store with a hash yields the string.
hash_value = 3474706295102377020  # the hash printed for 'Coffee' in the Vocab cell
coffee_string = nlp.vocab.strings[hash_value]

print('String Value:', coffee_string)

String Value: Coffee


In [4]:
# Lexeme
# Indexing the vocab itself (not vocab.strings) with a string gives a lexeme.
lexeme = nlp.vocab['Coffee']

# Show the lexeme's text, its orth value and whether it is alphabetic.
lexeme_fields = (lexeme.text, lexeme.orth, lexeme.is_alpha)
print(*lexeme_fields)

Coffee 3474706295102377020 True


## Data Structures: Doc, Span, Token

In [5]:
# Creating a Doc manually from a list of words and a parallel list of
# booleans (one flag per word, passed as `spaces`).
from spacy.lang.en import English
from spacy.tokens import Doc

nlp = English()

# (word, flag) pairs, split into the two lists the Doc constructor takes.
word_space_pairs = [('Hello', True), ('world', False), ('!', False)]
words = [word for word, _ in word_space_pairs]
spaces = [flag for _, flag in word_space_pairs]

doc = Doc(nlp.vocab, words=words, spaces=spaces)

# Last expression of the cell: displayed as the cell output.
type(doc)

spacy.tokens.doc.Doc

In [6]:
# Creating a Doc by calling the pipeline on raw text
# (as opposed to constructing it by hand with the Doc class).
from spacy.lang.en import English

nlp = English()

text = 'Hello World!'
doc = nlp(text)

# Last expression of the cell: displayed as the cell output.
type(doc)

spacy.tokens.doc.Doc

In [7]:
# Creating a Span manually with the Span class, on top of a hand-built Doc.
from spacy.lang.en import English
from spacy.tokens import Doc, Span

nlp = English()

words = ['Hello', 'world', '!']
spaces = [True, False, False]
doc = Doc(nlp.vocab, words=words, spaces=spaces)

# Start and end token indices handed to the Span constructor.
span_start, span_end = 0, 2
span = Span(doc, span_start, span_end)

print(span, type(span), sep='\n')


Hello world
<class 'spacy.tokens.span.Span'>


In [8]:
# Getting a Span by slicing a Doc produced by the pipeline
# (no explicit Span constructor call needed).
from spacy.lang.en import English

nlp = English()

text = 'Hello World!, How are you'
doc = nlp(text)

# Slice of the Doc from token index 0 up to (not including) index 4.
span = doc[0:4]

print(span, type(span), sep='\n')

Hello World!,
<class 'spacy.tokens.span.Span'>


## Word vectors and Semantic Similarity

1) We can use Doc.similarity(), Span.similarity() and Token.similarity()
2) The similarity score is a number from 0 to 1
3) Use 'en_core_web_md' for the medium model (to download it, open cmd and run: python -m spacy download en_core_web_md)
4) Use 'en_core_web_lg' for the large model (to download it, open cmd and run: python -m spacy download en_core_web_lg)
5) Don't use 'en_core_web_sm' for finding similarity (to download it, open cmd and run: python -m spacy download en_core_web_sm)

In [9]:
# Load the 'en_core_web_md' model package; the cells below reuse this `nlp`.
import en_core_web_md

nlp = en_core_web_md.load()

# Compare two whole documents with Doc.similarity().
doc1, doc2 = nlp('I like fast food'), nlp('I like pizza')
doc_similarity = doc1.similarity(doc2)

print(doc_similarity)

0.8627204117787385


In [10]:
# Compare two individual tokens of the same doc with Token.similarity().
doc = nlp('I like pizza and pasta')

# Pick the tokens at indices 2 and 4 of the doc.
token1, token2 = doc[2], doc[4]
token_similarity = token1.similarity(token2)

print(token_similarity)

0.7369546


In [11]:
# similarity() also accepts a different kind of object:
# here a whole Doc is compared against a single Token.
doc = nlp('I like pizza')
soap_doc = nlp('Soap')
token = soap_doc[0]

doc_vs_token = doc.similarity(token)
print(doc_vs_token)

0.32531983166759537


In [12]:
# Compare a Span (a slice of one doc) against a separate, whole Doc.
source_doc = nlp('I like pizza and pasta')
span = source_doc[2:5]  # token indices 2, 3 and 4
doc = nlp('McDonalds sells burgers')

span_vs_doc = span.similarity(doc)
print(span_vs_doc)

0.6199092090831612


Similarity is used in recommendation systems, flagging duplicates, etc.

## How similarity is determined

* Answer: using word vectors
* Default: cosine similarity (but this can be adjusted)

In [13]:
# Word vector
# Print the vector attached to the first token (index 0) of the doc.
doc = nlp('I have a banana')
first_token = doc[0]

print(first_token.vector)

# Alternative kept from the original cell: print(doc.vector) for the whole doc.

[ 1.8733e-01  4.0595e-01 -5.1174e-01 -5.5482e-01  3.9716e-02  1.2887e-01
  4.5137e-01 -5.9149e-01  1.5591e-01  1.5137e+00 -8.7020e-01  5.0672e-02
  1.5211e-01 -1.9183e-01  1.1181e-01  1.2131e-01 -2.7212e-01  1.6203e+00
 -2.4884e-01  1.4060e-01  3.3099e-01 -1.8061e-02  1.5244e-01 -2.6943e-01
 -2.7833e-01 -5.2123e-02 -4.8149e-01 -5.1839e-01  8.6262e-02  3.0818e-02
 -2.1253e-01 -1.1378e-01 -2.2384e-01  1.8262e-01 -3.4541e-01  8.2611e-02
  1.0024e-01 -7.9550e-02 -8.1721e-01  6.5621e-03  8.0134e-02 -3.9976e-01
 -6.3131e-02  3.2260e-01 -3.1625e-02  4.3056e-01 -2.7270e-01 -7.6020e-02
  1.0293e-01 -8.8653e-02 -2.9087e-01 -4.7214e-02  4.6036e-02 -1.7788e-02
  6.4990e-02  8.8451e-02 -3.1574e-01 -5.8522e-01  2.2295e-01 -5.2785e-02
 -5.5981e-01 -3.9580e-01 -7.9849e-02 -1.0933e-02 -4.1722e-02 -5.5576e-01
  8.8707e-02  1.3710e-01 -2.9873e-03 -2.6256e-02  7.7330e-02  3.9199e-01
  3.4507e-01 -8.0130e-02  3.3451e-01  2.7063e-01 -2.4544e-02  7.2576e-02
 -1.8120e-01  2.3693e-01  3.9977e-01  4.5012e-01  2

## Combining Models and Rules

In [14]:
# Rule-based matching combined with the model's predictions: find the
# two-token phrase with the Matcher, then inspect the matched span.
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)

# Token pattern: two consecutive tokens whose lowercase forms are
# 'golden' and 'retriewer' (spelled the same way as in the example text).
pattern = [{'LOWER': 'golden'}, {'LOWER': 'retriewer'}]
matcher.add('Dog', [pattern])

doc = nlp('I have a Golden Retriewer')
matches = matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    print('Matched span:', span.text)

    # Root token of the matched span, and the token that root attaches to.
    print('Root token:', span.root.text)
    print('Root head token:', span.root.head.text)

    # A match starting at index 0 has no previous token; without this guard
    # doc[start - 1] would silently wrap around to the LAST token of the doc.
    if start > 0:
        previous_token = doc[start - 1]
        # Use pos_ (readable label) rather than pos (integer ID, e.g. 90).
        print('Previous token:', previous_token.text, previous_token.pos_)

Matched span: Golden Retriewer
Root token: Retriewer
Root head token: have
Previous token: a 90


### A more efficient way is phrase matching

In [15]:
# Phrase matching: the pattern is a Doc built from the phrase itself,
# rather than a list of per-token attribute dictionaries.
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

pattern = nlp('Golden Retriewer')
matcher.add('Dog', [pattern])

doc = nlp('I have a Golden Retriewer')
matches = matcher(doc)

# Each match is a (match_id, start, end) triple; slice the doc to get the span.
for match_id, start, end in matches:
    span = doc[start:end]
    print('Matched span:', span.text)

Matched span: Golden Retriewer
