<a href="https://colab.research.google.com/github/mariatomy9/Python/blob/master/Spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# One-time setup: uncomment to install/upgrade spaCy in a fresh environment (e.g. Colab).
#!pip install -U spacy

In [None]:
# One-time setup: uncomment to install the spacy-lookups-data package.
#!pip install -U spacy-lookups-data

In [None]:
# One-time setup: uncomment to download the en_core_web_sm model that this notebook loads with spacy.load.
#!python -m spacy download en_core_web_sm

# TOKENISATION

In [1]:
import spacy

In [2]:
# Load the small English pipeline used to process the example texts.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Sample sentence used by the tokenisation, POS-tagging and NER examples.
text = "Apple is looking for buying a UK startup for $1 billion dollar"

In [4]:
# Run the loaded pipeline on the sample sentence; `doc` is what the
# token / POS / entity cells below iterate over.
doc = nlp(text)

In [5]:
# Tokenisation: show the text of every token, one per line.
for token in doc:
    print(token.text)

Apple
is
looking
for
buying
a
UK
startup
for
$
1
billion
dollar


# Parts Of Speech [POS] (Tagging)

In [6]:
# Show each token next to its coarse part-of-speech tag.
for token in doc:
    print(token.text, token.pos_)

Apple PROPN
is AUX
looking VERB
for ADP
buying VERB
a DET
UK PROPN
startup NOUN
for ADP
$ SYM
1 NUM
billion NUM
dollar NOUN


In [7]:
# Same listing as above, with the token text left-aligned and padded to
# 15 characters so the tags line up in a column.
for token in doc:
    print(f'{token.text:<15},{token.pos_}')

Apple          ,PROPN
is             ,AUX
looking        ,VERB
for            ,ADP
buying         ,VERB
a              ,DET
UK             ,PROPN
startup        ,NOUN
for            ,ADP
$              ,SYM
1              ,NUM
billion        ,NUM
dollar         ,NOUN


# Visualisation

In [8]:
from spacy import displacy

In [9]:
# Visualise the dependency parse of `doc` with displaCy.
displacy.render(
    doc,
    style="dep",
    jupyter=True,
    options={"distance": 100, "compact": True},
)

# Named Entity Recognition (NER)

In [10]:
# doc.ents yields entity spans (possibly several tokens long), not single
# tokens, so the loop variable is named `ent`, matching the other entity
# loops in this notebook.
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
UK GPE
$1 billion dollar MONEY


In [11]:
# Visualise the named entities of `doc` with displaCy.
displacy.render(doc, style="ent", jupyter=True)

# Sentence Segmentation

In [12]:
# Raw two-sentence text for the segmentation example (still a plain str here).
doc = "Apple is looking for buying a UK startup. Government has given permission"

In [13]:
# NOTE: `doc` is rebound from the raw string to the processed Doc, so
# re-running this cell on its own feeds a Doc (not a str) back into nlp.
doc = nlp(doc)

In [14]:
# Print every sentence found in the text on its own line.
for sent in doc.sents:
    print(sent)

Apple is looking for buying a UK startup.
Government has given permission


# Rule-based Phrase Matching

In [15]:
from spacy.matcher import Matcher
from spacy.tokens import Span

In [16]:
# Contains "hello world" twice: once capitalised with a comma, once plain.
text = "Hello, world! hello world"

In [17]:
# Process the matcher example text; the match offsets below index into this Doc.
doc = nlp(text)

In [18]:
# List the tokens so the start/end indices reported by the matcher can be read off.
for token in doc:
    print(token)

Hello
,
world
!
hello
world


In [20]:
# One dict per token to match, in order.
pattern = [
    {'LOWER': 'hello'},                 # lower-cased text is "hello"
    {'IS_PUNCT': True, 'OP': '?'},      # an optional punctuation token
    {'LOWER': 'world'},                 # lower-cased text is "world"
]

In [21]:
# Build a token matcher that shares the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)
# Register the pattern under the key 'hw'. Patterns go in a list as the
# second argument; the older `add(key, None, pattern)` call order is
# rejected by spaCy v3, while this form is also accepted by spaCy >= 2.2.2.
matcher.add('hw', [pattern])

In [22]:
# Apply the matcher to the Doc; the result is inspected in the next cells.
matches = matcher(doc)

In [23]:
# Display the raw result: a list of (match_id, start, end) tuples.
matches

[(17790654416186116455, 0, 3), (17790654416186116455, 4, 6)]

In [24]:
# Turn each raw match into something readable.
for match_id, start, end in matches:
    # Look the numeric id up in the vocab's string store to get the key back.
    string_id = nlp.vocab.strings[match_id]
    # start/end are token indices into `doc`.
    span = doc[start:end]
    print(match_id, string_id, start, end, span.text)

17790654416186116455 hw 0 3 Hello, world
17790654416186116455 hw 4 6 hello world


# Regular Expressions

In [25]:
# Sample text with a short and a long digit run, used by all regex cells below.
text = "my phone number is 123. ohh its wrong. correct one is 1234567890"

In [26]:
import re

In [27]:
# \d matches a single digit; search() reports only the first occurrence.
re.search(r'\d', text)

<re.Match object; span=(19, 20), match='1'>

In [28]:
# \d{3} matches exactly three digits; findall() returns every
# non-overlapping match, so the long number is cut into groups of three.
re.findall(r'\d{3}', text)

['123', '123', '456', '789']

In [29]:
# \d+ matches a whole run of one or more digits.
re.findall(r'\d+', text)

['123', '1234567890']

In [30]:
# \w+ matches runs of word characters, so spaces and full stops are dropped.
re.findall(r'\w+', text)

['my',
 'phone',
 'number',
 'is',
 '123',
 'ohh',
 'its',
 'wrong',
 'correct',
 'one',
 'is',
 '1234567890']

# Wildcard and Exclusion Matching

In [31]:
# 'c', then two arbitrary characters, then one or more further characters;
# the final `.+` is greedy, so the match extends to the end of the text.
re.findall(r'c...+', text)

['correct one is 1234567890']

In [32]:
# The first '2' followed by everything after it (greedy `.+`).
re.findall(r'2.+', text)

['23. ohh its wrong. correct one is 1234567890']

In [33]:
# [^\d] is a negated class: every single character that is not a digit.
re.findall(r'[^\d]', text)

['m',
 'y',
 ' ',
 'p',
 'h',
 'o',
 'n',
 'e',
 ' ',
 'n',
 'u',
 'm',
 'b',
 'e',
 'r',
 ' ',
 'i',
 's',
 ' ',
 '.',
 ' ',
 'o',
 'h',
 'h',
 ' ',
 'i',
 't',
 's',
 ' ',
 'w',
 'r',
 'o',
 'n',
 'g',
 '.',
 ' ',
 'c',
 'o',
 'r',
 'r',
 'e',
 'c',
 't',
 ' ',
 'o',
 'n',
 'e',
 ' ',
 'i',
 's',
 ' ']

In [34]:
# [^\D] excludes every non-digit, i.e. it matches digits; with `+` this
# returns the same digit runs as \d+.
re.findall(r'[^\D]+', text)

['123', '1234567890']

# Processing Pipeline in spaCy

In [35]:
# Two short financial snippets, processed as a batch by nlp.pipe below.
text = [
    "net income was $9.4 million compared to the prior year of $27 million",
    "revenue exceeds twelve billion dollars with a loss of $1b",
]

In [36]:
# Load the pipeline again (rebinding `nlp`) before the timing cells below.
nlp = spacy.load("en_core_web_sm")

In [37]:
%%timeit
docs = nlp.pipe(text,disable=['tagger','parser'])
for doc in docs:
  for ent in doc.ents:
    print(ent.text,ent.label_)
  print()

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior y

# Hashtags and Emoji Detection

In [39]:
%%timeit
docs = nlp.pipe(text)
for doc in docs:
  for ent in doc.ents:
    print(ent.text,ent.label_)
  print()

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior year DATE
$27 million MONEY

twelve billion dollars MONEY
1b MONEY

$9.4 million MONEY
the prior y