<a href="https://colab.research.google.com/github/onlyabhilash/Spark_NLP/blob/main/spark-nlp_basics/spark_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): no later cell visible here reads from /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Environment setup: install Java 8, fetch Spark 2.3.0, install findspark and
# Spark NLP 2.7.5, then start a Spark session.
# Order matters here: packages are installed and environment variables are set
# before the imports that depend on them.
# NOTE(review): `%pip install` is generally preferred over `!pip install` so the
# install targets the running kernel's environment.
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

# Download the Spark 2.3.0 / Hadoop 2.7 binary distribution into the current directory.
!wget -q https://archive.apache.org/dist/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz

# Unpack it; SPARK_HOME below assumes the archive was extracted under /content.
!tar xf spark-2.3.0-bin-hadoop2.7.tgz
!pip install -q findspark

# Point the environment at the JDK and Spark installed above, and put the JDK first on PATH.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
os.environ["SPARK_HOME"] = "/content/spark-2.3.0-bin-hadoop2.7"
# Sanity check: print the Java version that is now first on PATH.
! java -version

# findspark.init() presumably uses SPARK_HOME to make that distribution's pyspark importable — see the findspark docs.
import findspark
findspark.init()
from pyspark.sql import SparkSession

# Pin Spark NLP to the version this notebook was written against.
! pip install --ignore-installed -q spark-nlp==2.7.5
import sparknlp
# NOTE(review): wildcard imports hide where names come from; the cells visible here
# only use `sparknlp`, `PretrainedPipeline` and `pd` — consider explicit imports.
from sparknlp.base import *
from sparknlp.annotator import *

# spark23=True presumably selects the Spark NLP build for Apache Spark 2.3.x,
# matching the 2.3.0 distribution downloaded above — confirm against the sparknlp.start docs.
spark = sparknlp.start(spark23=True)

openjdk version "1.8.0_312"
OpenJDK Runtime Environment (build 1.8.0_312-8u312-b07-0ubuntu1~18.04-b07)
OpenJDK 64-Bit Server VM (build 25.312-b07, mixed mode)
[K     |████████████████████████████████| 139 kB 5.5 MB/s 
[?25h

In [None]:
# Obtain the Spark session with the same options as the setup cell.
# A bare start() omits spark23=True; if no session were active yet it would
# request the default Spark NLP build rather than the one for Spark 2.3.x.
spark = sparknlp.start(spark23=True)

# Report the library and engine versions this notebook is running against.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.7.5
Apache Spark version:  2.3.0


In [None]:
from sparknlp.pretrained import PretrainedPipeline

In [None]:
# Fetch the pretrained English `recognize_entities_dl` pipeline (about 159 MB per the download message).
pipeline = PretrainedPipeline('recognize_entities_dl','en')

recognize_entities_dl download started this may take some time.
Approx size to download 159 MB
[OK!]


In [None]:
# Annotate one sentence; the returned dict maps each output column to a list of strings.
result = pipeline.annotate('Google has announced the release of a beta version of the popular TensorFlow machine learning library.')

In [None]:
# Print the full result dict (every output column of the pipeline).
print(result)

{'entities': ['Google', 'TensorFlow'], 'document': ['Google has announced the release of a beta version of the popular TensorFlow machine learning library.'], 'token': ['Google', 'has', 'announced', 'the', 'release', 'of', 'a', 'beta', 'version', 'of', 'the', 'popular', 'TensorFlow', 'machine', 'learning', 'library', '.'], 'ner': ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O'], 'embeddings': ['Google', 'has', 'announced', 'the', 'release', 'of', 'a', 'beta', 'version', 'of', 'the', 'popular', 'TensorFlow', 'machine', 'learning', 'library', '.'], 'sentence': ['Google has announced the release of a beta version of the popular TensorFlow machine learning library.']}


In [None]:
# NER tag of each token, in token order.
print(result['ner'])

['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O']


In [None]:
# Entity chunks extracted from the sentence.
print(result['entities'])

['Google', 'TensorFlow']


In [None]:
# Sentiment Analysis
# Note: this rebinds `pipeline`; the NER pipeline loaded earlier is no longer reachable under that name.
pipeline = PretrainedPipeline('analyze_sentiment','en')

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]


In [None]:
# The misspelling "awoid" is kept on purpose: the pipeline's spell checker corrects it (see `checked` in the result).
result = pipeline.annotate('This is a very boring movie. I recommend others to awoid this movie is not good..')

In [None]:
# Print every output column of the sentiment pipeline.
print(result)

{'checked': ['This', 'is', 'a', 'very', 'boring', 'movie', '.', 'I', 'recommend', 'others', 'to', 'avoid', 'this', 'movie', 'is', 'not', 'good', '.', '.'], 'document': ['This is a very boring movie. I recommend others to awoid this movie is not good..'], 'sentiment': ['negative', 'negative', 'negative'], 'token': ['This', 'is', 'a', 'very', 'boring', 'movie', '.', 'I', 'recommend', 'others', 'to', 'awoid', 'this', 'movie', 'is', 'not', 'good', '.', '.'], 'sentence': ['This is a very boring movie.', 'I recommend others to awoid this movie is not good.', '.']}


In [None]:
# One label per detected sentence; there are three here because the stray trailing '.' is split off as its own sentence.
print(result['sentiment'])

['negative', 'negative', 'negative']


In [None]:
# The word `awoid` has been corrected to `avoid` by the spell checker inside this pipeline.
print(result['checked'])

['This', 'is', 'a', 'very', 'boring', 'movie', '.', 'I', 'recommend', 'others', 'to', 'avoid', 'this', 'movie', 'is', 'not', 'good', '.', '.']


In [None]:
# Sample text reused by later cells. It contains misspellings ("persn",
# "intersting", "brothrs") that the spell-checking stages correct further down.
# The empty first and last items give the text its leading and trailing newline.
testDoc = '\n'.join([
    '',
    'Peter is a very good persn.',
    'My life in Russia is very intersting.',
    "John and Peter are brothrs. However they don't support each other that much.",
    'Lucas Nogal Dunbercker is no longer happy. He has a good car though.',
    'Europe is very culture rich. There are huge churches! and big houses!',
    '',
])

### Explain Document pipelines (ML first, then DL)

**Stages**
- DocumentAssembler
- SentenceDetector
- Tokenizer
- NER (NER with GloVe 100D embeddings, CoNLL2003 dataset) — present in `explain_document_dl` only; the `explain_document_ml` stage list has no NER stage
- Lemmatizer
- Stemmer
- Part of Speech
- SpellChecker (Norvig)

In [None]:
# Load the pretrained `explain_document_ml` pipeline; this rebinds `pipeline` once more.
pipeline = PretrainedPipeline('explain_document_ml', lang='en')

explain_document_ml download started this may take some time.
Approx size to download 9.4 MB
[OK!]


In [None]:
# Inspect the stages that make up the loaded pipeline.
pipeline.model.stages

[document_2ec0b742eccd,
 SENTENCE_98fb8e28cb7b,
 REGEX_TOKENIZER_1f63ed636a13,
 SPELL_e4ea67180337,
 LEMMATIZER_c62ad8f355f9,
 STEMMER_75edcc4a9cdb,
 POS_29fd848601e6]

In [None]:
# Load pretrained pipeline from local disk:
# NOTE(review): hard-coded absolute path with a specific model version/timestamp —
# presumably the cache directory created by the download above; verify it exists in your environment.

pipeline_local = PretrainedPipeline.from_disk('/root/cache_pretrained/explain_document_ml_en_2.4.0_2.4_1580252705962')

In [None]:
# Printing the wrapper only shows its default object repr; it confirms the load succeeded.
print(pipeline_local)

<sparknlp.pretrained.PretrainedPipeline object at 0x7f1aaa7a11d0>


In [None]:
%%time
# Time one annotation of the multi-sentence sample text with the ML pipeline.
result = pipeline.annotate(testDoc)

CPU times: user 66.1 ms, sys: 26.9 ms, total: 93 ms
Wall time: 1.58 s


In [None]:
# Output columns produced by `explain_document_ml`.
result.keys()

dict_keys(['document', 'spell', 'pos', 'lemmas', 'token', 'stems', 'sentence'])

In [None]:
# Sentences found by the sentence detector.
result['sentence']

['Peter is a very good persn.',
 'My life in Russia is very intersting.',
 'John and Peter are brothrs.',
 "However they don't support each other that much.",
 'Lucas Nogal Dunbercker is no longer happy.',
 'He has a good car though.',
 'Europe is very culture rich.',
 'There are huge churches!',
 'and big houses!']

In [None]:
# Tokens as produced by the tokenizer (misspellings are still present at this stage).
result['token']

['Peter',
 'is',
 'a',
 'very',
 'good',
 'persn',
 '.',
 'My',
 'life',
 'in',
 'Russia',
 'is',
 'very',
 'intersting',
 '.',
 'John',
 'and',
 'Peter',
 'are',
 'brothrs',
 '.',
 'However',
 'they',
 "don't",
 'support',
 'each',
 'other',
 'that',
 'much',
 '.',
 'Lucas',
 'Nogal',
 'Dunbercker',
 'is',
 'no',
 'longer',
 'happy',
 '.',
 'He',
 'has',
 'a',
 'good',
 'car',
 'though',
 '.',
 'Europe',
 'is',
 'very',
 'culture',
 'rich',
 '.',
 'There',
 'are',
 'huge',
 'churches',
 '!',
 'and',
 'big',
 'houses',
 '!']

In [None]:
# Pair each token with its part-of-speech tag.
list(zip(result['token'],result['pos']))

[('Peter', 'NNP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('persn', 'NN'),
 ('.', '.'),
 ('My', 'PRP$'),
 ('life', 'NN'),
 ('in', 'IN'),
 ('Russia', 'NNP'),
 ('is', 'VBZ'),
 ('very', 'RB'),
 ('intersting', 'JJ'),
 ('.', '.'),
 ('John', 'NNP'),
 ('and', 'CC'),
 ('Peter', 'NNP'),
 ('are', 'VBP'),
 ('brothrs', 'NNS'),
 ('.', '.'),
 ('However', 'RB'),
 ('they', 'PRP'),
 ("don't", 'VBP'),
 ('support', 'VB'),
 ('each', 'DT'),
 ('other', 'JJ'),
 ('that', 'IN'),
 ('much', 'JJ'),
 ('.', '.'),
 ('Lucas', 'NNP'),
 ('Nogal', 'NNP'),
 ('Dunbercker', 'NNP'),
 ('is', 'VBZ'),
 ('no', 'DT'),
 ('longer', 'RB'),
 ('happy', 'JJ'),
 ('.', '.'),
 ('He', 'PRP'),
 ('has', 'VBZ'),
 ('a', 'DT'),
 ('good', 'JJ'),
 ('car', 'NN'),
 ('though', 'IN'),
 ('.', '.'),
 ('Europe', 'NNP'),
 ('is', 'VBZ'),
 ('very', 'RB'),
 ('culture', 'RB'),
 ('rich', 'JJ'),
 ('.', '.'),
 ('There', 'EX'),
 ('are', 'VBP'),
 ('huge', 'JJ'),
 ('churches', 'NNS'),
 ('!', '.'),
 ('and', 'CC'),
 ('big', 'JJ'),
 ('house

In [None]:
# Show each token next to its lemma, stem and spell-checked form.
list(zip(result['token'], result['lemmas'], result['stems'], result['spell']))

[('Peter', 'Peter', 'peter', 'Peter'),
 ('is', 'be', 'i', 'is'),
 ('a', 'a', 'a', 'a'),
 ('very', 'very', 'veri', 'very'),
 ('good', 'good', 'good', 'good'),
 ('persn', 'person', 'person', 'person'),
 ('.', '.', '.', '.'),
 ('My', 'My', 'my', 'My'),
 ('life', 'life', 'life', 'life'),
 ('in', 'in', 'in', 'in'),
 ('Russia', 'Russia', 'russia', 'Russia'),
 ('is', 'be', 'i', 'is'),
 ('very', 'very', 'veri', 'very'),
 ('intersting', 'interest', 'interest', 'interesting'),
 ('.', '.', '.', '.'),
 ('John', 'John', 'john', 'John'),
 ('and', 'and', 'and', 'and'),
 ('Peter', 'Peter', 'peter', 'Peter'),
 ('are', 'be', 'ar', 'are'),
 ('brothrs', 'broth', 'broth', 'broths'),
 ('.', '.', '.', '.'),
 ('However', 'However', 'howev', 'However'),
 ('they', 'they', 'thei', 'they'),
 ("don't", "don't", "don't", "don't"),
 ('support', 'support', 'support', 'support'),
 ('each', 'each', 'each', 'each'),
 ('other', 'other', 'other', 'other'),
 ('that', 'that', 'that', 'that'),
 ('much', 'much', 'much', 'much

In [None]:
import pandas as pd

# One row per token. Each DataFrame column is filled from the annotation key
# paired with it below; the pair order fixes the column order.
df = pd.DataFrame({
    column: result[key]
    for column, key in (
        ('token', 'token'),
        ('corrected', 'spell'),
        ('POS', 'pos'),
        ('lemmas', 'lemmas'),
        ('stems', 'stems'),
    )
})

df

Unnamed: 0,token,corrected,POS,lemmas,stems
0,Peter,Peter,NNP,Peter,peter
1,is,is,VBZ,be,i
2,a,a,DT,a,a
3,very,very,RB,very,veri
4,good,good,JJ,good,good
5,persn,person,NN,person,person
6,.,.,.,.,.
7,My,My,PRP$,My,my
8,life,life,NN,life,life
9,in,in,IN,in,in


In [None]:
# Load the deep-learning variant; unlike the ML pipeline, its stage list includes word embeddings and NER.
pipeline_dl = PretrainedPipeline('explain_document_dl', lang='en')

explain_document_dl download started this may take some time.
Approx size to download 168.4 MB
[OK!]


In [None]:
# The model object wrapped by the PretrainedPipeline.
pipeline_dl.model

pipeline_9e9d13784977

In [None]:
# Inspect the stages of the DL pipeline.
pipeline_dl.model.stages

[document_7939d5bf1083,
 SENTENCE_05265b07c745,
 REGEX_TOKENIZER_b4f4b39b56e3,
 SPELL_e4ea67180337,
 LEMMATIZER_c62ad8f355f9,
 STEMMER_ba49f7631065,
 POS_29fd848601e6,
 WORD_EMBEDDINGS_MODEL_48cffc8b9a76,
 NerDLModel_d4424c9af5f4,
 NER_CONVERTER_a81db9af2d23]

In [None]:
# Second-to-last stage: the NerDLModel (the last stage is the NER converter).
pipeline_dl.model.stages[-2]

NerDLModel_d4424c9af5f4

In [None]:
# Storage reference recorded on the NER stage — presumably the embeddings it expects as input.
pipeline_dl.model.stages[-2].getStorageRef()

'glove_100d'

In [None]:
# Labels known to the NER model.
pipeline_dl.model.stages[-2].getClasses()

['O', 'B-ORG', 'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC', 'I-LOC', 'I-MISC']

In [None]:
%%time

result = pipeline_dl.annotate(testDoc)

result.keys()

CPU times: user 90.3 ms, sys: 31.6 ms, total: 122 ms
Wall time: 1.41 s


In [None]:
# Output columns of `explain_document_dl`; note the key names differ from the ML pipeline ('lemma'/'stem'/'checked' vs 'lemmas'/'stems'/'spell').
result.keys()

dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])

In [None]:
# Entity chunks; multi-token names come back as a single chunk.
result['entities']

['Peter', 'Russia', 'John', 'Peter', 'Lucas Nogal Dunbercker', 'Europe']

In [None]:
# One row per token. Each DataFrame column is filled from the DL-pipeline
# annotation key paired with it; the pair order fixes the column order.
df = pd.DataFrame({
    column: result[key]
    for column, key in (
        ('token', 'token'),
        ('ner_label', 'ner'),
        ('spell_corrected', 'checked'),
        ('POS', 'pos'),
        ('lemmas', 'lemma'),
        ('stems', 'stem'),
    )
})

df

Unnamed: 0,token,ner_label,spell_corrected,POS,lemmas,stems
0,Peter,B-PER,Peter,NNP,Peter,peter
1,is,O,is,VBZ,be,i
2,a,O,a,DT,a,a
3,very,O,very,RB,very,veri
4,good,O,good,JJ,good,good
5,persn,O,person,NN,person,person
6,.,O,.,.,.,.
7,My,O,My,PRP$,My,my
8,life,O,life,NN,life,life
9,in,O,in,IN,in,in


RECOGNISE ENTITY DL

In [None]:
# Load the NER-only pipeline under its own name (the same pipeline id was loaded as `pipeline` near the top of the notebook).
recognize_entities = PretrainedPipeline('recognize_entities_dl', lang='en')

recognize_entities_dl download started this may take some time.
Approx size to download 159 MB
[OK!]


In [None]:
# Inspect the stages of the NER pipeline.
recognize_entities.model.stages

[document_1c58bc1aca5d,
 SENTENCE_328d8a47c1a8,
 REGEX_TOKENIZER_e4d729e653b0,
 WORD_EMBEDDINGS_MODEL_48cffc8b9a76,
 NerDLModel_d4424c9af5f4,
 NER_CONVERTER_389b80afbf7d]

In [None]:
# Index 3 is the word-embeddings stage in the stage list printed by the previous cell; show its storage reference.
recognize_entities.model.stages[3].getStorageRef()

'glove_100d'

In [None]:
# Index 4 is the NerDLModel stage; list the labels it knows.
recognize_entities.model.stages[4].getClasses()

['O', 'B-ORG', 'B-LOC', 'B-PER', 'I-PER', 'I-ORG', 'B-MISC', 'I-LOC', 'I-MISC']

In [None]:
# Reuse the `testDoc` sample defined earlier in the notebook instead of
# re-declaring an identical copy that could drift out of sync.
result = recognize_entities.annotate(testDoc)

# Pair every token with its predicted NER tag.
list(zip(result['token'], result['ner']))

[('Peter', 'B-PER'),
 ('is', 'O'),
 ('a', 'O'),
 ('very', 'O'),
 ('good', 'O'),
 ('persn', 'O'),
 ('.', 'O'),
 ('My', 'O'),
 ('life', 'O'),
 ('in', 'O'),
 ('Russia', 'B-LOC'),
 ('is', 'O'),
 ('very', 'O'),
 ('intersting', 'O'),
 ('.', 'O'),
 ('John', 'B-PER'),
 ('and', 'O'),
 ('Peter', 'B-PER'),
 ('are', 'O'),
 ('brothrs', 'O'),
 ('.', 'O'),
 ('However', 'O'),
 ('they', 'O'),
 ("don't", 'O'),
 ('support', 'O'),
 ('each', 'O'),
 ('other', 'O'),
 ('that', 'O'),
 ('much', 'O'),
 ('.', 'O'),
 ('Lucas', 'B-ORG'),
 ('Nogal', 'I-ORG'),
 ('Dunbercker', 'I-ORG'),
 ('is', 'O'),
 ('no', 'O'),
 ('longer', 'O'),
 ('happy', 'O'),
 ('.', 'O'),
 ('He', 'O'),
 ('has', 'O'),
 ('a', 'O'),
 ('good', 'O'),
 ('car', 'O'),
 ('though', 'O'),
 ('.', 'O'),
 ('Europe', 'B-LOC'),
 ('is', 'O'),
 ('very', 'O'),
 ('culture', 'O'),
 ('rich', 'O'),
 ('.', 'O'),
 ('There', 'O'),
 ('are', 'O'),
 ('huge', 'O'),
 ('churches', 'O'),
 ('!', 'O'),
 ('and', 'O'),
 ('big', 'O'),
 ('houses', 'O'),
 ('!', 'O')]

CLEAN STOPWORDS

In [None]:
# Load the pretrained stop-word removal pipeline.
clean_stop = PretrainedPipeline('clean_stop',lang = 'en')

clean_stop download started this may take some time.
Approx size to download 12.4 KB
[OK!]


In [None]:
# Run stop-word removal on the sample text, then list the output columns.
result = clean_stop.annotate(testDoc)
result.keys()

dict_keys(['document', 'sentence', 'token', 'cleanTokens'])

In [None]:
# Re-join the tokens that survived stop-word removal into one string.
' '.join(result['cleanTokens'])

"Peter good persn . life Russia intersting . John Peter brothrs . don't support . Lucas Nogal Dunbercker longer happy . good car . Europe culture rich . huge churches ! big houses !"

CLEAN SLANG

In [None]:
# Load the slang-normalisation pipeline, run it on a short informal sentence,
# and list the output columns.
clean_slang = PretrainedPipeline('clean_slang', lang = 'en')
result = clean_slang.annotate(' Whatsup bro, call me ASAP')
result.keys()

clean_slang download started this may take some time.
Approx size to download 21.8 KB
[OK!]


dict_keys(['document', 'token', 'normal'])

In [None]:
# `normal` holds the normalised tokens; join them back into a sentence.
' '.join(result['normal'])

'how are you friend call me as soon as possible'

In [None]:
# Inspect the stages of the slang pipeline.
clean_slang.model.stages

[document_d30c0ae7a10b, REGEX_TOKENIZER_4ec6a1a85734, NORMALIZER_64aaaca9eae5]

In [None]:
# Last stage: the normalizer that produces the `normal` column.
clean_slang.model.stages[-1]

NORMALIZER_64aaaca9eae5

### Spell Checker 

(Norvig algorithm)

ref: https://norvig.com/spell-correct.html

In [None]:
# Load the pretrained Norvig-style spell-checking pipeline.
check_spelling = PretrainedPipeline('check_spelling',lang = 'en')

check_spelling download started this may take some time.
Approx size to download 892.6 KB
[OK!]


In [None]:
# Reuse the `testDoc` sample defined earlier in the notebook instead of
# re-declaring an identical copy that could drift out of sync.
result = check_spelling.annotate(testDoc)

# Output columns of the spell-checking pipeline.
result.keys()

dict_keys(['document', 'sentence', 'token', 'checked'])

In [None]:
# Pair each original token with the spell checker's output.
list(zip(result['token'], result['checked']))

[('Peter', 'Peter'),
 ('is', 'is'),
 ('a', 'a'),
 ('very', 'very'),
 ('good', 'good'),
 ('persn', 'person'),
 ('.', '.'),
 ('My', 'My'),
 ('life', 'life'),
 ('in', 'in'),
 ('Russia', 'Russia'),
 ('is', 'is'),
 ('very', 'very'),
 ('intersting', 'interesting'),
 ('.', '.'),
 ('John', 'John'),
 ('and', 'and'),
 ('Peter', 'Peter'),
 ('are', 'are'),
 ('brothrs', 'brothers'),
 ('.', '.'),
 ('However', 'However'),
 ('they', 'they'),
 ("don't", "don't"),
 ('support', 'support'),
 ('each', 'each'),
 ('other', 'other'),
 ('that', 'that'),
 ('much', 'much'),
 ('.', '.'),
 ('Lucas', 'Lucas'),
 ('Nogal', 'Nigel'),
 ('Dunbercker', 'Dunbercker'),
 ('is', 'is'),
 ('no', 'no'),
 ('longer', 'longer'),
 ('happy', 'happy'),
 ('.', '.'),
 ('He', 'He'),
 ('has', 'has'),
 ('a', 'a'),
 ('good', 'good'),
 ('car', 'car'),
 ('though', 'though'),
 ('.', '.'),
 ('Europe', 'Europe'),
 ('is', 'is'),
 ('very', 'very'),
 ('culture', 'culture'),
 ('rich', 'rich'),
 ('.', '.'),
 ('There', 'There'),
 ('are', 'are'),
 ('h

### Spell Checker DL

https://medium.com/spark-nlp/applying-context-aware-spell-checking-in-spark-nlp-3c29c46963bc

In [None]:
# Load the context-aware (deep-learning) spell-checking pipeline.
check_spelling_dl = PretrainedPipeline('check_spelling_dl', lang='en')

check_spelling_dl download started this may take some time.
Approx size to download 112.2 MB
[OK!]


In [None]:
# "ueather" is the misspelled word the context-aware checker has to resolve.
text = 'We will go to swimming if the ueather is nice.'
result = check_spelling_dl.annotate(text)

# (original token, corrected token) pairs, in token order.
[(token, fixed) for token, fixed in zip(result['token'], result['checked'])]

[('We', 'We'),
 ('will', 'will'),
 ('go', 'go'),
 ('to', 'to'),
 ('swimming', 'swimming'),
 ('if', 'if'),
 ('the', 'the'),
 ('ueather', 'Heather'),
 ('is', 'is'),
 ('nice', 'nice'),
 ('.', '.')]

In [None]:
# Output columns of the DL spell checker.
result.keys()

dict_keys(['document', 'sentences', 'token', 'checked'])

In [None]:
# check for the different occurrences of the word "ueather"
# (a list literal needs no backslash line continuations)
examples = [
    'We will go to swimming if the ueather is nice.',
    "I have a black ueather jacket, so nice.",
    "I introduce you to my sister, she is called ueather.",
]

# Passing a list to annotate() yields one result dict per input text.
results = check_spelling_dl.annotate(examples)

# A dedicated loop name keeps the notebook-level `result` from being overwritten.
for example_result in results:
    print(list(zip(example_result['token'], example_result['checked'])))

[('We', 'We'), ('will', 'will'), ('go', 'go'), ('to', 'to'), ('swimming', 'swimming'), ('if', 'if'), ('the', 'the'), ('ueather', 'Heather'), ('is', 'is'), ('nice', 'nice'), ('.', '.')]
[('I', 'I'), ('have', 'have'), ('a', 'a'), ('black', 'black'), ('ueather', 'leather'), ('jacket', 'jacket'), (',', ','), ('so', 'so'), ('nice', 'nice'), ('.', '.')]
[('I', 'I'), ('introduce', 'introduce'), ('you', 'you'), ('to', 'to'), ('my', 'my'), ('sister', 'sister'), (',', ','), ('she', 'she'), ('is', 'is'), ('called', 'called'), ('ueather', 'Heather'), ('.', '.')]


In [None]:
# For each example, print the source text followed by only the
# (token, correction) pairs that the spell checker actually changed.
# A dedicated loop name keeps the notebook-level `result` from being overwritten.
for example_result in results:
    changed_pairs = [
        (token, fixed)
        for token, fixed in zip(example_result['token'], example_result['checked'])
        if token != fixed
    ]
    print(example_result['document'], '>>', changed_pairs)

['We will go to swimming if the ueather is nice.'] >> [('ueather', 'Heather')]
['I have a black ueather jacket, so nice.'] >> [('ueather', 'leather')]
['I introduce you to my sister, she is called ueather.'] >> [('ueather', 'Heather')]


In [None]:
# if we had tried the same with spell_checker (previous version)

results = check_spelling.annotate(examples)

# A dedicated loop name keeps the notebook-level `result` from being overwritten.
for example_result in results:
    print(list(zip(example_result['token'], example_result['checked'])))

[('We', 'We'), ('will', 'will'), ('go', 'go'), ('to', 'to'), ('swimming', 'swimming'), ('if', 'if'), ('the', 'the'), ('ueather', 'weather'), ('is', 'is'), ('nice', 'nice'), ('.', '.')]
[('I', 'I'), ('have', 'have'), ('a', 'a'), ('black', 'black'), ('ueather', 'weather'), ('jacket', 'jacket'), (',', ','), ('so', 'so'), ('nice', 'nice'), ('.', '.')]
[('I', 'I'), ('introduce', 'introduce'), ('you', 'you'), ('to', 'to'), ('my', 'my'), ('sister', 'sister'), (',', ','), ('she', 'she'), ('is', 'is'), ('called', 'called'), ('ueather', 'weather'), ('.', '.')]


In [None]:
# For each example, print the source text followed by only the
# (token, correction) pairs that the spell checker actually changed.
# A dedicated loop name keeps the notebook-level `result` from being overwritten.
for example_result in results:
    changed_pairs = [
        (token, fixed)
        for token, fixed in zip(example_result['token'], example_result['checked'])
        if token != fixed
    ]
    print(example_result['document'], '>>', changed_pairs)

['We will go to swimming if the ueather is nice.'] >> [('ueather', 'weather')]
['I have a black ueather jacket, so nice.'] >> [('ueather', 'weather')]
['I introduce you to my sister, she is called ueather.'] >> [('ueather', 'weather')]


Parsing a list of texts

In [None]:
# Three short texts annotated in one call below. They contain typos
# ("pioner", "wrate", "aircrast") for the spell-checking stage to work on.
testDoc_list = [
    'French author who helped pioner the science-fiction genre.',
    'Verne wrate about space, air, and underwater travel before navigable aircrast',
    'Practical submarines were invented, and before any means of space travel had been devised.',
]

testDoc_list

['French author who helped pioner the science-fiction genre.',
 'Verne wrate about space, air, and underwater travel before navigable aircrast',
 'Practical submarines were invented, and before any means of space travel had been devised.']

In [None]:
# NOTE(review): when the notebook runs top to bottom, `pipeline` still holds
# `explain_document_ml` from the earlier cell, so this reload looks redundant — confirm before removing.
pipeline = PretrainedPipeline('explain_document_ml', lang='en')

explain_document_ml download started this may take some time.
Approx size to download 9.4 MB
[OK!]


In [None]:
# Annotating a list of texts returns a list with one result dict per text;
# show how many came back.
result_list = pipeline.annotate(testDoc_list)
len(result_list)

3

In [None]:
# Full annotation dict for the first text.
result_list[0]

{'document': ['French author who helped pioner the science-fiction genre.'],
 'lemmas': ['French',
  'author',
  'who',
  'help',
  'pioneer',
  'the',
  'sciencefiction',
  'genre',
  '.'],
 'pos': ['JJ', 'NN', 'WP', 'VBD', 'NN', 'DT', 'NN', 'NN', '.'],
 'sentence': ['French author who helped pioner the science-fiction genre.'],
 'spell': ['French',
  'author',
  'who',
  'helped',
  'pioneer',
  'the',
  'sciencefiction',
  'genre',
  '.'],
 'stems': ['french',
  'author',
  'who',
  'help',
  'pioneer',
  'the',
  'sciencefict',
  'genr',
  '.'],
 'token': ['French',
  'author',
  'who',
  'helped',
  'pioner',
  'the',
  'science-fiction',
  'genre',
  '.']}

### Using fullAnnotate to get more details

```
annotatorType: String, 
begin: Int, 
end: Int, 
result: String, (this is what annotate returns)
metadata: Map[String, String], 
embeddings: Array[Float]
```

In [None]:
text = 'Peter Parker is a nice guy and lives in New York'

# pipeline_dl >> explain_document_dl
# fullAnnotate returns a list with one dict per input text; its values are
# Annotation objects (with begin/end offsets and metadata) rather than plain strings.

detailed_result = pipeline_dl.fullAnnotate(text)
detailed_result

[{'checked': [Annotation(token, 0, 4, Peter, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 6, 11, Parker, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 13, 14, is, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 16, 16, a, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 18, 21, nice, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 23, 25, guy, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 27, 29, and, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 31, 35, lives, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 37, 38, in, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 40, 42, New, {'confidence': '1.0', 'sentence': '0'}),
   Annotation(token, 44, 47, York, {'confidence': '1.0', 'sentence': '0'})],
  'document': [Annotation(document, 0, 47, Peter Parker is a nice guy and lives in New York, {})],
  'embeddings': [Annotation(word_embeddings, 0, 4, Peter, {'is

In [None]:
# Entity chunk annotations for the first (and only) input text.
detailed_result[0]['entities']

[Annotation(chunk, 0, 11, Peter Parker, {'entity': 'PER', 'sentence': '0', 'chunk': '0'}),
 Annotation(chunk, 40, 47, New York, {'entity': 'LOC', 'sentence': '0', 'chunk': '1'})]

In [None]:
# `.result` is the text of the chunk.
detailed_result[0]['entities'][0].result

'Peter Parker'

In [None]:
# Collect the text and the entity label of every detected chunk, then show
# them side by side in a two-column table.
chunks = [chunk.result for chunk in detailed_result[0]['entities']]
entities = [chunk.metadata['entity'] for chunk in detailed_result[0]['entities']]

df = pd.DataFrame({'chunks' : chunks,'entities' : entities})
df

Unnamed: 0,chunks,entities
0,Peter Parker,PER
1,New York,LOC


In [None]:
# One record per token: sentence id, token text, character offsets,
# part-of-speech tag and NER tag. The three annotation lists are walked in parallel.
tuples = [
    (
        int(token.metadata['sentence']),
        token.result,
        token.begin,
        token.end,
        pos_tag.result,
        ner_tag.result,
    )
    for token, pos_tag, ner_tag in zip(
        detailed_result[0]["token"],
        detailed_result[0]["pos"],
        detailed_result[0]["ner"],
    )
]

df = pd.DataFrame(tuples, columns = ['sentence','token','start','end','pos','ner'])
df

Unnamed: 0,sentence,token,start,end,pos,ner
0,0,Peter,0,4,NNP,B-PER
1,0,Parker,6,11,NNP,I-PER
2,0,is,13,14,VBZ,O
3,0,a,16,16,DT,O
4,0,nice,18,21,JJ,O
5,0,guy,23,25,NN,O
6,0,and,27,29,CC,O
7,0,lives,31,35,NNS,O
8,0,in,37,38,IN,O
9,0,New,40,42,NNP,B-LOC


### Use the pretrained `match_chunks` pipeline to extract individual noun phrases

**Stages**
- DocumentAssembler
- SentenceDetector
- Tokenizer
- Part of Speech
- Chunker

Pipeline:

- The pipeline uses regex `<DT>?<JJ>*<NN>+`
- which states that whenever the chunker finds an optional determiner (DT), followed by any number of adjectives (JJ), followed by one or more nouns (NN), a noun phrase (NP) chunk is formed.

In [None]:
# Load the noun-phrase chunking pipeline; this rebinds `pipeline`.
pipeline = PretrainedPipeline('match_chunks', lang='en')

match_chunks download started this may take some time.
Approx size to download 4.3 MB
[OK!]


In [None]:
# Inspect the stages of the chunking pipeline.
pipeline.model.stages

[document_07d28fdac7a8,
 SENTENCE_2b5ea649b767,
 REGEX_TOKENIZER_09632b6f1612,
 POS_29fd848601e6,
 CHUNKER_7c7ff8d3e0eb]

In [None]:
# Only "The book" becomes a chunk here: "chapters" is tagged NNS, which the NN-based pattern does not match.
result = pipeline.annotate("The book has many chapters") # single noun phrase
result

{'chunk': ['The book'],
 'document': ['The book has many chapters'],
 'pos': ['DT', 'NN', 'VBZ', 'JJ', 'NNS'],
 'sentence': ['The book has many chapters'],
 'token': ['The', 'book', 'has', 'many', 'chapters']}

In [None]:
# The noun-phrase chunks only.
result['chunk']

['The book']

In [None]:
result = pipeline.annotate("the little yellow dog barked at the cat") # multiple noun phrases
result

{'chunk': ['the little yellow dog', 'the cat'],
 'document': ['the little yellow dog barked at the cat'],
 'pos': ['DT', 'JJ', 'JJ', 'NN', 'JJ', 'IN', 'DT', 'NN'],
 'sentence': ['the little yellow dog barked at the cat'],
 'token': ['the', 'little', 'yellow', 'dog', 'barked', 'at', 'the', 'cat']}

In [None]:
# Both noun phrases found in the sentence.
result['chunk']

['the little yellow dog', 'the cat']

In [None]:
# Load the pretrained date-matching pipeline; this rebinds `pipeline` again.
pipeline = PretrainedPipeline('match_datetime', lang='en')

match_datetime download started this may take some time.
Approx size to download 12.9 KB
[OK!]


In [None]:
# Relative expressions ("yesterday", "next week") come back as concrete dates —
# presumably resolved against the date the cell is run, so this output changes from run to run.
result = pipeline.annotate("I saw him yesterday and he told me that he will visit us next week")

result

{'date': ['2022/03/17', '2022/03/09'],
 'document': ['I saw him yesterday and he told me that he will visit us next week'],
 'sentence': ['I saw him yesterday and he told me that he will visit us next week'],
 'token': ['I',
  'saw',
  'him',
  'yesterday',
  'and',
  'he',
  'told',
  'me',
  'that',
  'he',
  'will',
  'visit',
  'us',
  'next',
  'week']}

In [None]:
# Same sentence through fullAnnotate: each date annotation carries the
# character span of the phrase it was derived from.
detailed_result = pipeline.fullAnnotate("I saw him yesterday and he told me that he will visit us next week")

detailed_result

[{'date': [Annotation(date, 57, 65, 2022/03/17, {'sentence': '0'}),
   Annotation(date, 10, 18, 2022/03/09, {'sentence': '0'})],
  'document': [Annotation(document, 0, 65, I saw him yesterday and he told me that he will visit us next week, {})],
  'sentence': [Annotation(document, 0, 65, I saw him yesterday and he told me that he will visit us next week, {'sentence': '0'})],
  'token': [Annotation(token, 0, 0, I, {'sentence': '0'}),
   Annotation(token, 2, 4, saw, {'sentence': '0'}),
   Annotation(token, 6, 8, him, {'sentence': '0'}),
   Annotation(token, 10, 18, yesterday, {'sentence': '0'}),
   Annotation(token, 20, 22, and, {'sentence': '0'}),
   Annotation(token, 24, 25, he, {'sentence': '0'}),
   Annotation(token, 27, 30, told, {'sentence': '0'}),
   Annotation(token, 32, 33, me, {'sentence': '0'}),
   Annotation(token, 35, 38, that, {'sentence': '0'}),
   Annotation(token, 40, 41, he, {'sentence': '0'}),
   Annotation(token, 43, 46, will, {'sentence': '0'}),
   Annotation(token, 

In [None]:
# One record per token: sentence id, token text and character offsets.
tuples = [
    (int(token.metadata['sentence']), token.result, token.begin, token.end)
    for token in detailed_result[0]["token"]
]

df = pd.DataFrame(tuples, columns=['sent_id','token','start','end'])

df

Unnamed: 0,sent_id,token,start,end
0,0,I,0,0
1,0,saw,2,4
2,0,him,6,8
3,0,yesterday,10,18
4,0,and,20,22
5,0,he,24,25
6,0,told,27,30
7,0,me,32,33
8,0,that,35,38
9,0,he,40,41


### Sentiment Analysis

In [None]:
# Load the rule-based sentiment pipeline under its own name (the same pipeline id was loaded as `pipeline` near the top).
sentiment = PretrainedPipeline('analyze_sentiment', lang='en')

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]


In [None]:
# Annotate one sentence and show its sentiment label.
result = sentiment.annotate("The movie I watched today was not a good one")

result['sentiment']

['negative']

#### DL version (trained on imdb)

In [None]:
# NOTE(review): this ~936 MB pipeline is loaded but never used in the cells visible here
# (only `sentiment_imdb_glove` is applied below) — confirm it is needed.
sentiment_imdb = PretrainedPipeline('analyze_sentimentdl_use_imdb', lang='en')

analyze_sentimentdl_use_imdb download started this may take some time.
Approx size to download 935.7 MB
[OK!]


In [None]:
# Load the GloVe-based IMDB sentiment pipeline (a much smaller download than the USE-based one).
sentiment_imdb_glove = PretrainedPipeline('analyze_sentimentdl_glove_imdb', lang='en')

analyze_sentimentdl_glove_imdb download started this may take some time.
Approx size to download 154 MB
[OK!]


In [None]:
# A multi-sentence review; the pipeline returns a single label for the whole text.
comment = '''
It's a very scary film but what impressed me was how true the film sticks to the original's tricks; it isn't filled with loud in-your-face jump scares, in fact, a lot of what makes this film scary is the slick cinematography and intricate shadow play. The use of lighting and creation of atmosphere is what makes this film so tense, which is why it's perfectly suited for those who like Horror movies but without the obnoxious gore.
'''
result = sentiment_imdb_glove.annotate(comment)

result['sentiment']

['pos']

In [None]:
# fullAnnotate exposes the per-class scores ('pos' / 'neg') in the annotation metadata.
sentiment_imdb_glove.fullAnnotate(comment)[0]['sentiment']

[Annotation(category, 0, 433, pos, {'sentence': '0', 'pos': '0.98675287', 'neg': '0.013247096'})]