In [1]:
import nltk
from nltk.chunk import ne_chunk, conlltags2tree, tree2conlltags
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from collections import Counter
from pprint import pprint
import en_core_web_sm
import pandas as pd
import spacy
from spacy import displacy

# Lift pandas' row-display cap so frames are never truncated when rendered.
pd.options.display.max_rows = None

In [2]:
## Step 2: Collect & Split Dataset
##################################
import text_preprocessor # Import Variables via Files from Directory
from text_preprocessor import training_text # Import Variables via Files from Directory
##################################
## Preview of the training data
# Join the items of training_text with single spaces into one string, then
# show only its first 250 characters as a sanity check.
text = ' '.join(training_text)
print(text[:250])

japan island country lying off the east coast of asia it consists of a great string of islands in a northeastsouthwest arc that stretches for approximately miles km through the western north pacific ocean nearly the entire land area is taken up by th


In [3]:
# Named-entity chunking with NLTK, done in two explicit steps:
# 1) part-of-speech tag the training tokens, 2) chunk the tagged tokens.
# NOTE(review): assumes training_text is already a list of word tokens -- confirm.
tagged_tokens = pos_tag(training_text)
ne_tree = ne_chunk(tagged_tokens)
# Uncomment to inspect the resulting tree: print(ne_tree)

In [4]:
# Part-of-speech tag every training token and tabulate (word, tag) pairs.
pos_tokenized_words = nltk.pos_tag(training_text)
pos_df = pd.DataFrame(pos_tokenized_words, columns=['word', 'pos tag'])

# Human-readable gloss for each Penn Treebank tag. Tags that are not listed
# here (e.g. punctuation tags) map to NaN in the 'pos tag meaning' column.
pos_abbreviation_dict = {
    'CC': 'coordinating conjunction',
    'CD': 'cardinal digit',
    'DT': 'determiner',
    'EX': 'existential there',
    'FW': 'foreign word',
    'IN': 'preposition/subordinating conjunction',
    'JJ': 'adjective (large)',
    'JJR': 'adjective, comparative (larger)',
    'JJS': 'adjective, superlative (largest)',
    'LS': 'list marker',  # fixed typo: was 'list market'
    'MD': 'modal (could, will)',
    'NN': 'noun, singular (cat, tree)',
    'NNS': 'noun plural (desks)',
    'NNP': 'proper noun, singular (sarah)',
    'NNPS': 'proper noun, plural (indians or americans)',
    'PDT': 'predeterminer (all, both, half)',
    'POS': "possessive ending (parent's)",  # plain apostrophe instead of a left quote
    'PRP': 'personal pronoun (hers, herself, him, himself)',
    'PRP$': 'possessive pronoun (her, his, mine, my, our)',
    'RB': 'adverb (occasionally, swiftly)',
    'RBR': 'adverb, comparative (greater)',
    'RBS': 'adverb, superlative (biggest)',
    'RP': 'particle (about)',
    'SYM': 'symbol',  # previously missing from the table
    'TO': 'infinitive marker (to)',  # fixed typo: was 'infinite marker'
    'UH': 'interjection (goodbye)',
    'VB': 'verb (ask)',
    'VBG': 'verb gerund (judging)',
    'VBD': 'verb past tense (pleaded)',
    'VBN': 'verb past participle (reunified)',
    'VBP': 'verb, present tense not 3rd person singular (wrap)',
    'VBZ': 'verb, present tense with 3rd person singular (bases)',
    'WDT': 'wh-determiner (that, what)',
    'WP': 'wh- pronoun (who)',
    'WP$': 'possessive wh- pronoun (whose)',  # previously missing from the table
    'WRB': 'wh- adverb (how)',
}

# Attach the gloss next to each tag and preview the first rows.
pos_df['pos tag meaning'] = pos_df['pos tag'].map(pos_abbreviation_dict)
pos_df.head(25)

Unnamed: 0,word,pos tag,pos tag meaning
0,japan,NN,"noun, singular (cat, tree)"
1,island,VBP,"verb, present tense not 3rd person singular (w..."
2,country,NN,"noun, singular (cat, tree)"
3,lying,VBG,verb gerund (judging)
4,off,RP,particle (about)
5,the,DT,determiner
6,east,JJ,adjective (large)
7,coast,NN,"noun, singular (cat, tree)"
8,of,IN,preposition/subordinating conjunction
9,asia,NN,"noun, singular (cat, tree)"


In [5]:
# Load the small English spaCy pipeline and parse the whole text once; the
# resulting `doc` carries the per-token entity annotations read below.
nlp = en_core_web_sm.load()
doc = nlp(text)

# One row per token: (text, IOB code, entity label). The token's text is
# stored as a plain str rather than the spaCy Token object, so the 'word'
# column can be compared, grouped and serialized like ordinary strings and
# does not keep the parsed document alive through the DataFrame.
array = [(X.text, X.ent_iob_, X.ent_type_) for X in doc]
df = pd.DataFrame(array, columns=['word','iob','entity type'])

# Gloss for the IOB code; kept under its own name so it is not clobbered by
# the entity-label table defined next.
iob_description_dict = {'B':'beginning of an entity',
                        'I':'inside of an entity',
                        'O':'outside of an entity'}
df['iob description'] = df['iob'].map(iob_description_dict)

# Gloss for the entity label. Tokens outside any entity have an empty label,
# which is not a key here, so their description is NaN.
description_dict = {'PERSON':'people, including fictional',
                    'NORP':'nationalities, religious or political groups',
                    'FAC':'buildings, airports, highways, bridges, etc',
                    'ORG':'companies, agencies, institutions, etc',
                    'GPE':'countries, cities, states',
                    'LOC':'non-GPE locations, mountain ranges, bodies of water',
                    'PRODUCT':'objects, vehicles, foods, etc (not services)',
                    'EVENT':'named hurricanes, battles, wars, sports events, etc',
                    'WORK_OF_ART':'titles of books, songs, etc',
                    'LAW':'named documents made into laws',
                    'LANGUAGE':'any named language',
                    'DATE':'absolute or relative dates or periods',
                    'TIME':'times smaller than a day',
                    'PERCENT':'percentage',
                    'MONEY':'monetary values',
                    'QUANTITY':'measurements, as of weight or distances',
                    'ORDINAL':'first, second, etc',
                    'CARDINAL':'numerals that do not fall under another type'
                   }
df['entity type description'] = df['entity type'].map(description_dict)
df.head(25)

Unnamed: 0,word,iob,entity type,iob description,entity type description
0,japan,B,ORG,beginning of an entity,"companies, agencies, institutions, etc"
1,island,I,ORG,inside of an entity,"companies, agencies, institutions, etc"
2,country,O,,outside of an entity,
3,lying,O,,outside of an entity,
4,off,O,,outside of an entity,
5,the,O,,outside of an entity,
6,east,O,,outside of an entity,
7,coast,O,,outside of an entity,
8,of,O,,outside of an entity,
9,asia,B,LOC,beginning of an entity,"non-GPE locations, mountain ranges, bodies of ..."


In [6]:
# Word / coarse POS / lemma table for content tokens only.
# Reuses `doc` (the parse of `text` produced in the NER cell above) instead of
# running the whole spaCy pipeline over the text a second time, and filters
# stop words and punctuation in a single pass.
array = [
    (token.text, token.pos_, token.lemma_)
    for token in doc
    if not token.is_stop and token.pos_ != 'PUNCT'
]
df = pd.DataFrame(array, columns=['word','type','lemma'])
df.head(25)

Unnamed: 0,word,type,lemma
0,japan,PROPN,japan
1,island,PROPN,island
2,country,NOUN,country
3,lying,VERB,lie
4,east,PROPN,east
5,coast,PROPN,coast
6,asia,PROPN,asia
7,consists,VERB,consist
8,great,ADJ,great
9,string,NOUN,string


In [7]:
# Highlight the recognised entities inline. Reuses `doc` (the parse of `text`
# from the NER cell) rather than re-running the pipeline over the whole text.
displacy.render(doc, jupyter=True, style='ent')

In [8]:
# Noun-phrase chunking demo on a single example sentence.
example_sentence = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

# Chunk grammar: an optional determiner, any number of adjectives, then a
# singular noun, labelled "Noun Phrase".
pattern = 'Noun Phrase: {<DT>?<JJ>*<NN>}'
NPChunker = nltk.RegexpParser(pattern)

# Tokenise and POS-tag the sentence; the chunker parses (word, tag) pairs.
sample = pos_tag(word_tokenize(example_sentence))
result = NPChunker.parse(sample)
# Uncomment to view the chunk tree: result.draw()

In [9]:
# Draw the dependency parse. Reuses `doc` (the parse of `text` from the NER
# cell) rather than re-running the pipeline; 'distance' sets the spacing
# between words in the rendered arcs.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 120})