In [1]:
import nltk
import nltk.book as nb

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
import keras

Using TensorFlow backend.


In [3]:
from keras.models import Sequential

model = Sequential()

In [4]:
text = '''Love finding hidden patterns and learning about Data and its value in the real world. I have a Highly Motivated
and Entrepreneurial Mindset. I mainly use Python and the PyData stack for anything that I do but I also know
a bit about Apache Spark, TensorFlow, R, SQL and much other stuff.'''

In [5]:
nltk.download('punkt')

# tokenize doc
tokenized_doc = nltk.word_tokenize(text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [6]:
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# tag sentences and use nltk's Named Entity Chunker
tagged_sentences = nltk.pos_tag(tokenized_doc)
ne_chunked_sents = nltk.ne_chunk(tagged_sentences)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Rit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Rit\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Rit\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [7]:
# extract all named entities
named_entities = []
for tagged_tree in ne_chunked_sents:
    if hasattr(tagged_tree, 'label'):
        entity_name = ' '.join(c[0] for c in tagged_tree.leaves()) #
        entity_type = tagged_tree.label() # get NE category
        named_entities.append((entity_name, entity_type))
print(named_entities)

[('Data', 'PERSON'), ('Highly Motivated', 'ORGANIZATION'), ('Entrepreneurial Mindset', 'ORGANIZATION'), ('Python', 'PERSON'), ('PyData', 'ORGANIZATION'), ('Apache Spark', 'PERSON'), ('TensorFlow', 'ORGANIZATION'), ('R', 'GPE'), ('SQL', 'ORGANIZATION')]


In [8]:
data = [(['Linux', 'is', 'the', 'best', 'OS'], ['OS','IR','IR','IR','IR']),
(['Ubuntu', 'is', 'my', 'favourite', 'OS'], ['OS','IR','IR','IR','IR'])]

corpus = []

for (doc, tags) in data:
    doc_tag = []
    for word, tag in zip(doc,tags):
        print(word,tag)
        doc_tag.append((word, tag))
    corpus.append(doc_tag)

print(corpus)

Linux OS
is IR
the IR
best IR
OS IR
Ubuntu OS
is IR
my IR
favourite IR
OS IR
[[('Linux', 'OS'), ('is', 'IR'), ('the', 'IR'), ('best', 'IR'), ('OS', 'IR')], [('Ubuntu', 'OS'), ('is', 'IR'), ('my', 'IR'), ('favourite', 'IR'), ('OS', 'IR')]]


In [9]:
def doc2features(doc, i):
    word = doc[i][0]
    
    # Features from current word
    features={
        'word.word': word,
    }
    # Features from previous word
    if i > 0:
        prevword = doc[i-1][0]
        features['word.prevword'] = prevword
    else:
        features['BOS'] = True # Special "Beginning of Sequence" tag
        
    # Features from next word
    if i < len(doc)-1:
        nextword = doc[i+1][0]
        features['word.nextword'] = nextword
    else:
        features['EOS'] = True # Special "End of Sequence" tag
    return features
 
def extract_features(doc):
    return [doc2features(doc, i) for i in range(len(doc))]
 
X = [extract_features(doc) for doc in corpus]
print(X)

[[{'word.word': 'Linux', 'BOS': True, 'word.nextword': 'is'}, {'word.word': 'is', 'word.prevword': 'Linux', 'word.nextword': 'the'}, {'word.word': 'the', 'word.prevword': 'is', 'word.nextword': 'best'}, {'word.word': 'best', 'word.prevword': 'the', 'word.nextword': 'OS'}, {'word.word': 'OS', 'word.prevword': 'best', 'EOS': True}], [{'word.word': 'Ubuntu', 'BOS': True, 'word.nextword': 'is'}, {'word.word': 'is', 'word.prevword': 'Ubuntu', 'word.nextword': 'my'}, {'word.word': 'my', 'word.prevword': 'is', 'word.nextword': 'favourite'}, {'word.word': 'favourite', 'word.prevword': 'my', 'word.nextword': 'OS'}, {'word.word': 'OS', 'word.prevword': 'favourite', 'EOS': True}]]


In [10]:
def get_labels(doc):
    return [tag for (token,tag) in doc]
y = [get_labels(doc) for doc in corpus]

print(y)

[['OS', 'IR', 'IR', 'IR', 'IR'], ['OS', 'IR', 'IR', 'IR', 'IR']]


In [16]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting tabulate (from sklearn-crfsuite)
  Downloading https://files.pythonhosted.org/packages/c2/fd/202954b3f0eb896c53b7b6f07390851b1fd2ca84aa95880d7ae4f434c4ac/tabulate-0.8.3.tar.gz (46kB)
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite)
  Downloading https://files.pythonhosted.org/packages/29/c9/b206fa75d5978a631b5e6914a051139d99ff4624f96eac1bec6486413944/python_crfsuite-0.9.6-cp36-cp36m-win_amd64.whl (154kB)
Building wheels for collected packages: tabulate
  Building wheel for tabulate (setup.py): started
  Building wheel for tabulate (setup.py): finished with status 'done'
  Stored in directory: C:\Users\Rit\AppData\Local\pip\Cache\wheels\2b\67\89\414471314a2d15de625d184d8be6d38a03ae1e983dbda91e84
Successfully built tabulate
Installing collected packages: tabulate, python-crfsuit

In [17]:
import sklearn_crfsuite
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=20,
    all_possible_transitions=False,
)

In [18]:
crf.fit(X, y);

In [19]:
test = [['CentOS', 'is', 'my', 'favourite', 'OS']]
X_test = extract_features(test)
print(crf.predict_single(X_test))

['OS']
