In [27]:
import os, re, time, sys
import pandas as pd
import numpy as np
from dateutil.parser import parse
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline

import nltk
import pickle
# from nltk.tag.simplify import simplify_wsj_tag
from nltk.tag import pos_tag, map_tag
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
from nltk.classify import MaxentClassifier
from nltk import load_parser

[nltk_data] Downloading package punkt to /Users/andrewlb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/andrewlb/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/andrewlb/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [2]:
df = pd.read_feather('03-post.feather')
df.shape

(9723, 8)

## Let's Copy Some Code
http://datadesk.latimes.com/posts/2013/12/natural-language-processing-in-the-kitchen/

In [20]:

training = {
    'units': [
        'five ISIS well heads',
        'an oil equipment piece',
        'a tactical vehicle',
        'a VBIED',
        "three VBIED factories",
        "two ISIS-held buildings"
    ],
    'locations': [
        'near raqqa',
        'near mosul'
    ],
    'verbs': [
        'one strike engaged',
        'two strikes destroyed',
        'four strikes destroyed',
        'and damaged',
        '13 strikes engaged',
        'six strikes engaged',
        'and destroyed'
    ]
}


In [23]:
# Set up a list that will contain all of our tagged examples,
# which we will pass into the classifier at the end.
training_set = []
for key, val in training.items():
    for i in val:
        # Set up a list we can use for all of our features,
        # which are just individual words in this case.
        feats = []
        # Before we can tokenize words, we need to break the
        # text out into sentences.
        sentences = nltk.sent_tokenize(i)
        for sentence in sentences:
            feats = feats + nltk.word_tokenize(sentence)

        # For this example, it's a good idea to normalize for case.
        # You may or may not need to do this.
        feats = [i.lower() for i in feats]
        # Each feature needs a value. A typical use for a case like this
        # is to use True or 1, though you can use almost any value for
        # a more complicated application or analysis.
        feats = dict([(i, True) for i in feats])
        # NLTK expects you to feed a classifier a list of tuples
        # where each tuple is (features, tag).
        training_set.append((feats, key))

# Train up our classifier
classifier = MaxentClassifier.train(training_set)

# Test it out!
# You need to feed the classifier your data in the same format you used
# to train it, in this case individual lowercase words.
classifier.classify({'VBIED': True, 'ISIS-held buildings': True})

# Save it to disk, if you want, because these can take a long time to train.
outfile = open('classifier.pickle', 'wb')
pickle.dump(classifier, outfile)
outfile.close()

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -1.09861        0.467
             2          -0.53080        1.000
             3          -0.35423        1.000
             4          -0.26650        1.000
             5          -0.21380        1.000
             6          -0.17858        1.000
             7          -0.15335        1.000
             8          -0.13439        1.000
             9          -0.11961        1.000
            10          -0.10776        1.000
            11          -0.09805        1.000
            12          -0.08995        1.000
            13          -0.08309        1.000
            14          -0.07720        1.000
            15          -0.07209        1.000
            16          -0.06762        1.000
            17          -0.06367        1.000
            18          -0.06016        1.000
            19          -0.05701        1.000
 

In [24]:
# text = nltk.word_tokenize("And now for something completely different")
# posTagged = pos_tag(text)
# simplifiedTags = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in posTagged]
# print(simplifiedTags)

def get_features(text):
    words = []
    # Same steps to start as before
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        words = words + nltk.word_tokenize(sentence)

    # part of speech tag each of the words
    pos = nltk.pos_tag(words)
    # Sometimes it's helpful to simplify the tags NLTK returns by default.
    # I saw an increase in accuracy if I did this, but you may not
    # depending on the application.
    pos = [map_tag('en-ptb', 'universal',tag) for word, tag in pos]
    # Then, convert the words to lowercase like before
    words = [i.lower() for i in words]
    # Grab the trigrams
    trigrams = nltk.trigrams(words)
    # We need to concatinate the trigrams into a single string to process
    trigrams = ["%s/%s/%s" % (i[0], i[1], i[2]) for i in trigrams]
    # Get our final dict rolling
    features = words + pos + trigrams
    # get our feature dict rolling
    features = dict([(i, True) for i in features])
    return features

# Try it out
text = "Near Tabqah, 13 strikes engaged nine ISIS tactical units; destroyed two fighting positions."
get_features(text)

{'near': True,
 'tabqah': True,
 ',': True,
 '13': True,
 'strikes': True,
 'engaged': True,
 'nine': True,
 'isis': True,
 'tactical': True,
 'units': True,
 ';': True,
 'destroyed': True,
 'two': True,
 'fighting': True,
 'positions': True,
 '.': True,
 'ADP': True,
 'NOUN': True,
 'NUM': True,
 'VERB': True,
 'ADJ': True,
 'near/tabqah/,': True,
 'tabqah/,/13': True,
 ',/13/strikes': True,
 '13/strikes/engaged': True,
 'strikes/engaged/nine': True,
 'engaged/nine/isis': True,
 'nine/isis/tactical': True,
 'isis/tactical/units': True,
 'tactical/units/;': True,
 'units/;/destroyed': True,
 ';/destroyed/two': True,
 'destroyed/two/fighting': True,
 'two/fighting/positions': True,
 'fighting/positions/.': True}

In [51]:
query = 'near detroit, two strikes destroyed'
cp = load_parser('file:atis.cfg')
trees = list(cp.parse(query.split()))
answer = trees[0].label()['SEM']
answer = [s for s in answer if s]
q = ' '.join(answer)
print(q)

ValueError: Grammar does not cover some of the input words: "'detroit,', 'strikes', 'destroyed'".

In [60]:
# https://github.com/readywater/text-analytics-w-python-2e/blob/master/Ch03%20-%20Processing%20and%20Understanding%20Text/Ch03a%20-%20Text%20Wrangling.ipynb

sents = 'Near Dayr Az Zawr, four strikes destroyed five ISIS well heads, two pump jacks, an oil storage tank, and an oil equipment piece'
grammar_file = 'atis.cfg'
print(sents)

from nltk.tag import RegexpTagger 
# define regex tag patterns 
patterns = [
    (r'.*ing$', 'VBG'), # gerunds         
    (r'.*ed$', 'VBD'), # simple past         
    (r'.*es$', 'VBZ'), # 3rd singular present         
    (r'.*ould$', 'MD'), # modals         
    (r'.*\'s$', 'NN$'), # possessive nouns         
    (r'.*s$', 'NNS'), # plural nouns         
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers         
    (r'.*', 'NN') # nouns (default) ... 
] 
rt=RegexpTagger(patterns)

tagged_sent = nltk.pos_tag(nltk.word_tokenize(sents))
print(tagged_sent)
for results in nltk.interpret_sents(sents, grammar_file):
    for (synrep, semrep) in results:
        print(synrep)

Near Dayr Az Zawr, four strikes destroyed five ISIS well heads, two pump jacks, an oil storage tank, and an oil equipment piece
[('Near', 'IN'), ('Dayr', 'NNP'), ('Az', 'NNP'), ('Zawr', 'NNP'), (',', ','), ('four', 'CD'), ('strikes', 'NNS'), ('destroyed', 'VBN'), ('five', 'CD'), ('ISIS', 'NNP'), ('well', 'RB'), ('heads', 'NNS'), (',', ','), ('two', 'CD'), ('pump', 'NN'), ('jacks', 'NNS'), (',', ','), ('an', 'DT'), ('oil', 'NN'), ('storage', 'NN'), ('tank', 'NN'), (',', ','), ('and', 'CC'), ('an', 'DT'), ('oil', 'NN'), ('equipment', 'NN'), ('piece', 'NN')]


ValueError: Grammar does not cover some of the input words: "'N'".

In [44]:

original_grammar = nltk.data.load('atis.cfg')
original_parser = nltk.ChartParser(original_grammar)
sent = ['Nera Dayr Az Zawr, four strikes destroyed five ISIS well heads, two pump jacks, an oil storage tank, and an oil equipment piece.', 
         'Near Tabqah, 13 strikes engaged nine ISIS tactical units; destroyed two fighting positions, two vehicles, a tactical vehicle, and a tunnel; and damaged two supply routes.',
        'Near Al Qaim, one strike destroyed two anti-air artillery systems and two ISIS heldbuildings.',
        'Near Haditha, one strike engaged an ISIS tactical unit; and destroyed a VBIED and a vehicle.',
        'Near Mosul, six strikes engaged four ISIS tactical units; destroyed five mortar systems, three VBIED factories, three ISIS-held buildings, two anti-air artillery systems, two supply caches, a tactical vehicle, a vehicle, and a weapons facility; damaged nine supply routes, and suppressed six ISIS mortar teams.',]
for i in original_parser.parse(sent):
    print(i)
    break
# original_grammar._rhs_index['mosul']




ValueError: Grammar does not cover some of the input words: "'Nera Dayr Az Zawr, four strikes destroyed five ISIS well heads, two pump jacks, an oil storage tank, and an oil equipment piece.', 'Near Tabqah, 13 strikes engaged nine ISIS tactical units; destroyed two fighting positions, two vehicles, a tactical vehicle, and a tunnel; and damaged two supply routes.', 'Near Al Qaim, one strike destroyed two anti-air artillery systems and two ISIS heldbuildings.', 'Near Haditha, one strike engaged an ISIS tactical unit; and destroyed a VBIED and a vehicle.', 'Near Mosul, six strikes engaged four ISIS tactical units; destroyed five mortar systems, three VBIED factories, three ISIS-held buildings, two anti-air artillery systems, two supply caches, a tactical vehicle, a vehicle, and a weapons facility; damaged nine supply routes, and suppressed six ISIS mortar teams.'".