In [1]:
import json
import codecs
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pandas as pd

#pip install python-crfsuite

import pycrfsuite
import spacy
import en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [2]:
file_location = "../data/intent_queries.json"
file_stream = codecs.open(file_location, 'r', 'utf-8')
jdata = json.load(file_stream)

In [3]:
import nltk
wnl = nltk.WordNetLemmatizer()


def clean_text(text):
    tokens = wnl.lemmatize(t.lower(),pos='v')

In [4]:
def loadData(jdata, inp_intent = []):
    data_List = []
    if inp_intent == []:
        intent_keys = jdata.keys()
    else:
        intent_keys = inp_intent
    
    print(intent_keys)
    for intent in intent_keys:
        for data in jdata[intent]:

            wordList=[]
            tagList=[]
            posList=[]
            sentlist=[]

            for sequence in data['data']:   
                text = sequence['text'].lower()
                tokens = nltk.word_tokenize(text)
                tokens=[ wnl.lemmatize(t.lower(),pos='v') for t in tokens]
                text=" ".join(tokens)
                tokenList = text.split()

                if 'entity' not in sequence:
                    for tok in tokenList:
                        wordList.append(tok)
                        tagList.append('O')
                else:
                    for idx,tok in enumerate(tokenList):
                        wordList.append(tok)
                        if idx:
                            tagList.append('I-'+sequence['entity']) #entity token
                        else:
                            tagList.append('B-'+sequence['entity'])

            sent = ' '.join(wordList)
            sent_nlp = nlp(sent) #POS tag

            for token in sent_nlp:
                posList.append(token.tag_) #retrieve tag

            for idx,word in enumerate(wordList):
                sentlist.append((word,posList[idx],tagList[idx]))

            data_List.append(sentlist)
    return data_List

def input_prep(text):
    data_List = []

    for sequence in text:
        wordList=[]
        posList=[]
        tagList = []
        sentlist=[]

        text = sequence.strip().lower()
        tokens = nltk.word_tokenize(text)
        tokens=[ wnl.lemmatize(t.lower(),pos='v') for t in tokens]
        text=" ".join(tokens)
        tokenList = text.split()

        for tok in tokenList:
            wordList.append(tok)
            tagList.append('O')

        sent = ' '.join(wordList)
        sent_nlp = nlp(sent) #POS tag

        for token in sent_nlp:
            posList.append(token.tag_) #retrieve tag

        for idx,word in enumerate(wordList):
            sentlist.append((word,posList[idx],tagList[idx]))

        data_List.append(sentlist)
    return data_List

In [5]:
def word2features(sent, i): #function to create feature vector to represent each word
    word = sent[i][0]
    postag = sent[i][1]
    features = [  # for all words
        'bias',
        'word.lower=' + word.lower(),
        #'word[-3:]=' + word[-3:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2], #what is the POS tag for the next 2 word token
    ]
    if i > 0: # if not <S>
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')  # beginning of statement
        
    if i < len(sent)-1:  # if not <\S>
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [6]:
train_list = loadData(jdata)

dict_keys(['commonQ.assist', 'commonQ.how', 'commonQ.name', 'commonQ.wait', 'recommend.game', 'response.abusive', 'response.negative', 'response.incorrect', 'game.price', 'game.age', 'game.release_date', 'game.platforms', 'response.positive'])


In [7]:
X_train = [sent2features(s) for s in train_list]
Y_train = [sent2labels(s) for s in train_list]

In [8]:
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, Y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

trainer.params()

trainer.train('recommend_game.crfsuite')

In [9]:
tagger = pycrfsuite.Tagger()
tagger.open('recommend_game.crfsuite')


<contextlib.closing at 0x273a78b7a88>

In [10]:
testfile_location = "../data/test_queries.json"
testfile_stream = codecs.open(testfile_location, 'r', 'utf-8')
testjdata = json.load(testfile_stream)

test_list = loadData(testjdata)

X_test = [sent2features(s) for s in test_list]
Y_test = [sent2labels(s) for s in test_list]

###################################################################
def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

Y_pred = [tagger.tag(xseq) for xseq in X_test]
print(bio_classification_report(Y_test, Y_pred))

dict_keys(['recommend.game', 'response.incorrect', 'game.price'])


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


              precision    recall  f1-score   support

       B-age       1.00      0.50      0.67         2
 B-character       1.00      0.67      0.80         6
     B-genre       1.00      0.78      0.88         9
     B-price       1.00      0.83      0.91         6
     I-price       1.00      0.80      0.89         5

   micro avg       1.00      0.75      0.86        28
   macro avg       1.00      0.72      0.83        28
weighted avg       1.00      0.75      0.85        28
 samples avg       0.23      0.23      0.23        28



In [11]:
new_test_list = test_list
for idx_sent, sent in enumerate(Y_pred):
    for idx_word, word in enumerate(sent):
        if word != 'O' :
            words = new_test_list[idx_sent][idx_word]
            words = (words[0], words[1],word)
            new_test_list[idx_sent][idx_word] = words
new_test_list

[[('i', 'PRP', 'O'),
  ('be', 'VBP', 'O'),
  ('10', 'CD', 'B-age'),
  ('years', 'NNS', 'I-age'),
  ('old', 'JJ', 'I-age'),
  ('and', 'CC', 'O'),
  ('look', 'VBP', 'O'),
  ('for', 'IN', 'O'),
  ('free', 'JJ', 'B-price'),
  ('shoot', 'NN', 'B-genre'),
  ('game', 'NN', 'O'),
  ('with', 'IN', 'O'),
  ('zombies', 'NNS', 'B-character'),
  ('.', '.', 'O'),
  ('can', 'MD', 'O'),
  ('you', 'PRP', 'O'),
  ('recommend', 'VB', 'O')],
 [('i', 'PRP', 'O'),
  ('be', 'VBP', 'O'),
  ('look', 'VB', 'O'),
  ('for', 'IN', 'O'),
  ('kill', 'NN', 'B-genre'),
  ('game', 'NN', 'O'),
  ('with', 'IN', 'O'),
  ('animals', 'NNS', 'B-character'),
  ('for', 'IN', 'O'),
  ('under', 'IN', 'B-price'),
  ('$', '$', 'I-price'),
  ('10', 'CD', 'I-price')],
 [('any', 'DT', 'O'),
  ('action', 'NN', 'B-genre'),
  ('game', 'NN', 'O'),
  ('with', 'IN', 'O'),
  ('animal', 'NN', 'B-character'),
  ('under', 'IN', 'B-price'),
  ('10', 'CD', 'I-price'),
  ('buck', 'NN', 'I-price')],
 [('run', 'NN', 'B-genre'), ('game', 'NN', 'O')]

In [24]:
#example_sent = "I am 10 years old and looking for free shooting game with zombies. Can you recommend"
#example_sent = "RPG game shooting monster"
example_sent = "look for rpg game kill monster free"
example_sent_split = example_sent.replace(' and', '.').split('.')
print(example_sent)
print(example_sent_split)


look for rpg game kill monster free
['look for rpg game kill monster free']


In [25]:
cl_example_sent = input_prep(example_sent_split)
print(cl_example_sent)


[[('look', 'VB', 'O'), ('for', 'IN', 'O'), ('rpg', 'NN', 'O'), ('game', 'NN', 'O'), ('kill', 'NN', 'O'), ('monster', 'NN', 'O'), ('free', 'JJ', 'O')]]


In [26]:
x = [sent2features(s) for s in cl_example_sent]
print(x)

[[['bias', 'word.lower=look', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'postag=VB', 'postag[:2]=VB', 'BOS', '+1:word.lower=for', '+1:word.istitle=False', '+1:word.isupper=False', '+1:word.isdigit=False', '+1:postag=IN', '+1:postag[:2]=IN'], ['bias', 'word.lower=for', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'postag=IN', 'postag[:2]=IN', '-1:word.lower=look', '-1:word.istitle=False', '-1:word.isupper=False', '-1:word.isdigit=False', '-1:postag=VB', '-1:postag[:2]=VB', '+1:word.lower=rpg', '+1:word.istitle=False', '+1:word.isupper=False', '+1:word.isdigit=False', '+1:postag=NN', '+1:postag[:2]=NN'], ['bias', 'word.lower=rpg', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'postag=NN', 'postag[:2]=NN', '-1:word.lower=for', '-1:word.istitle=False', '-1:word.isupper=False', '-1:word.isdigit=False', '-1:postag=IN', '-1:postag[:2]=IN', '+1:word.lower=game', '+1:word.istitle=False', '+1:word.isupper=False', '+1:word.isdigit=F

In [27]:
tagList = [tagger.tag(s) for s in x]
tagList

[['O', 'O', 'B-genre', 'O', 'B-genre', 'B-character', 'B-price']]

In [16]:
for idx_sent, sent in enumerate(tagList):
    for idx_word, word in enumerate(sent):
        if word != 'O' :
            words = cl_example_sent[idx_sent][idx_word]
            words = (words[0], words[2],word)
            cl_example_sent[idx_sent][idx_word] = words

In [17]:
cl_example_sent

[[('look', 'VB', 'O'),
  ('for', 'IN', 'O'),
  ('rpg', 'O', 'B-genre'),
  ('game', 'NN', 'O'),
  ('shoot', 'O', 'B-genre'),
  ('monster', 'O', 'B-character'),
  ('free', 'O', 'B-price')]]

In [18]:
genreList = []
priceList = []
ageList = []
for idx_sent, sent in enumerate(cl_example_sent):
    for idx_word, word in enumerate(sent):
        print(word)
        if 'genre' in word[2]:
            genreList.append(word[0])
        elif 'age' in word[2]:
            if word[0].isdigit():
                ageList.append(word[0])
        elif 'price' in word[2]:
            if 'free' in word[0]:
                priceList.append(0)
            else:
                if word[0].replace('$','').isdigit():
                    priceList.append(int(word[0].replace('$','')))

entitylist = {'genre': genreList, 'price': priceList, 'age': ageList}

('look', 'VB', 'O')
('for', 'IN', 'O')
('rpg', 'O', 'B-genre')
('game', 'NN', 'O')
('shoot', 'O', 'B-genre')
('monster', 'O', 'B-character')
('free', 'O', 'B-price')


In [19]:
entitylist

{'genre': ['rpg', 'shoot'], 'price': [0], 'age': []}