In [1]:
import pandas as pd
import re
import json
import codecs

# Text encoding used to read the intent JSON file.
encoding = 'utf-8'

# JSON file holding the training queries; its top-level keys are intent names.
file_location = "../data/intent_queries.json"
# The context manager closes the handle as soon as parsing finishes (or fails);
# previously the stream stayed open for the lifetime of the kernel.
with open(file_location, 'r', encoding=encoding) as file_stream:
    jdata = json.load(file_stream)


In [2]:
# Intent labels listed in class-id order: a label's position is its integer id.
_INTENT_LABELS = (
    'commonQ.assist',
    'commonQ.how',
    'commonQ.name',
    'commonQ.wait',
    'recommend.game',
    'game.age',
    'game.price',
    'response.abusive',
    'response.negative',
    'response.incorrect',
    'game.release_date',
    'game.platforms',
    'response.positive',
)

# Intent label -> integer class id.
intent_enc = {label: class_id for class_id, label in enumerate(_INTENT_LABELS)}

# Integer class id -> intent label (the inverse of intent_enc).
intent_dec = dict(enumerate(_INTENT_LABELS))

In [3]:
# Flatten the intent JSON into [query, class_id] rows: the text fragments of
# each example are concatenated and lower-cased, and the intent name is mapped
# to its integer id through intent_enc.
train_list = [
    ["".join(fragment['text'] for fragment in example['data']).lower(), intent_enc[label]]
    for label, examples in jdata.items()
    for example in examples
]
print(len(train_list))

199


In [4]:
# Two-column frame: the query text and its integer intent id.
trainDF = pd.DataFrame(data=train_list, columns=['Query', 'Intent'])
# Preview the first rows, then the overall (rows, columns) shape.
print(trainDF.head(), trainDF.shape, sep='\n')

              Query  Intent
0  need help please       0
1             hello       0
2                hi       0
3         need help       0
4  will you help me       0
(199, 2)


In [5]:
import string

import nltk
from nltk.corpus import stopwords

# WordNet lemmatizer instance (created once; the cell used to build it twice).
wnl = nltk.WordNetLemmatizer()
# NLTK's English stop-word list.
mystopwords = stopwords.words("english")

# Show both reference lists for inspection.
print(mystopwords)
print(string.punctuation)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:

# Ordered (pattern, replacement) rules that expand contractions. Order matters:
# the specific rules ("what's", "'scuse", "can't") must run before the generic
# "'s" / "n't" rules, which would otherwise consume part of their match.
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"'scuse", " excuse "),
    (r"'s", " "),
    (r"'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"'re", " are "),
    (r"'d", " would "),
    (r"'ll", " will "),
)


def clean_text(text):
    """Normalise a query string for vectorisation.

    The text is lower-cased, common English contractions are expanded, every
    non-word character is replaced by a space, runs of whitespace are collapsed
    to a single space, and leading/trailing spaces are removed.

    Parameters
    ----------
    text : str
        Raw query text.

    Returns
    -------
    str
        The cleaned query.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # Raw strings keep \W and \s as regex escapes rather than (invalid)
    # Python string escapes.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

In [7]:
# Normalise every training query with clean_text, then preview the first five.
trainDF['Query'] = trainDF['Query'].map(clean_text)
trainDF['Query'].head(5)

0    need help please
1               hello
2                  hi
3           need help
4    will you help me
Name: Query, dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from nltk.corpus import stopwords
# English stop words from NLTK, as a set.
# NOTE(review): no cell visible in this notebook reads `stop_words` — confirm
# whether it is still needed.
stop_words = set(stopwords.words('english'))
from sklearn.pipeline import Pipeline

In [9]:
# Fit a bag-of-words vocabulary on the cleaned queries and build the
# document-term count matrix; the cell displays its shape.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(trainDF['Query'])
X_train_counts.shape

(199, 195)

In [10]:
# Fit the TF-IDF weighting on the raw counts, then re-weight those same counts;
# the cell displays the shape of the weighted matrix.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit(X_train_counts).transform(X_train_counts)
X_train_tfidf.shape

(199, 195)

In [11]:
import sklearn
from sklearn.linear_model import SGDClassifier

# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss', and 1.3
# removed the old spelling, so choose the name the installed version accepts.
_sklearn_version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
_sklearn_major_minor = tuple(int(part) for part in _sklearn_version.groups())
_log_loss_name = 'log_loss' if _sklearn_major_minor >= (1, 1) else 'log'

# Pipeline: bag-of-words counts -> TF-IDF weights -> linear classifier trained
# with SGD. NOTE: despite the "svm" in the names, the logistic loss selected
# here is what makes predict_proba available on the fitted pipeline.
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf-svm', SGDClassifier(loss=_log_loss_name, penalty='l2',
                                          alpha=1e-3, max_iter=100, random_state=42)),
])
# Assigning to `_` keeps the fitted-estimator repr out of the cell output.
_ = text_clf_svm.fit(trainDF['Query'], trainDF['Intent'])


In [12]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import metrics
import numpy as np

# Held-out queries, stored in the same JSON layout as the training file.
testfile_location = "../data/test_queries.json"
# The context manager closes the handle once the JSON has been parsed.
with open(testfile_location, 'r', encoding='utf-8') as testfile_stream:
    testjdata = json.load(testfile_stream)

# Flatten to [query, class_id] rows exactly as done for the training data.
test_list = []
for intent in testjdata:
    for data in testjdata[intent]:
        line = "".join(sequence['text'] for sequence in data['data'])
        test_list.append([line.lower(), intent_enc[intent]])
print(len(test_list))

# Apply the same text normalisation that the training queries received.
testDF = pd.DataFrame(test_list, columns=['Query', 'Intent'])
testDF['Query'] = testDF['Query'].map(clean_text)

# Score the fitted pipeline: confusion matrix, accuracy, per-class report.
predicted = text_clf_svm.predict(testDF.Query)
print(metrics.confusion_matrix(testDF.Intent, predicted))
print(np.mean(predicted == testDF.Intent))
print(metrics.classification_report(testDF.Intent, predicted))

# Keep the predictions next to the true labels for row-by-row inspection.
testDF['Predicted'] = predicted
print(testDF)

14
[[8 0 0]
 [0 2 0]
 [0 0 4]]
1.0
              precision    recall  f1-score   support

           4       1.00      1.00      1.00         8
           6       1.00      1.00      1.00         2
           9       1.00      1.00      1.00         4

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14

                                                Query  Intent  Predicted
0   i am 10 years old and looking for free shootin...       4          4
1   i am looking for killing game with animals for...       4          4
2          any action game with animal under 10 bucks       4          4
3                                        running game       4          4
4                              airplane shooting game       4          4
5   i am 10 and looking for free shooting zombie g...       4          4
6                           any rpg game that is free       4          4


In [13]:
import pickle
# Persist the fitted pipeline. The context manager flushes and closes the file
# so the pickle is complete on disk before any later cell reads it back.
with open('intent_SGDClassifier_v2.pkl', 'wb') as model_file:
    pickle.dump(text_clf_svm, model_file)

In [14]:
# A single ad-hoc query wrapped in a one-column frame.
test_query = ['Any action game that is free']
testDF = pd.DataFrame({'Query': test_query})

In [15]:
# Reload the pipeline from the pickle file. pickle can execute arbitrary code
# while loading, so only ever point this at a file you produced yourself.
with open('intent_SGDClassifier_v2.pkl', 'rb') as model_file:
    intent_model_svm = pickle.load(model_file)

result_svm = intent_model_svm.predict(testDF.Query)
result_svm_proba = intent_model_svm.predict_proba(testDF.Query)
print(intent_model_svm.classes_)
print(result_svm)
print(result_svm_proba)
print(intent_dec[result_svm[0]])
# Look the probability column up by the label's position in classes_ rather
# than assuming the label value itself is a valid column index.
predicted_column = list(intent_model_svm.classes_).index(result_svm[0])
print(result_svm_proba[0][predicted_column])

[ 0  1  2  3  4  5  6  7  8  9 10 11 12]
[4]
[[0.00850771 0.01114962 0.01329406 0.00741142 0.8160136  0.00895915
  0.05152187 0.00921815 0.01237884 0.02831408 0.01410111 0.01031009
  0.00882029]]
recommend.game
0.8160136032103338


In [16]:
# Display the class labels stored on the fitted pipeline.
text_clf_svm.classes_

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int64)

In [17]:
# Find the position of the predicted label within the pipeline's class list and
# display the probability stored at that position for the first query.
svc_classes = list(text_clf_svm.classes_)
predicted_position = svc_classes.index(result_svm[0])
result_svm_proba[0][predicted_position]

0.8160136032103338

In [18]:
import pandas as pd
import pickle
# Example query used to exercise the saved model end to end.
example_sent = ["I am 10 years old and looking for free shooting game with zombies. Can you recommend"]
# pickle can execute arbitrary code while loading, so only ever point this at
# a file you produced yourself. The context manager closes the handle.
with open('intent_SGDClassifier_v2.pkl', 'rb') as model_file:
    intent_model_svm = pickle.load(model_file)

example_sentDF = pd.DataFrame(example_sent, columns=['Query'])
predict = intent_model_svm.predict(example_sentDF.Query)
predict_proba = intent_model_svm.predict_proba(example_sentDF.Query)
print(intent_model_svm.classes_)
print(predict[0])
print(intent_dec[predict[0]])
# Look the probability column up by the label's position in classes_ rather
# than assuming the label value itself is a valid column index.
predicted_column = list(intent_model_svm.classes_).index(predict[0])
print(predict_proba[0][predicted_column])

[ 0  1  2  3  4  5  6  7  8  9 10 11 12]
4
recommend.game
0.8242869941635382
