In [1]:
import pandas as pd
import re

encoding = 'utf-8'

trainDF = pd.read_csv("../data/intent_queries.csv", encoding=encoding)
trainDF.head()

Unnamed: 0,Query,Intent
0,Need help please,commonQ.assist
1,Need help,commonQ.assist
2,I need some info,commonQ.assist
3,Will you help me?,commonQ.assist
4,Need help plz,commonQ.assist


In [2]:
unique_intents = list(set(trainDF.Intent))
unique_intents

['commonQ.wait',
 'commonQ.name',
 'recommend.age',
 'response.abusive',
 'commonQ.assist',
 'recommend',
 'recommend.price',
 'response.negative',
 'commonQ.how']

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [4]:
trainDF['Query'] = trainDF['Query'].map(lambda query : clean_text(query))
trainDF['Query'][5]

'can you help me'

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.pipeline import Pipeline

In [6]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(trainDF.Query)
X_train_counts.shape

(107, 123)

In [7]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(107, 123)

In [8]:
# Define a pipeline combining a text feature extractor with classifier
clf = MultinomialNB().fit(X_train_tfidf, trainDF.Intent)
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tdidf', TfidfTransformer()),
    ('clf', MultinomialNB())]
)
text_clf = text_clf.fit(trainDF['Query'], trainDF['Intent'])

In [9]:
validate_query = ['who am I speaking to']
validateDF = pd.DataFrame(validate_query, columns=['Query'])
predict = text_clf.predict(validateDF.Query)
predict[0]

'commonQ.name'

In [10]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf-svm', SGDClassifier(loss='log', penalty='l2',
                                          alpha=1e-3, max_iter=100, random_state=42)),
])
_ = text_clf_svm.fit(trainDF['Query'], trainDF['Intent'])


In [11]:
predict_svm = text_clf_svm.predict(validateDF.Query)
predict_svm[0]

'commonQ.name'

In [12]:
import pickle
pickle.dump(text_clf, open('./intent_MultinomialNB.pkl', 'wb'))
pickle.dump(text_clf_svm, open('./intent_SGDClassifier.pkl', 'wb'))

In [13]:
test_query = ['what a complete waste of time']
testDF = pd.DataFrame(test_query, columns=['Query'])

In [14]:
intent_model = pickle.load(open('./intent_MultinomialNB.pkl', 'rb'))
result = intent_model.predict(testDF.Query)
result_proba = intent_model.predict_proba(testDF.Query)
print(intent_model.classes_)
print(result)
print(result_proba)

['commonQ.assist' 'commonQ.how' 'commonQ.name' 'commonQ.wait' 'recommend'
 'recommend.age' 'recommend.price' 'response.abusive' 'response.negative']
['commonQ.name']
[[0.07405917 0.11522875 0.17112725 0.07332483 0.13076088 0.11598263
  0.08398046 0.08942391 0.14611211]]


In [15]:
intent_model_svm = pickle.load(open('./intent_SGDClassifier.pkl', 'rb'))
result_svm = intent_model_svm.predict(testDF.Query)
result_svm_proba = intent_model_svm.predict_proba(testDF.Query)
print(intent_model_svm.classes_)
print(result_svm)
print(result_svm_proba)

['commonQ.assist' 'commonQ.how' 'commonQ.name' 'commonQ.wait' 'recommend'
 'recommend.age' 'recommend.price' 'response.abusive' 'response.negative']
['response.negative']
[[0.05465825 0.08087787 0.13661008 0.07210862 0.07427255 0.08174025
  0.08392614 0.06143841 0.35436783]]


In [16]:
text_clf_svm.classes_

array(['commonQ.assist', 'commonQ.how', 'commonQ.name', 'commonQ.wait',
       'recommend', 'recommend.age', 'recommend.price',
       'response.abusive', 'response.negative'], dtype='<U17')

In [17]:
svc_classes = list(text_clf_svm.classes_)
result_svm_proba[0][svc_classes.index(result_svm[0])]

0.35436782895565977