## Usual imports

In [20]:
from gensim.models import Word2Vec
import pickle
import numpy as np
import pandas as pd
import os
from gensim.test.utils import get_tmpfile
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [21]:
from gensim.parsing.preprocessing import preprocess_string
import re
import string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import strip_numeric
def remove_ip(s):
    # Replace all ip adresses with '<ip>' tag
    ip_regexp = r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"
    return re.sub(ip_regexp, '<ip>', s)
def remove_email(s):
    # Replace all email adresses with '<email>' tag
    email_regexp = r"([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})"
    return re.sub(email_regexp, '<email>', s)
def remove_mailto(s):
    # Replace all "<mailto:<email>>" with <email>. Email adresses should be replaced by remove_email first.
    return s.replace("<mailto:<email>>", "<email>")
def remove_url(s):
    # Replace all url's with '<url>' tag
    url_regexp = r"((http|ftp|https):\/\/)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
    s = re.sub(url_regexp, '<url>', s)
    # Sometimes url's are inside <> so we need to replace <<url>> with <url>
    return s.replace("<<url>>", "<url>")
def remove_punc(s, exceptions):
    # Remove all punctuation from string with exceptions in list exceptions
    remove = string.punctuation
    for exception in exceptions:
        remove = remove.replace(exception, "")
    # Create the pattern
    pattern = r"[{}]".format(remove)

    return re.sub(pattern, "", s)
def remove_custom_stopwords(s, stopwords):
    for stopword in stopwords:
        s = s.replace(stopword, "")
    return s
def lower_case(s):
    return s.lower()
def preprocess_sentence_fn(s):
    # Preprocess a single sentence to a list of tokens
    punc_exceptions = ['<', '>']
    custom_stopwords = ['dear', 'sincerely', 'thanks', 'yours', 'regards']
    filters = [lower_case,
               remove_ip,
               remove_email,
               remove_mailto,
               #remove_url,
               lambda x: remove_punc(x, punc_exceptions),
               remove_stopwords,
               lambda x: remove_custom_stopwords(x, custom_stopwords),
               strip_multiple_whitespaces,
               stem_text,
               strip_numeric]
    out = preprocess_string(s, filters=filters)
    return out
def preprocess_docs_fn(docs):
    # Apply preprocess_sentence_fn to a list of sentances (docs) to get a list of lists
    return [preprocess_sentence_fn(s) for s in docs]

In [22]:
# Read in the data
ticket_dat = pd.read_csv('../data/12-04-ticket_dat.csv')
faq_dat = pd.read_csv('../data/12-04-faq_dat.csv')
# Replace the NaNs
ticket_dat.fillna('', inplace=True)
faq_dat.fillna('', inplace=True)

In [23]:
# FAQ question
faq_ques = list(faq_dat.question)
n_faq_ques = len(faq_ques)
# FAQ answer
faq_ans = list(faq_dat.answer_title + " " + faq_dat.answer)
n_faq_ans = len(faq_ans)
#ticket question
ticket_ques = list(ticket_dat.question)
n_ticket_ques = len(ticket_ques)
#ticket ans
ticket_ans = list(ticket_dat.answer)
n_ticket_ans = len(ticket_ans)

In [24]:
# Load the Word2Vec model
model_path = '../code/embedding/models/word2vec_ticket_ques.model'
model = Word2Vec.load(model_path)

In [25]:
with open('../code/similarity/mappings/ticket_faq_map_word2vec.pkl', 'rb') as fp:
    Classes = pickle.load(fp)
mapping = Classes['mapping']

## Random forest 

In [26]:
ticket_ques_prepro = preprocess_docs_fn(ticket_ques)

In [27]:
def doc_emb(dat):
    mean_ans = np.empty((len(dat), 128), dtype=float)
    for j in range(len(dat)):
        sentence = dat[j]
        words = np.empty((len(sentence), 128), dtype=float)
        for i in range(len(sentence)):
            words[i] = model[sentence[i]]
        mean_ans[j] = np.apply_along_axis(np.mean, 0, words)
    return mean_ans

In [28]:
ticket_question_embeddings = doc_emb(ticket_ques_prepro)

  import sys


In [29]:
print('Running CV on Classifier...')
classifier_CV = RandomForestClassifier()
scores = cross_val_score(classifier_CV, ticket_question_embeddings, mapping, cv=5)
cv_score = scores.mean()

Running CV on Classifier...




In [31]:
print('Training Classifier...')
classifier = RandomForestClassifier()
classifier.fit(X=ticket_question_embeddings, y=mapping)
#dump(classifier, 'classifier/models/RF_word2vec.joblib')
train_score = classifier.score(X=ticket_question_embeddings, y=mapping)

print('Training Score: {0} \n Cross Val Score: {1}'.format(train_score, cv_score))

Training Classifier...




Training Score: 0.9873989538754161 
 Cross Val Score: 0.21176382289560491


## XgBoost

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

In [34]:
Bclassifier_CV = GradientBoostingClassifier()
scores = cross_val_score(Bclassifier_CV, ticket_question_embeddings, mapping, cv=5)
cv_score = scores.mean()



In [36]:
print('Training Classifier...')
Bclassifier = GradientBoostingClassifier()
Bclassifier.fit(X=ticket_question_embeddings, y=mapping)
#dump(classifier, 'classifier/models/RF_word2vec.joblib')
train_score = Bclassifier.score(X=ticket_question_embeddings, y=mapping)

print('Training Score: {0} \nCross Val Score: {1}'.format(train_score, cv_score))

Training Classifier...
Training Score: 0.8927722301474085 
 Cross Val Score: 0.2171630096085304
