In [35]:
import warnings
# Install a blanket 'ignore' filter: no warning is shown for the rest of the session.
# NOTE(review): this also hides deprecation and numerical RuntimeWarnings from the cells below.
warnings.filterwarnings('ignore')

In [66]:
from gensim.models import Word2Vec
import pickle
import numpy as np
import pandas as pd
import os
from gensim.test.utils import get_tmpfile
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

In [37]:
from gensim.parsing.preprocessing import preprocess_string
import re
import string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import strip_numeric
def remove_ip(s):
    """Return `s` with every dotted-quad (IPv4-looking) token replaced by the '<ip>' tag."""
    dotted_quad = re.compile(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b")
    return dotted_quad.sub('<ip>', s)
def remove_email(s):
    """Return `s` with every e-mail address replaced by the '<email>' tag."""
    address = re.compile(r"([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})")
    return address.sub('<email>', s)
def remove_mailto(s):
    """Collapse every "<mailto:<email>>" wrapper in `s` to a plain "<email>" tag.

    Only the already-tagged form is recognised, so remove_email has to run first.
    """
    wrapped, bare = "<mailto:<email>>", "<email>"
    # Splitting on the wrapper and re-joining with the bare tag substitutes every occurrence.
    return bare.join(s.split(wrapped))
def remove_url(s):
    """Return `s` with every URL replaced by the '<url>' tag."""
    url_pattern = re.compile(
        r"((http|ftp|https):\/\/)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
    )
    tagged = url_pattern.sub('<url>', s)
    # A URL written inside angle brackets ends up as "<<url>>"; flatten it to a single tag.
    return tagged.replace("<<url>>", "<url>")
def remove_punc(s, exceptions):
    """Strip ASCII punctuation from `s`, keeping the characters listed in `exceptions`.

    Parameters
    ----------
    s : str
        Text to clean.
    exceptions : iterable of str
        Punctuation that must survive (e.g. ['<', '>'] to keep tags such as '<ip>').

    Returns
    -------
    str
        `s` without any character of string.punctuation that is not excepted.
    """
    remove = string.punctuation
    for exception in exceptions:
        remove = remove.replace(exception, "")

    # Every punctuation character is excepted: an empty class "[]" is not a valid pattern.
    if not remove:
        return s

    # Escape the characters before placing them in a character class. Unescaped, the
    # sequence "\]" in string.punctuation is read as an escaped "]", so backslashes were
    # never removed, and excepting "\" closed the class early.
    pattern = "[{}]".format(re.escape(remove))

    return re.sub(pattern, "", s)
def remove_custom_stopwords(s, stopwords):
    """Delete every whole-word occurrence of the given stopwords from `s`.

    Matching is done on whole words only, so a stopword that is merely a substring of a
    longer word is left alone ('yours' does not damage 'yourself'). Surrounding
    whitespace is not touched.

    Parameters
    ----------
    s : str
        Text to clean.
    stopwords : iterable of str
        Words to delete; empty strings are ignored.

    Returns
    -------
    str
        `s` with the stopwords removed.
    """
    escaped = [re.escape(word) for word in stopwords if word]
    if not escaped:
        return s
    # Longest alternatives first so a stopword is never shadowed by one of its prefixes.
    escaped.sort(key=len, reverse=True)
    # Lookarounds instead of \b so stopwords that start or end with a non-word character still work.
    pattern = r"(?<!\w)(?:{})(?!\w)".format("|".join(escaped))
    return re.sub(pattern, "", s)
def lower_case(s):
    """Return a lower-cased copy of `s`."""
    lowered = s.lower()
    return lowered
def preprocess_sentence_fn(s):
    """Turn one raw sentence into a list of tokens.

    The text is passed through the cleaning steps listed in `pipeline`, in that order,
    by gensim's preprocess_string, whose result is returned unchanged.
    """
    kept_punctuation = ['<', '>']  # angle brackets survive so tags like '<ip>' stay intact
    salutation_words = ['dear', 'sincerely', 'thanks', 'yours', 'regards']

    def strip_punctuation(text):
        return remove_punc(text, kept_punctuation)

    def strip_salutations(text):
        return remove_custom_stopwords(text, salutation_words)

    pipeline = [
        lower_case,
        remove_ip,
        remove_email,
        remove_mailto,
        # remove_url is currently disabled
        strip_punctuation,
        remove_stopwords,
        strip_salutations,
        strip_multiple_whitespaces,
        stem_text,
        strip_numeric,
    ]
    return preprocess_string(s, filters=pipeline)
def preprocess_docs_fn(docs):
    """Run preprocess_sentence_fn over each sentence in `docs`; returns one result per sentence, in order."""
    token_lists = []
    for sentence in docs:
        token_lists.append(preprocess_sentence_fn(sentence))
    return token_lists

In [38]:
# Load the ticket and FAQ tables; missing cells (NaN) become empty strings
ticket_dat = pd.read_csv('../../data/12-04-ticket_dat.csv').fillna('')
faq_dat = pd.read_csv('../../data/12-04-faq_dat.csv').fillna('')

In [39]:
# Pull the four text collections out of the data frames as plain Python lists
faq_ques = list(faq_dat.question)
faq_ans = list(faq_dat.answer_title + " " + faq_dat.answer)  # FAQ answer = title + space + body
ticket_ques = list(ticket_dat.question)
ticket_ans = list(ticket_dat.answer)

# Collection sizes, used later to locate each collection inside the concatenated corpus
n_faq_ques, n_faq_ans = len(faq_ques), len(faq_ans)
n_ticket_ques, n_ticket_ans = len(ticket_ques), len(ticket_ans)

In [40]:
# One shared embedding: every question and answer in a single corpus
# (order: FAQ questions, FAQ answers, ticket questions, ticket answers)
all_docs = [*faq_ques, *faq_ans, *ticket_ques, *ticket_ans]
# Two separate embeddings: corpus of answers only (FAQ answers first, then ticket answers)
all_ans = [*faq_ans, *ticket_ans]

In [41]:
# Index ranges locating each of the four collections inside all_docs, so the
# concatenated corpus can be split again later,
# e.g. [all_docs[i] for i in id_dict['faq_ques']] gives all FAQ questions.
id_dict = dict(
    faq_ques=range(n_faq_ques),
    faq_ans=range(n_faq_ques, n_faq_ques + n_faq_ans),
    ticket_ques=range(n_faq_ques + n_faq_ans, n_faq_ques + n_faq_ans + n_ticket_ques),
    ticket_ans=range(n_faq_ques + n_faq_ans + n_ticket_ques,
                     n_faq_ques + n_faq_ans + n_ticket_ques + n_ticket_ans),
)
# The same four collections, kept separately under the same keys
all_docs_sep = dict(
    faq_ques=faq_ques,
    faq_ans=faq_ans,
    ticket_ques=ticket_ques,
    ticket_ans=ticket_ans,
)

In [42]:
# Run the shared preprocessing pipeline over both corpora (one result per document, same order)
all_docs_prepro = preprocess_docs_fn(all_docs)
all_ans_prepro = preprocess_docs_fn(all_ans)

In [159]:
'''#FOR DEBUGGING PURPOSE

#check if datasets contain empty strings
faq_ques_prepro = preprocess_docs_fn(faq_ques)
for i in range(len(faq_ques_prepro)):
    if not faq_ques_prepro[i]:
        print('faq question {}'.format(i))'''

# 2 embeddings 

In [43]:
# ALL ANSWERS: train a word2vec model on every answer unless one is already saved on disk
exists = os.path.isfile('../../code/embedding/models/word2vec_ans.model')
if not exists:
    # No saved model yet: fit a 128-dimensional embedding on the answer corpus and persist it
    print('Training word2vec on all answers')
    word_path = "../../code/embedding/models/word2vec_ans.model"
    word_tempfile = get_tmpfile(word_path)  # not used again in this cell
    word_model = Word2Vec(all_ans_prepro, size=128, window=5, min_count=1, workers=4)
    word_model.save(word_path)
else:
    print('Word2vec embedding model already existing')

Word2vec embedding model already existing


In [44]:
# Preprocessed ticket questions only; input for the ticket-question embedding below
ticket_ques_prepro = preprocess_docs_fn(ticket_ques)

In [45]:
# TICKET QUESTIONS: train a second word2vec model on the ticket questions unless it is already saved
exists = os.path.isfile('../../code/embedding/models/word2vec_ticket_ques.model')
if not exists:
    # No saved model yet: fit a 128-dimensional embedding on the ticket questions and persist it
    print('Training word2vec on ticket questions')
    word_path = "../../code/embedding/models/word2vec_ticket_ques.model"
    word_tempfile = get_tmpfile(word_path)  # not used again in this cell
    word_model = Word2Vec(ticket_ques_prepro, size=128, window=5, min_count=1, workers=4)
    word_model.save(word_path)
else:
    print('Word2vec embedding model already existing')

Word2vec embedding model already existing


## Similarity

In [46]:
# Load the answer embedding and bind it to the notebook-global `model`, which doc_emb reads
print('Loading Word2vec model')
model_path = '../../code/embedding/models/word2vec_ans.model'
model = Word2Vec.load(model_path)

Loading Word2vec model


In [47]:
def doc_emb(dat, w2v=None):
    """Embed each document as the mean of its word vectors.

    Parameters
    ----------
    dat : sequence of sequences of str
        Tokenised documents (one token list per document).
    w2v : optional
        Word-vector source: a gensim Word2Vec model, its KeyedVectors, or any mapping
        from token to vector. Defaults to the notebook-global `model`, which is what
        the original implementation always used.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(dat), vector size). A document without tokens gets
        an all-zero row instead of NaN, so downstream similarity code keeps working.

    Raises
    ------
    KeyError
        If a token is unknown to the word-vector source.
    """
    if w2v is None:
        w2v = model  # notebook-global; rebound by several cells, so pass w2v explicitly when possible
    # Index through `.wv` when present: indexing the Word2Vec model directly is deprecated in gensim.
    vectors = getattr(w2v, 'wv', w2v)
    # Fall back to 128, the size the notebook trains with, for plain mappings.
    dim = getattr(vectors, 'vector_size', 128)

    doc_vectors = np.zeros((len(dat), dim), dtype=float)
    for row, tokens in enumerate(dat):
        if len(tokens) == 0:
            continue  # leave the zero row; the mean of no vectors would be NaN
        word_matrix = np.asarray([vectors[token] for token in tokens], dtype=float)
        doc_vectors[row] = word_matrix.mean(axis=0)
    return doc_vectors

In [49]:
def compute_sim(mean_ticket_ans, mean_faq_ans, thres=0.2):
    """Assign every ticket answer to its most similar FAQ answer.

    Parameters
    ----------
    mean_ticket_ans : array-like, shape (n_tickets, dim)
        Document vectors of the ticket answers.
    mean_faq_ans : array-like, shape (n_faq, dim)
        Document vectors of the FAQ answers.
    thres : float, default 0.2
        Tickets whose best cosine similarity is below this value are put in the
        separate class -1 ("non assigned").

    Returns
    -------
    dict
        'mapping': integer array of length n_tickets with the index of the best FAQ
        answer per ticket, or -1 when non assigned.
        'classes': number of distinct values in 'mapping' (the -1 class included).
        The original stored the number of tickets under this key, which contradicted
        both the key name and the printed summary.
    """
    print('Computing word2vec similarity')

    # Rows are FAQ answers, columns are ticket answers
    sim_matrix = cosine_similarity(mean_faq_ans, mean_ticket_ans)

    # Best FAQ per ticket (column-wise) and the similarity it achieved
    FAQ_per_ticket = np.argmax(sim_matrix, axis=0)
    strength_FAQ_ticket = np.max(sim_matrix, axis=0)

    # Small similarities are set to a separate class
    unassigned = strength_FAQ_ticket < thres
    FAQ_per_ticket[unassigned] = -1

    # Some stats
    n_unique = len(np.unique(FAQ_per_ticket))
    n_nonassigned = int(np.count_nonzero(unassigned))
    n_tickets = len(FAQ_per_ticket)

    output = {
        'classes': n_unique,
        'mapping': FAQ_per_ticket
    }
    # Scale to a real percentage: the message says '%', the original printed the raw fraction
    print(n_unique, 'classes, with ', round(100 * n_nonassigned / n_tickets, 2), '% non assigned tickets')
    return output

In [51]:
# all_ans_prepro follows the layout of all_ans (FAQ answers first, then ticket answers),
# so it is split at len(faq_ans)
mean_ticket_ans = doc_emb(all_ans_prepro[len(faq_ans):len(all_ans)])
mean_faq_ans = doc_emb(all_ans_prepro[0:len(faq_ans)])

# Map every ticket answer to its most similar FAQ answer
output = compute_sim(mean_ticket_ans=mean_ticket_ans, mean_faq_ans=mean_faq_ans)

Computing word2vec similarity
97 classes, with  0.0 % non assigned tickets


In [None]:
#debug: print the vectors with 0 
'''
for i in range(len(mean_ticket_ans)):
    zero = np.count_nonzero(mean_ticket_ans[i])
    if zero != 128: 
        print(i)
        
for i in range(len(mean_faq_ans)):
    zero = np.count_nonzero(mean_ticket_ans[i])
    if zero != 128: 
        print(i)
'''

## Classification

In [54]:
def classification(mean_ticket_ques, mapping, cv=5, random_state=None):
    """Fit and evaluate a random forest and a gradient boosting classifier.

    For each classifier the function prints the mean cross-validation accuracy and the
    accuracy on the training data itself. Nothing is returned and no model is saved.

    Parameters
    ----------
    mean_ticket_ques : array-like, shape (n_tickets, dim)
        Feature matrix (document vectors of the ticket questions).
    mapping : array-like, shape (n_tickets,)
        Target class per ticket (FAQ index, or -1 for non assigned).
    cv : int, default 5
        Number of cross-validation folds passed to cross_val_score.
    random_state : int or None, default None
        Seed handed to both classifiers; set it to make the printed scores reproducible.
    """
    candidates = [
        ('RANDOM FOREST CLASSIFIER', RandomForestClassifier),
        ('GRADIENT BOOSTING CLASSIFIER', GradientBoostingClassifier),
    ]
    for title, make_classifier in candidates:
        print(title)
        print('Running CV on Classifier...')
        # A fresh, unfitted estimator for cross-validation
        scores = cross_val_score(make_classifier(random_state=random_state),
                                 mean_ticket_ques, mapping, cv=cv)
        cv_score = scores.mean()
        print('Training Classifier...')
        classifier = make_classifier(random_state=random_state)
        classifier.fit(X=mean_ticket_ques, y=mapping)
        # Accuracy on the training data; compare with cv_score to spot overfitting
        train_score = classifier.score(X=mean_ticket_ques, y=mapping)
        print('Training Score: {0} \nCross Val Score: {1}'.format(train_score, cv_score))

In [52]:
# Load the Word2Vec model trained on the ticket questions.
# This rebinds the global `model`, so doc_emb calls from here on use the ticket-question embedding.
model_path = '../../code/embedding/models/word2vec_ticket_ques.model'
model = Word2Vec.load(model_path)

In [55]:
# Mean word vector per ticket question; passed to classification() as the feature matrix
ticket_question_embeddings = doc_emb(ticket_ques_prepro)

In [67]:
# The class labels are read from a previously saved ticket -> FAQ mapping, not from the
# `output` computed in the similarity section of this notebook.
# NOTE(review): pickle.load can execute arbitrary code; only load this file if it is trusted.
with open('../../code/similarity/mappings/ticket_faq_map_word2vec.pkl', 'rb') as fp:
    Classes = pickle.load(fp)
mapping = Classes['mapping']

# NOTE(review): same call as in the previous cell; redundant when that cell has just been run.
ticket_question_embeddings = doc_emb(ticket_ques_prepro)

classification(ticket_question_embeddings, mapping)

RANDOM FOREST CLASSIFIER
Running CV on Classifier...
Training Classifier...
Training Score: 0.9888254873989539 
 Cross Val Score: 0.15699187718282617
GRADIENT BOOSTING CLASSIEIR
Running CV on Classifier...
Training Classifier...
Training Score: 0.9365192582025678 
Cross Val Score: 0.1620538609730758


# 1 embedding

In [69]:
# Load the single embedding shared by questions and answers; rebinds the global `model` again.
# NOTE(review): no visible cell trains or saves word2vec_all.model, so it must already exist on disk.
# Confirm where it is produced.
print('Loading Word2vec model')
model_path = '../../code/embedding/models/word2vec_all.model'
model = Word2Vec.load(model_path)

Loading Word2vec model


In [70]:
def doc_emb_one(name, w2v=None):
    """Embed one collection of the shared corpus as mean word vectors.

    Parameters
    ----------
    name : str
        Key of `id_dict` ('faq_ques', 'faq_ans', 'ticket_ques' or 'ticket_ans');
        selects which documents of the global `all_docs_prepro` are embedded.
    w2v : optional
        Word-vector source: a gensim Word2Vec model, its KeyedVectors, or any mapping
        from token to vector. Defaults to the notebook-global `model`, which is what
        the original implementation always used.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(id_dict[name]), vector size), rows in the order of
        id_dict[name]. A document without tokens gets an all-zero row instead of NaN.

    Raises
    ------
    KeyError
        If `name` is not in `id_dict` or a token is unknown to the word-vector source.
    """
    if w2v is None:
        w2v = model  # notebook-global; rebound by several cells, so pass w2v explicitly when possible
    # Index through `.wv` when present: indexing the Word2Vec model directly is deprecated in gensim.
    vectors = getattr(w2v, 'wv', w2v)
    # Fall back to 128, the size the notebook trains with, for plain mappings.
    dim = getattr(vectors, 'vector_size', 128)

    doc_ids = id_dict[name]
    doc_vectors = np.zeros((len(doc_ids), dim), dtype=float)
    for row, doc_id in enumerate(doc_ids):
        tokens = all_docs_prepro[doc_id]
        if len(tokens) == 0:
            continue  # leave the zero row; the mean of no vectors would be NaN
        word_matrix = np.asarray([vectors[token] for token in tokens], dtype=float)
        doc_vectors[row] = word_matrix.mean(axis=0)
    return doc_vectors

## Similarity

In [71]:
# NOTE(review): compute_sim prints this same message, which is why it appears twice in the output.
print('Computing word2vec similarity')
# Document vector per ticket answer: the average of the word2vec vectors of its words
mean_ticket_ans = doc_emb_one('ticket_ans')
# Document vector per FAQ answer: the average of the word2vec vectors of its words
mean_faq_ans = doc_emb_one('faq_ans')

# Map every ticket answer to its most similar FAQ answer (shared-embedding variant)
output = compute_sim(mean_ticket_ans=mean_ticket_ans, mean_faq_ans=mean_faq_ans)

Computing word2vec similarity
Computing word2vec similarity
109 classes, with  0.0 % non assigned tickets


## Classification 

In [74]:
# Class labels: the ticket -> FAQ mapping computed by compute_sim in the cell above
mapping = output['mapping']

# Features: mean word vector of each ticket question under the shared embedding
ticket_question_embeddings = doc_emb_one('ticket_ques')

classification(ticket_question_embeddings, mapping)

RANDOM FOREST CLASSIFIER
Running CV on Classifier...
Training Classifier...
Training Score: 0.9866856871136471 
 Cross Val Score: 0.3335240993573032
GRADIENT BOOSTING CLASSIEIR
Running CV on Classifier...
Training Classifier...
Training Score: 0.9317641464574418 
Cross Val Score: 0.31579189729889257
