# Importing the ususal stuff

In [436]:
import warnings
warnings.filterwarnings('ignore')

In [63]:
import pickle
import joblib as jl
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import operator

from joblib import dump
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.word2vec import Word2VecVocab
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
from gensim.parsing.preprocessing import preprocess_string
import re
import string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import strip_numeric
def remove_ip(s):
    # Replace all ip adresses with '<ip>' tag
    ip_regexp = r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"
    return re.sub(ip_regexp, '<ip>', s)
def remove_email(s):
    # Replace all email adresses with '<email>' tag
    email_regexp = r"([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})"
    return re.sub(email_regexp, '<email>', s)
def remove_mailto(s):
    # Replace all "<mailto:<email>>" with <email>. Email adresses should be replaced by remove_email first.
    return s.replace("<mailto:<email>>", "<email>")
def remove_url(s):
    # Replace all url's with '<url>' tag
    url_regexp = r"((http|ftp|https):\/\/)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
    s = re.sub(url_regexp, '<url>', s)
    # Sometimes url's are inside <> so we need to replace <<url>> with <url>
    return s.replace("<<url>>", "<url>")
def remove_punc(s, exceptions):
    # Remove all punctuation from string with exceptions in list exceptions
    remove = string.punctuation
    for exception in exceptions:
        remove = remove.replace(exception, "")
    # Create the pattern
    pattern = r"[{}]".format(remove)

    return re.sub(pattern, "", s)
def remove_custom_stopwords(s, stopwords):
    for stopword in stopwords:
        s = s.replace(stopword, "")
    return s
def lower_case(s):
    return s.lower()
def preprocess_sentence_fn(s):
    # Preprocess a single sentence to a list of tokens
    punc_exceptions = ['<', '>']
    custom_stopwords = ['dear', 'sincerely', 'thanks', 'yours', 'regards']
    filters = [lower_case,
               remove_ip,
               remove_email,
               remove_mailto,
               #remove_url,
               lambda x: remove_punc(x, punc_exceptions),
               remove_stopwords,
               lambda x: remove_custom_stopwords(x, custom_stopwords),
               strip_multiple_whitespaces,
               stem_text,
               strip_numeric]
    out = preprocess_string(s, filters=filters)
    return out
def preprocess_docs_fn(docs):
    # Apply preprocess_sentence_fn to a list of sentances (docs) to get a list of lists
    return [preprocess_sentence_fn(s) for s in docs]



In [33]:
# Read in the data
ticket_dat = pd.read_csv('../data/12-04-ticket_dat.csv')
faq_dat = pd.read_csv('../data/12-04-faq_dat.csv')
# Replace the NaNs
ticket_dat.fillna('', inplace=True)
faq_dat.fillna('', inplace=True)

In [34]:
# FAQ question
faq_ques = list(faq_dat.question)
n_faq_ques = len(faq_ques)
# FAQ answer
faq_ans = list(faq_dat.answer_title + " " + faq_dat.answer)
n_faq_ans = len(faq_ans)
#ticket question
ticket_ques = list(ticket_dat.question)
n_ticket_ques = len(ticket_ques)
#ticket ans
ticket_ans = list(ticket_dat.answer)
n_ticket_ans = len(ticket_ans)

In [35]:
# Model assumption: same embedding for all
all_docs = faq_ques + faq_ans + ticket_ques + ticket_ans
# Model assumption: two different embeddings
all_ans = faq_ans + ticket_ans

In [36]:
# create a dictionary storing the cut points for the four datasets so we can re-split them after.
# use like all_docs[id_dict['faq_ques']] to get all faq questions.
id_dict = {
    'faq_ques': range(0, n_faq_ques),
    'faq_ans': range(n_faq_ques, n_faq_ques + n_faq_ans),
    'ticket_ques': range(n_faq_ques + n_faq_ans, n_faq_ques + n_faq_ans + n_ticket_ques),
    'ticket_ans': range(n_faq_ques + n_faq_ans + n_ticket_ques, n_faq_ques + n_faq_ans + n_ticket_ques + n_ticket_ans)
}
all_docs_sep = {
    'faq_ques': faq_ques,
    'faq_ans': faq_ans,
    'ticket_ques': ticket_ques,
    'ticket_ans': ticket_ans}

In [59]:
all_ans_prepro = preprocess_docs_fn(all_ans)
ticket_ques_prepro = preprocess_docs_fn(ticket_ques)

In [192]:
print('Loading Word2vec model')
model_path = '../code/embedding/models/word2vec_ans.model'
w2v_ans = Word2Vec.load(model_path)
len(w2v_ans.wv.vocab)

Loading Word2vec model


13641

# Tfidf on separate datasets, w2v on separate datasets

### Average 5 most important words

In [127]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary

In [417]:
dct = Dictionary(all_ans_prepro)  # fit dictionary
corpus = [dct.doc2bow(line) for line in all_ans_prepro]  # convert corpus to BoW format
model_tfidf = TfidfModel(corpus)  # fit model
#vector = model_tfidf[corpus[0]]  # apply model to the first corpus document
#print(vector)

In [418]:
def top5(dat, corpus, dct, model_w2v, model_tfidf):
    if dat == 'faq_ans':
        ind = 0
        dat = all_ans_prepro[:199]
    elif dat == 'ticket_ans':
        ind = 199
        dat = all_ans_prepro[199:]
    else:
        ind = 0 
        dat = ticket_ques_prepro
    mean_ans = np.empty((len(dat), 128), dtype=float)
    for i in range(len(dat)):
        vector = model_tfidf[corpus[ind]]
        vector_s = sorted(vector, key=itemgetter(1), reverse=True)
        top5 = vector_s[:5]
        top5 = np.asarray(top5, dtype=int)[:,0]
        words = np.empty((len(top5), 128), dtype=float)
        for j in range(len(top5)):
            words[j] = model_w2v[dct[top5[j]]]
        mean_ans[i] = np.apply_along_axis(np.mean, 0, words)
        ind += 1
    return mean_ans

In [419]:
mean_faq_ans = top5(dat='faq_ans', corpus=corpus, dct=dct, model_w2v=model_w2v, model_tfidf=model_tfidf)
mean_faq_ans.shape, len(all_ans_prepro)



((199, 128), 4405)

### Weighted average over 5 most important words

In [420]:
def top5_average(dat, corpus, dct, model_w2v, model_tfidf):
    if dat == 'faq_ans':
        ind = 0
        dat = all_ans_prepro[:199]
    elif dat == 'ticket_ans':
        ind = 199
        dat = all_ans_prepro[199:]
    else:
        ind = 0 
        dat = ticket_ques_prepro
    mean_ans = np.empty((len(dat), 128), dtype=float)
    for i in range(len(dat)):
        vector = model_tfidf[corpus[ind]]
        vector_s = sorted(vector, key=itemgetter(1), reverse=True)
        top5 = vector_s[:5]
        top5 = np.asarray(top5, dtype=float)
        words = np.empty((len(top5), 128), dtype=float)
        for j in range(len(top5)):
            words[j] = model_w2v[dct[int(top5[j,0])]]
        mean_ans[i] = np.average(words, 0, weights=top5[:,1])
        ind += 1
    return mean_ans

In [421]:
mean_faq_ans = top5_average('faq_ans', corpus, dct, model_w2v, model_tfidf)
mean_faq_ans.shape, len(all_ans_prepro)



((199, 128), 4405)

### Weighted average over all vectors

In [422]:
def all_average(dat, corpus, dct, model_w2v, model_tfidf):
    if dat == 'faq_ans':
        ind = 0
        dat = all_ans_prepro[:199]
    elif dat == 'ticket_ans':
        ind = 199
        dat = all_ans_prepro[199:]
    else:
        ind = 0 
        dat = ticket_ques_prepro
    mean_ans = np.empty((len(dat), 128), dtype=float)
    for i in range(len(dat)):
        vector = np.asarray(model_tfidf[corpus[ind]], dtype=float)
        words = np.empty((len(vector), 128), dtype=float)
        for j in range(len(vector)):
            words[j] = model_w2v[dct[int(vector[j,0])]]
        mean_ans[i] = np.average(words, 0, weights=vector[:,1])
        ind += 1
    return mean_ans

In [423]:
mean_faq_ans = all_average('faq_ans', corpus, dct, model_w2v, model_tfidf)
mean_faq_ans.shape, len(all_ans_prepro)

  app.launch_new_instance()


((199, 128), 4405)

## Compute similarity for all the models

In [424]:
def compute_sim(mean_ticket_ans, mean_faq_ans):
    print('Computing word2vec similarity')

    # create matrix with cosine distances from all ticket ans to all faq ans
    sim_matrix = cosine_similarity(mean_faq_ans, mean_ticket_ans)

    # most similar faq - ticket mapping
    FAQ_per_ticket = np.argmax(sim_matrix, axis=0)
    strength_FAQ_ticket = np.max(sim_matrix, axis=0)

    # small similarities are set to a separate class
    thres = 0.2
    FAQ_per_ticket[strength_FAQ_ticket < thres] = -1

    # some stats
    n_unique = len(np.unique(FAQ_per_ticket))
    n_nonassigned = np.shape(FAQ_per_ticket[strength_FAQ_ticket < thres])[0]
    n_tickets = len(FAQ_per_ticket)
    # How many tickets each FAQ is assigned
    counts_per_faq = pd.Series(FAQ_per_ticket).value_counts()
    #print(counts_per_faq)

    output = {
        'classes': n_tickets,
        'mapping': FAQ_per_ticket
    }
    print(n_unique, 'classes, with ', round(n_nonassigned / n_tickets, 2), '% non assigned tickets')
    return output

In [425]:
print('AVERAGE 5 MOST IMPORTANT WORDS')
# create doc vector for tickets answers i.e. average over each ticket ans the word2vec vector for each word
mean_faq_ans = top5(dat='faq_ans', corpus=corpus, dct=dct, model_w2v=model_w2v, model_tfidf=model_tfidf)
# create doc vector for faq ans i.e. average over each faq ans the word2vec vector for each word
mean_ticket_ans = top5(dat='ticket_ans', corpus=corpus, dct=dct, model_w2v=model_w2v, model_tfidf=model_tfidf)
output = compute_sim(mean_ticket_ans, mean_faq_ans)

with open("../code/similarity/mappings/map_w2v_tfidf_5a.pkl", "wb") as fp:
    pickle.dump(output, fp)

AVERAGE 5 MOST IMPORTANT WORDS




Computing word2vec similarity
159 classes, with  0.0 % non assigned tickets


In [426]:
print('WEIGHTED AVERAGE OVER 5 MOST IMPORTANT WORDS')
mean_ticket_ans = top5_average(dat='ticket_ans', corpus=corpus, dct=dct, model_w2v=model_w2v, model_tfidf=model_tfidf)
mean_faq_ans = top5_average(dat='faq_ans', corpus=corpus, dct=dct, model_w2v=model_w2v, model_tfidf=model_tfidf)

output = compute_sim(mean_ticket_ans, mean_faq_ans)

with open("../code/similarity/mappings/map_w2v_tfidf_5w.pkl", "wb") as fp:
    pickle.dump(output, fp)

WEIGHTED AVERAGE OVER 5 MOST IMPORTANT WORDS




Computing word2vec similarity
156 classes, with  0.0 % non assigned tickets


In [427]:
print('WEIGHTED AVERAGE OVER ALL WORDS')
mean_ticket_ans = all_average(dat='ticket_ans', corpus=corpus, dct=dct, model_w2v=model_w2v, model_tfidf=model_tfidf)
mean_faq_ans = all_average(dat='faq_ans', corpus=corpus, dct=dct, model_w2v=model_w2v, model_tfidf=model_tfidf)

output = compute_sim(mean_ticket_ans, mean_faq_ans)

with open("../code/similarity/mappings/map_w2v_tfidf_all.pkl", "wb") as fp:
    pickle.dump(output, fp)

WEIGHTED AVERAGE OVER ALL WORDS


  app.launch_new_instance()


Computing word2vec similarity
105 classes, with  0.0 % non assigned tickets


## Classification 

In [437]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

In [438]:
# Load the tfidf model
model_path = '../code/embedding/models/tfidf_ticket_ques.model'
tfidf_ticket = TfidfModel.load(model_path)

# Load the Word2Vec model
model_path = '../code/embedding/models/word2vec_ticket_ques.model'
w2v_ticket = Word2Vec.load(model_path)

# Recompute dictionary and corpus
dct = Dictionary(ticket_ques_prepro)  # fit dictionary
corpus = [dct.doc2bow(line) for line in ticket_ques_prepro]

In [439]:
def classification(mean_ticket_ques, mapping):
    
    # RANDOM FOREST CLASSIFIER
    print('RANDOM FOREST CLASSIFIER')
    print('Running CV on Classifier...')
    classifier_CV = RandomForestClassifier()
    scores = cross_val_score(classifier_CV, mean_ticket_ques, mapping, cv=5)
    cv_score = scores.mean()
    print('Training Classifier...')
    classifier = RandomForestClassifier()
    classifier.fit(X=mean_ticket_ques, y=mapping)
    #dump(classifier, 'classifier/models/RF_word2vec.joblib')
    train_score = classifier.score(X=mean_ticket_ques, y=mapping)
    print('Training Score: {0} \n Cross Val Score: {1}'.format(train_score, cv_score))
    
    print('GRADIENT BOOSTING CLASSIEIR')
    print('Running CV on Classifier...')
    Bclassifier_CV = GradientBoostingClassifier()
    scores = cross_val_score(Bclassifier_CV, mean_ticket_ques, mapping, cv=5)
    cv_score = scores.mean()
    print('Training Classifier...')
    Bclassifier = GradientBoostingClassifier()
    Bclassifier.fit(X=mean_ticket_ques, y=mapping)
    #dump(classifier, 'classifier/models/RF_word2vec.joblib')
    train_score = Bclassifier.score(X=mean_ticket_ques, y=mapping)
    print('Training Score: {0} \nCross Val Score: {1}'.format(train_score, cv_score))

In [440]:
print('AVERAGE 5 MOST IMPORTANT WORDS')
with open('../code/similarity/mappings/map_w2v_tfidf_5a.pkl', 'rb') as fp:
    Classes = pickle.load(fp)
mapping = Classes['mapping']

mean_ticket_ques = top5('ticket_ques', corpus=corpus, dct=dct, model_w2v=w2v_ticket, model_tfidf=tfidf_ticket)

classification(mean_ticket_ques, mapping)

AVERAGE 5 MOST IMPORTANT WORDS
RANDOM FOREST CLASSIFIER
Running CV on Classifier...
Training Classifier...
Training Score: 0.9854969091773657 
 Cross Val Score: 0.07964483137596755
GRADIENT BOOSTING CLASSIEIR
Running CV on Classifier...
Training Classifier...
Training Score: 0.9155967665240133 
Cross Val Score: 0.07053478484620493


In [441]:
print('WEIGHTED AVERAGE OVER 5 MOST IMPORTANT WORDS')
with open('../code/similarity/mappings/map_w2v_tfidf_5w.pkl', 'rb') as fp:
    Classes = pickle.load(fp)
mapping = Classes['mapping']

mean_ticket_ques = top5_average('ticket_ques', corpus=corpus, dct=dct, model_w2v=w2v_ticket, model_tfidf=tfidf_ticket)

classification(mean_ticket_ques, mapping)

WEIGHTED AVERAGE OVER 5 MOST IMPORTANT WORDS
RANDOM FOREST CLASSIFIER
Running CV on Classifier...
Training Classifier...
Training Score: 0.9888254873989539 
 Cross Val Score: 0.07118744721095863
GRADIENT BOOSTING CLASSIEIR
Running CV on Classifier...
Training Classifier...
Training Score: 0.9210651450309082 
Cross Val Score: 0.07391828891184136


In [442]:
print('WEIGHTED AVERAGE OVER ALL WORDS')
with open('../code/similarity/mappings/map_w2v_tfidf_all.pkl', 'rb') as fp:
    Classes = pickle.load(fp)
mapping = Classes['mapping']

mean_ticket_ques = top5('ticket_ques', corpus=corpus, dct=dct, model_w2v=w2v_ticket, model_tfidf=tfidf_ticket)

classification(mean_ticket_ques, mapping)

WEIGHTED AVERAGE OVER ALL WORDS
RANDOM FOREST CLASSIFIER
Running CV on Classifier...
Training Classifier...
Training Score: 0.9835948644793152 
 Cross Val Score: 0.14091165026354704
GRADIENT BOOSTING CLASSIEIR
Running CV on Classifier...
Training Classifier...
Training Score: 0.8309557774607703 
Cross Val Score: 0.1413967456131142
