In [1]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models import Word2Vec
import pickle
import numpy as np



In [2]:
from gensim.parsing.preprocessing import preprocess_string
import re
import string
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import strip_numeric
def remove_ip(s):
    # Replace all ip adresses with '<ip>' tag
    ip_regexp = r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"
    return re.sub(ip_regexp, '<ip>', s)
def remove_email(s):
    # Replace all email adresses with '<email>' tag
    email_regexp = r"([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})"
    return re.sub(email_regexp, '<email>', s)
def remove_mailto(s):
    # Replace all "<mailto:<email>>" with <email>. Email adresses should be replaced by remove_email first.
    return s.replace("<mailto:<email>>", "<email>")
def remove_url(s):
    # Replace all url's with '<url>' tag
    url_regexp = r"((http|ftp|https):\/\/)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
    s = re.sub(url_regexp, '<url>', s)
    # Sometimes url's are inside <> so we need to replace <<url>> with <url>
    return s.replace("<<url>>", "<url>")
def remove_punc(s, exceptions):
    # Remove all punctuation from string with exceptions in list exceptions
    remove = string.punctuation
    for exception in exceptions:
        remove = remove.replace(exception, "")
    # Create the pattern
    pattern = r"[{}]".format(remove)

    return re.sub(pattern, "", s)
def remove_custom_stopwords(s, stopwords):
    for stopword in stopwords:
        s = s.replace(stopword, "")
    return s
def lower_case(s):
    return s.lower()
def preprocess_sentence_fn(s):
    # Preprocess a single sentence to a list of tokens
    punc_exceptions = ['<', '>']
    custom_stopwords = ['dear', 'sincerely', 'thanks', 'yours', 'regards']
    filters = [lower_case,
               remove_ip,
               remove_email,
               remove_mailto,
               remove_url,
               lambda x: remove_punc(x, punc_exceptions),
               remove_stopwords,
               lambda x: remove_custom_stopwords(x, custom_stopwords),
               strip_multiple_whitespaces,
               stem_text,
               strip_numeric]
    out = preprocess_string(s, filters=filters)
    return out
def preprocess_docs_fn(docs):
    # Apply preprocess_sentence_fn to a list of sentances (docs) to get a list of lists
    return [preprocess_sentence_fn(s) for s in docs]

In [3]:
with open("../../code/embedding/models/doc_data/id_dict.txt", "rb") as fp:
    id_dict = pickle.load(fp)
    
ticket_ans_ids = np.array(id_dict['ticket_ans'])
all_faq_ans = id_dict['faq_ans']

print('Loading Document Data...')
# Unpickle the document data
with open("../../code/embedding/models/doc_data/all_docs.txt", "rb") as fp:
    all_docs = pickle.load(fp)

print('Loading Model...')
model_path = '../../code/embedding/models/word2vec.model'
model = Word2Vec.load(model_path)

all_docs_prepro = preprocess_docs_fn(all_docs)

Loading Document Data...
Loading Model...


In [4]:
print(len(all_docs_prepro))
print(id_dict)

9054
{'faq_ques': range(0, 277), 'faq_ans': range(277, 554), 'ticket_ques': range(554, 4804), 'ticket_ans': range(4804, 9054)}


In [22]:
print(id_dict['ticket_ans'][0])

4804


In [23]:
mean_ticket_ans = np.empty((len(id_dict['ticket_ans']), 128),dtype = float)
for j in id_dict['ticket_ans']:
    sentence = all_docs_prepro[j]
    words = np.empty((len(sentence), 128), dtype = float)
    for i in range(len(sentence)):
        words[i] = model[sentence[i]]
    mean_ticket_ans[j-id_dict['ticket_ans'][0]] = np.apply_along_axis(np.mean, 0, words)

  
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [24]:
mean_faq_ans = np.empty((len(id_dict['faq_ans']), 128),dtype = float)
for j in id_dict['faq_ans']:
    sentence = all_docs_prepro[j]
    words = np.empty((len(sentence), 128), dtype = float)
    for i in range(len(sentence)):
        words[i] = model[sentence[i]]
    mean_faq_ans[j-id_dict['faq_ans'][0]] = np.apply_along_axis(np.mean, 0, words)

  


In [33]:
def doc_emb(name):
    mean_ans = np.empty((len(id_dict[name]), 128),dtype = float)
    for j in id_dict[name]:
        sentence = all_docs_prepro[j]
        words = np.empty((len(sentence), 128), dtype = float)
        for i in range(len(sentence)):
            words[i] = model[sentence[i]]
        mean_ans[j-id_dict[name][0]] = np.apply_along_axis(np.mean, 0, words)
    return mean_ans

In [34]:
mean_faq_ans = doc_emb('faq_ans')
mean_faq_ans

  import sys


array([[ 0.58662936,  0.63210138, -0.03319168, ..., -0.51580547,
        -0.26830086,  0.75055616],
       [ 0.29912805,  0.46380159, -0.28621869, ..., -0.8283119 ,
        -0.53647751,  0.9114836 ],
       [ 0.42702867,  0.71347047, -0.16933884, ..., -0.21429505,
        -0.19091241,  0.76486944],
       ...,
       [ 0.38859673,  0.57834958, -0.32152405, ..., -0.68374868,
        -0.62558026,  0.93899817],
       [ 0.23171848,  0.35882801, -0.27480772, ..., -0.54559615,
        -0.17777857,  0.63770821],
       [ 0.28393199,  0.48404811, -0.35095383, ..., -0.26550584,
        -0.05493604,  0.67021711]])

In [25]:
from scipy.spatial import distance

In [26]:
ticket_faq_dists = np.empty((len(mean_ticket_ans),len(mean_faq_ans)), dtype = float)
for i in range(len(mean_ticket_ans)):
    for j in range(len(mean_faq_ans)):
        ticket_faq_dists[i,j] = distance.cosine(mean_ticket_ans[i], mean_faq_ans[j])

In [27]:
ticket_faq_map = np.argmin(ticket_faq_dists, axis=1)

In [28]:
ticket_faq_map

array([34, 34, 34, ..., 34, 34, 34], dtype=int32)

In [None]:
big_dist = [ticket_faq_dists.min(axis=1) > 0.7]
ticket_faq_map[big_dist] = -1 # Set all thresholded distances to have label -1

with open("similarity/mappings/ticket_faq_map_word2vec_cosine.txt", "wb") as fp:
    pickle.dump(ticket_faq_map, fp)

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
sim_matrix = cosine_similarity(mean_ticket_ans, mean_faq_ans)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [50]:
print(np.nonzero(np.isnan(mean_ticket_ans)))
print(np.count_nonzero(np.isnan(mean_faq_ans)))

(array([1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291,
       1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291,
       1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291,
       1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291,
       1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291,
       1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291,
       1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291,
       1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291,
       1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291,
       1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291,
       1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291, 1291,
       1291, 1291, 1291, 1291, 1291, 1291, 1291, 1564, 1564, 1564, 1564,
       1564, 1564, 1564, 1564, 1564, 1564, 1564, 1564, 1564, 1564, 1564,
       1564, 1564, 1564, 1564, 1564, 1564, 1564, 1

In [48]:
for i in range(len(mean_ticket_ans)):
    zero = np.count_nonzero(mean_ticket_ans[i])
    if zero != 128: 
        print(i)

In [49]:
for i in range(len(mean_faq_ans)):
    zero = np.count_nonzero(mean_ticket_ans[i])
    if zero != 128: 
        print(i)