In [11]:
# The Usual Suspects
import pandas as pd
import numpy as np

from gensim import corpora
from gensim.parsing.preprocessing import preprocess_documents
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

## Import Data

In [2]:
ticket_dat = pd.read_csv('../data/ticket_dat.csv')
faq_dat = pd.read_csv('../data/faq_dat.csv')

In [3]:
# Replace the NaNs
ticket_dat.fillna('', inplace=True)
faq_dat.fillna('', inplace=True)

In [4]:
ticket_dat.head()

Unnamed: 0,ticket_id,article_id,ticket_id.1,subject,content_original,content_cleaned,lang,content_translated
0,18014685,2177823,424446,Registrierung als Hörer,Sehr geehrte Damen und Herren\n\n \n\nIch habe...,Sehr geehrte Damen und Herren\n\n \n\nIch habe...,de,"Dear Ladies and Gentlemen, I still have about ..."
1,18014685,2177987,424446,Re: [ID#18014685] Registrierung als Hörer,Sehr geehrter Herr Daniel Aeppli \n\nGlücklich...,Sehr geehrter Herr Daniel Aeppli \n\nGlücklich...,de,"Dear Mr. Daniel Aeppli Fortunately, you have a..."
2,18014685,2178318,424446,Re: [ID#18014685] Registrierung als Hörer,Guten Tag Herr Greutee\n\nBesten Dank.\n\nDie ...,Guten Tag Herr Greutee\n\nBesten Dank.\n\nDie ...,de,Hello Mr. Greutee Thank you. The details: 25.0...
3,18014685,2178343,424446,Re: [ID#18014685] Registrierung als Hörer,Sehr geehrter Herr Daniel Aeppli\n \nIch habe ...,Sehr geehrter Herr Daniel Aeppli\n \nIch habe ...,de,Dear Mr. Daniel Aeppli I have sent you a passw...
4,18014685,2326946,424446,Re: [ID#18014685] Registrierung als Hörer,Guten Tag Herr Greuter\n\nMit dem abgeänderten...,Guten Tag Herr Greuter\n\nMit dem abgeänderten...,de,Hello Mr. Greuter With the changed password I ...


In [5]:
faq_dat.head()

Unnamed: 0,faq_id,lang,ques,ans_title,ans_content,ans_content_translated,ans_title_translation,ques_content_translation
0,8502,en,Received a phishing mail?,Phishing Mail warning,This is a phishing mail. You should delete it....,This is a phishing mail. You should delete it....,Phishing Mail warning,Received a phishing mail?
1,8503,en,Lockout on Account,My account is locked,You had a lockout on your account because you ...,You had a lockout on your account because you ...,My account is locked,Lockout on Account
2,8506,en,Blocking mailsender,Block a mailsender,You can add the sender on your personal blackl...,You can add the sender on your personal blackl...,Block a mailsender,Blocking mailsender
3,8509,en,Credit overdrawn,I can't print anymore,Your credit is 6.90.- in minus. You have to lo...,Your credit is 6.90.- in minus. You have to lo...,I can't print anymore,Credit overdrawn
4,8509,en,Credit overdrawn,Why is my account in minus,The reason why your account is minus is the fo...,The reason why your account is minus is the fo...,Why is my account in minus,Credit overdrawn


## Embedding Based on All Data

### Preprocessing

In [55]:
from gensim.parsing.preprocessing import preprocess_string
from gensim.parsing.preprocessing import DEFAULT_FILTERS

Function's that are applied are the defaults plus some customs. 

(more details at https://radimrehurek.com/gensim/parsing/preprocessing.html#gensim.parsing.preprocessing.preprocess_string) 

Defaults are:


In [82]:
DEFAULT_FILTERS

[<function gensim.parsing.preprocessing.<lambda>>,
 <function gensim.parsing.preprocessing.strip_tags>,
 <function gensim.parsing.preprocessing.strip_punctuation>,
 <function gensim.parsing.preprocessing.strip_multiple_whitespaces>,
 <function gensim.parsing.preprocessing.strip_numeric>,
 <function gensim.parsing.preprocessing.remove_stopwords>,
 <function gensim.parsing.preprocessing.strip_short>,
 <function gensim.parsing.preprocessing.stem_text>]

In [199]:
import re
import string

In [354]:
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import stem_text
from gensim.parsing.preprocessing import strip_numeric


def remove_ip(s):
    # Replace all ip adresses with '<ip>' tag
    ip_regexp = r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"
    return re.sub(ip_regexp, '<ip>', s)


def remove_email(s):
    # Replace all email adresses with '<email>' tag
    email_regexp = r"([a-zA-Z0-9_\-\.]+)@([a-zA-Z0-9_\-\.]+)\.([a-zA-Z]{2,5})"
    return re.sub(email_regexp, '<email>', s)

def remove_mailto(s):
    # Replace all "<mailto:<email>>" with <email>. Email adresses should be replaced by remove_email first.
    return s.replace("<mailto:<email>>", "<email>")

def remove_url(s):
    # Replace all url's with '<url>' tag
    url_regexp=r"((http|ftp|https):\/\/)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
    s = re.sub(url_regexp, '<url>', s)
    # Sometimes url's are inside <> so we need to replace <<url>> with <url>
    return s.replace("<<url>>", "<url>")

def remove_punc(s, exceptions):
    # Remove all punctuation from string with exceptions in list exceptions
    remove = string.punctuation
    for exception in exceptions:
        remove = remove.replace(exception, "") 
    # Create the pattern
    pattern = r"[{}]".format(remove) 

    return re.sub(pattern, "", s) 

def lower_case(s):
    return s.lower()
    

def preprocess_sentence_fn(s):
    punc_exceptions = ['<', '>']
    filters = [remove_ip, 
               remove_email,
               remove_mailto,
               remove_url,
               lambda x: remove_punc(x, punc_exceptions),
               remove_stopwords,
               strip_multiple_whitespaces, 
               lower_case,
               stem_text,
               strip_numeric]
    out = preprocess_string(s, filters = filters)
    
    return out

def preprocess_docs_fn(docs):
    
    return [preprocess_sentence_fn(s) for s in docs]
    

Example of a few sentences

In [355]:
preprocess_docs_fn(['hey there DElilah! 99 try <https://regex101.com/> or facebook.com', 
                    'hey brianregan@gmail.eth.com, how is my friends <mailto:person@email.com>',
                    'hey my IP adresses is 129.01.001.01'])

[['hei', 'delilah', 'try', '<url>', '<url>'],
 ['hei', '<email>', 'friend', '<email>'],
 ['hei', 'ip', 'adress', '<ip>']]

In [356]:
faq_ques = list(faq_dat.ques_content_translation)
faq_ques_docs = preprocess_docs_fn(faq_ques)

faq_ans = list(faq_dat.ans_content_translated)
faq_ans_docs = preprocess_docs_fn(faq_ans)

ticket_content = list(ticket_dat.content_translated)
ticket_content_docs = preprocess_docs_fn(ticket_content)

all_docs = faq_ques_docs + faq_ans_docs + ticket_content_docs

### Word Embedding

In [357]:
path = get_tmpfile("word2vec.model")

model = Word2Vec(all_docs, size=128, window=5, min_count=1, workers=4)
model.save("word2vec.model")

In [377]:
query_words = "inbox"
prepro_query = preprocess_sentence_fn(query_words)
print('Results for "{0}", processed to {1}'.format(query_words, prepro_query))
model.wv.most_similar(positive = prepro_query)

Results for "inbox", processed to ['inbox']


[('rule', 0.9297240376472473),
 ('archiv', 0.9043952226638794),
 ('delet', 0.8946067094802856),
 ('filter', 0.8925820589065552),
 ('spell', 0.8921395540237427),
 ('spam', 0.8892513513565063),
 ('text', 0.8873597383499146),
 ('owa', 0.8873069286346436),
 ('larger', 0.8817001581192017),
 ('specifi', 0.8793443441390991)]

### Doc Embedding

In [561]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(all_docs)]
doc_model = Doc2Vec(tagged_documents, vector_size=128, window=2, min_count=1, workers=4, epochs=100)

In [655]:
query_doc = ticket_dat.content_translated[74]
print(query_doc)

Dear Dr. Sebastiano Rossi
 
Your vpn service is already expired.

sincerely,
Rudolf Friederich




In [656]:
prepro_query_doc = preprocess_sentence_fn(query_doc)
print(' '.join(prepro_query_doc) + "\n ==============")

query_doc_vec = doc_model.infer_vector(prepro_query_doc, steps=10000)
most_similar_docs = doc_model.docvecs.most_similar([query_doc_vec])
for similar_doc in most_similar_docs[0:4]:
    print('[{}] '.format(round(similar_doc[1], 5))+' '.join(tagged_documents[similar_doc[0]].words) + "\n")

dear dr sebastiano rossi your vpn servic expir sincer rudolf friederich
[0.89225] dear dr sebastiano rossi your vpn servic expir sincer rudolf friederich

[0.67479] dear dr jochen müller we extend vpn servic month octob sincer rudolf friederich

[0.64669] dear mr david john gebhardt the vpn servic extend delet we reenabl vpn servic month octob other servic end decemb sincer rudolf friederich

[0.63549] dear mr neil docherti we servic extend year sincer rudolf friederich



In [410]:
print(most_similar_docs)

[(2160, 0.5896463394165039), (16122, 0.5832903981208801), (1127, 0.5809382200241089), (2156, 0.5793976783752441), (6953, 0.5733083486557007), (9548, 0.5608631372451782), (6463, 0.5576547384262085), (10449, 0.5530002117156982), (17964, 0.5506421327590942), (9037, 0.5496723651885986)]
