# Learning Word2Vec from work item titles only

In [1]:
import pandas as pd
import numpy as np
import pickle
import re
from tqdm import tqdm_pandas, tqdm_notebook

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import logging
# Emit INFO-level (and above) log records with a timestamp and level prefix.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Fetch the NLTK corpora used by stopwords.words(...) and WordNetLemmatizer further down.
nltk.download('stopwords')
nltk.download('wordnet')
# -1 is meant to switch off truncation of long cell text.
# NOTE(review): newer pandas releases expect None here instead of -1 — confirm against the installed version.
pd.set_option('display.max_colwidth', -1)
# Hooks tqdm into pandas so that Series.progress_apply (used below) is available.
# NOTE(review): tqdm_pandas / tqdm_notebook look deprecated in recent tqdm releases — confirm before upgrading.
tqdm_pandas(tqdm_notebook())



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saritwik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saritwik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [2]:
def remove_emails(text):
    """Replace every e-mail-like token in ``text`` with a single space.

    Args:
        text: input string.

    Returns:
        ``text`` with each match of the address pattern replaced by ``' '``.
    """
    # Raw string: "\w" and "\." are regex escapes, not Python string escapes.
    # Shape matched: local-part "@" first-label "." rest-of-domain.
    return re.sub(r'[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]', ' ', text)

remove_emails('please mail to ritwik.saxena@gmail.com and saritwik@microsoft.com')

'please mail to   and  '

In [3]:
def remove_ip(text):
    """Replace every dotted-quad IPv4 address in ``text`` with a single space.

    Each of the four octets must be in the range 0-255. The pattern is not
    anchored, so it matches wherever four such dot-separated groups occur.

    Args:
        text: input string.

    Returns:
        ``text`` with each matched address replaced by ``' '``.
    """
    # Raw string so that "\." reaches the regex engine as an escaped dot.
    return re.sub(
        r'(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)',
        ' ',
        text,
    )

remove_ip('Reply from 67.171.48.237: time=36ms')

'Reply from  : time=36ms'

In [4]:
def remove_guids(text):
    """Replace every GUID/UUID in ``text`` with a single space.

    The pattern expects the 8-4-4-4-12 hexadecimal layout, with the version
    digit in 1-5 and the variant digit in 8, 9, a or b. Matching is
    case-insensitive so that upper- and mixed-case GUIDs are removed too.

    Args:
        text: input string.

    Returns:
        ``text`` with each matched GUID replaced by ``' '``.
    """
    return re.sub(
        r'[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}',
        ' ',
        text,
        flags=re.IGNORECASE,
    )

remove_guids('omg this guid business f2384208-ed53-4f9d-96a8-7696861c2427 is so annoyingf2384208-ed53-4f9d-96a8-7696861c2427')

'omg this guid business   is so annoying '

In [5]:
def keep_only_alpha_numeric(text):
    """Collapse every run of non-word characters and underscores into one space.

    Args:
        text: input string.

    Returns:
        ``text`` in which each maximal run matching ``[\\W_]+`` is replaced by
        a single ``' '``; word characters other than ``_`` are left untouched.
    """
    # Raw string so that "\W" is a regex class, not an invalid string escape.
    return re.sub(r'[\W_]+', ' ', text)

keep_only_alpha_numeric('what the 234234 Hell is shit!!! sdlkfjsdlkfj 000-909-0.')

'what the 234234 Hell is shit sdlkfjsdlkfj 000 909 0 '

In [6]:
def remove_non_ascii(text):
    """Drop every character of ``text`` that cannot be encoded as ASCII.

    Non-ASCII characters are removed outright, not transliterated.
    """
    ascii_bytes = text.encode("ascii", errors="ignore")
    return ascii_bytes.decode()

remove_non_ascii('accéder.=====')

'accder.====='

In [7]:
def remove_numbers(words):
    """Blank out every word-character run that contains a digit.

    Each such run is replaced by a single space and the result is stripped of
    leading and trailing whitespace.
    """
    digit_token = re.compile(r'\w*\d\w*')
    without_digits = digit_token.sub(' ', words)
    return without_digits.strip()

remove_numbers('TFS.2013 is messed up')

'TFS.  is messed up'

In [8]:
def reduce_lengthening(text):
    """
    Shorten any run of three or more identical characters to exactly three
    of that character. Comparison is case-sensitive.
    """
    def _three_of(match):
        # group(1) is the repeated character captured by the pattern.
        return match.group(1) * 3

    return re.sub(r"(.)\1{2,}", _three_of, text)

reduce_lengthening('whaAAat is going onnnnn')

'whaAAat is going onnn'

In [9]:
def camel_case_split(text):
    """Split camelCase / PascalCase words and lower-case the whole string.

    Word boundaries are first marked with "_", then the text is lower-cased
    and every "_" (including any already present in ``text``) becomes a space.
    """
    # Pass 1: mark the boundary between any character and a following
    # capital letter that starts a run of lowercase letters.
    marked = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', text)
    # Pass 2: mark each remaining lowercase-or-digit -> uppercase boundary.
    marked = re.sub('([a-z0-9])([A-Z])', r'\1_\2', marked)
    lowered = marked.lower()
    return lowered.replace('_', ' ')

camel_case_split('This senetence will toLower and will handle camelCase, UpperCamelCase and also lowerCamelCase')
camel_case_split('[LabManagement]: LabServiceCryptography uses hardcoded key')

'[ lab management]:  lab service cryptography uses hardcoded key'

In [10]:
# English stop-word list from NLTK, held as a set for fast membership tests.
stops = set(stopwords.words("english"))


def remove_stop_words(text):
    """Drop every whitespace-separated token of ``text`` that is in ``stops``.

    The comparison is an exact, case-sensitive match on the whole token, so
    tokens with attached punctuation are kept. Surviving tokens are re-joined
    with single spaces.
    """
    return ' '.join(token for token in text.split() if token not in stops)

remove_stop_words('creating a work item is causing issues, due to metadata!!!')

'creating work item causing issues, due metadata!!!'

In [11]:
lemmer = WordNetLemmatizer()


def lemmatize(text, pos='n'):
    """Lemmatize every whitespace-separated token of ``text``.

    Args:
        text: input string.
        pos: part-of-speech tag handed to ``WordNetLemmatizer.lemmatize``.
            The default ``'n'`` matches what that method uses when no tag is
            given, so ``lemmatize(text)`` behaves as it did before.

    Returns:
        The lemmatized tokens joined with single spaces.
    """
    return ' '.join(lemmer.lemmatize(word, pos) for word in text.split())


def lemmatize_verbs(text):
    """Lemmatize every token of ``text`` using the ``'v'`` part-of-speech tag."""
    return lemmatize(text, pos='v')

print(lemmatize('creating a work item is causing issues , due to metadata please lover love loves !!!'))
print(lemmatize_verbs('creating a work item is causing issues , due to metadata lover love loves !!!'))
print(lemmatize_verbs('AllowReversePInvokeCallsAttribute'))

creating a work item is causing issue , due to metadata please lover love love !!!
create a work item be cause issue , due to metadata lover love love !!!
AllowReversePInvokeCallsAttribute


In [12]:
stemmer = SnowballStemmer('english')


def stem_words(text):
    """Stem every whitespace-separated token of ``text`` with ``stemmer``.

    Returns the stemmed tokens joined with single spaces.
    """
    return ' '.join(stemmer.stem(token) for token in text.split())

stem_words('abbrevation')

'abbrev'

In [13]:
def clean_title(text, stem=False):
    """Run a raw work-item title through the full cleaning pipeline.

    Args:
        text: the raw title string.
        stem: when truthy, the result is additionally passed through
            ``stem_words`` as the final step.

    Returns:
        The cleaned (and optionally stemmed) title.
    """
    # Order matters: e-mails, GUIDs and IPs are removed while their
    # punctuation is still intact, and stop words are removed only after
    # camel_case_split has lower-cased the text.
    pipeline = (
        remove_emails,
        remove_guids,
        remove_ip,
        keep_only_alpha_numeric,
        remove_numbers,
        remove_non_ascii,
        reduce_lengthening,
        camel_case_split,
        remove_stop_words,
        lemmatize,
        lemmatize_verbs,
    )
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return stem_words(cleaned) if stem else cleaned

In [14]:
%%time
import pickle

with open("raw_data.pickle", "rb") as raw_data:
    all_wits = pickle.load(raw_data)

Wall time: 14.4 s


In [15]:
# Ids of the work items whose team project is 'VSOnline'.
vso_wits = [
    wit_id
    for wit_id in all_wits
    if all_wits[wit_id].fields['System.TeamProject'] == 'VSOnline'
]
# One row per selected work item: its id and its title.
d = {
    'id': vso_wits,
    'title': [all_wits[wit_id].fields['System.Title'] for wit_id in vso_wits],
}
title_df = pd.DataFrame(d)

In [16]:
# (rows, columns) of the VSOnline title frame — a quick size check before the cleaning pass.
title_df.shape

(265670, 2)

In [17]:
# Clean every title in place (stemming stays off: clean_title's default), with a progress bar.
title_df['title'] = title_df['title'].progress_apply(clean_title)




In [18]:
from collections import Counter

# Corpus-wide token frequencies: filled by count_words, read by remove_rare_words.
cntr = Counter()


def count_words(text):
    """Add the whitespace-separated tokens of ``text`` to the global ``cntr``."""
    cntr.update(text.split())


def remove_rare_words(text, min_count=3):
    """Drop the tokens of ``text`` that are rare according to ``cntr``.

    Args:
        text: input string; tokens are split on whitespace.
        min_count: minimum number of occurrences recorded in ``cntr`` for a
            token to be kept. The default of 3 keeps the original cut-off
            (count greater than 2).

    Returns:
        The surviving tokens joined with single spaces.
    """
    return ' '.join(word for word in text.split() if cntr[word] >= min_count)
    
# Pass 1: tally every token across all titles (the per-row results are not kept).
title_df['title'].progress_apply(count_words)
# Pass 2: rewrite each title without the tokens that fell below the frequency cut-off.
title_df['title'] = title_df['title'].progress_apply(remove_rare_words)

HBox(children=(IntProgress(value=0, max=265670), HTML(value='')))




HBox(children=(IntProgress(value=0, max=265670), HTML(value='')))




In [19]:
def isNoise(text):
    """Return True when ``text`` has fewer than three whitespace-separated tokens."""
    token_count = len(text.split())
    return token_count < 3

# Flag the titles that isNoise() judges too short to be useful.
title_df['isNoise'] = title_df['title'].apply(isNoise)

In [20]:
# Keep only the rows not flagged as noise (element-wise comparison, hence `!= True`).
sentences = title_df.loc[title_df['isNoise'] != True]

In [21]:
# Preview only the first rows instead of rendering the whole filtered frame.
sentences.head(10)

Unnamed: 0,id,title,isNoise
11,1051,git tool newly add project show pending add glyph se refresh,False
12,1053,git tool solution section show meaningless dropdown repository number,False
13,1054,git tool solution section placement repository count suggest new open button open repos instead solution,False
14,1055,git tool file disappear change section several second save first time,False
15,1056,git tool undo change solution cause several error,False
16,1057,git tool new branch flyout input field label,False
17,1058,git tool checkout branch flyout doesnt field labled,False
18,1060,git tool merge flyout doesnt repository picker label,False
19,1061,git tool checkout branch picker enable repo select,False
20,1073,git tool work branch across multiple repos pain,False


# Clearly we need more cleaning: spell checking and better lemmas :) Stemming should be avoided and kept as a last resort; for now we will stem until we get pattern to work :(

In [28]:
# Split each cleaned title into its list of whitespace-separated tokens.
sentences['title'].map(str.split)

11        [git, tool, newly, add, project, show, pending, add, glyph, se, refresh]                                                                                                                                                      
12        [git, tool, solution, section, show, meaningless, dropdown, repository, number]                                                                                                                                               
13        [git, tool, solution, section, placement, repository, count, suggest, new, open, button, open, repos, instead, solution]                                                                                                      
14        [git, tool, file, disappear, change, section, several, second, save, first, time]                                                                                                                                             
15        [git, tool, undo, change, solution, cause, several, error]

# Let's create phrases and train word2vec

# expand sulochan to description and comments

# compare tfidf vs word2vec

# combine with pretrained word2vec