In [1]:
import nltk
# Fetch the NLTK resources this notebook relies on. 'stopwords' backs the
# stopwords.words('english') call in text_processing(); the tokenizer, tagger
# and WordNet data are presumably what TextBlob needs for pos_tags / lemma
# lookups in get_lemma_textblob() -- TODO confirm against the textblob docs.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
import pandas as pd
from nltk.corpus import stopwords
from textblob import Word, TextBlob
from collections import Counter
import six
import textblob
import gensim
import pickle
from multiprocessing import Pool
import numpy as np

[nltk_data] Downloading package stopwords to /home/r1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/r1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/r1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/r1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [19]:
def test_multicore():
    pool = Pool(8) # number of processes
    results = list(pool.map(get_lemma_textblob, corpus))
    pool.close()
    return(results)

def get_lemma_textblob(raw_text):
    """
    Lemmatize a raw string with TextBlob.

    The text is POS-tagged; every token is wrapped in a ``Word`` together with
    its tag, and the resulting lemma is lower-cased.

    :param raw_text: text to lemmatize
    :type raw_text: str
    :return: lower-cased lemmas, one per tagged token, in token order
    :rtype: list of str
    """
    lemmas = []
    for token, pos_tag in TextBlob(raw_text).pos_tags:
        lemmas.append(Word(token, pos_tag).lemma.lower())
    return lemmas

def filter_corpus(text_corpus, words_to_keep = [], words_to_filter = [], n_most_common=0, 
                  rare_words_threshold=10, replace_rare = False):
    """
    Filter out most common, rare, and user supplied words

    Tokens containing a digit are always dropped first; frequencies are then
    counted on what remains.

    :param text_corpus: a list of list of lemmas as a corpus
    :param words_to_keep: words that are never treated as common or rare
        (they are still removed if they also appear in ``words_to_filter``)
    :param words_to_filter: a collection of words to filter
    :param n_most_common: top n most frequent words to remove
    :param rare_words_threshold: a word is rare if its frequency is less than
        or equal to this threshold
    :param replace_rare: if true replace rare words using "UNK", else remove rare words
    :return: a list of list of lemmas as a corpus
    """
    print("Start filtering")
    # Set membership keeps the per-word checks below O(1); the original probed
    # the words_to_keep list once for every distinct word.
    keep = set(words_to_keep)
    # remove any token that contains digit
    text_corpus = [[word for word in document if not any(ch.isdigit() for ch in word)]
                   for document in text_corpus]
    counts = Counter(word for document in text_corpus for word in document)

    common_words = {word for word, _ in counts.most_common(n_most_common) if word not in keep}
    common_words.add("")  # empty tokens are always discarded
    rare_words = {word for word, freq in counts.items()
                  if freq <= rare_words_threshold and word not in keep}

    filtered = common_words | set(words_to_filter)
    if replace_rare:
        # Rare words survive as the placeholder token instead of being removed.
        text_corpus = [['UNK' if word in rare_words else word for word in document]
                       for document in text_corpus]
    else:
        filtered |= rare_words
    texts = [[str(word) for word in document if word not in filtered] for document in text_corpus]
    print("Done filtering")
    return texts

def text_processing(additional_stop_words = [], words_to_keep = [], n_most_common=100, 
                    rare_words_threshold=100, replace_rare=False, gram='unigram', bigram=10):
    """
    - load the pre-lemmatized documents from ``data/glassdoor_lemmatized.pickle``
    - remove stop words, common words and rare words
    - get n_gram tokens

    :param additional_stop_words: tailor-made words to remove for the corpus,
        added to NLTK's English stop words
    :type additional_stop_words: list of str
    :param words_to_keep: words protected from the common/rare filters
    :type words_to_keep: list of str
    :param n_most_common,rare_words_threshold,replace_rare: same with function filter_corpus()
    :param gram: 'unigram', 'bigram', 'trigram', phrases length using gensim
    :type gram: str
    :param bigram: threshold for gensim Phrases; only applied when gram == 'bigram'
        (the trigram branch uses gensim's default threshold)
    :type bigram: int
    :return: clean tokenized documents
    :rtype: list of list
    :raises ValueError: if ``gram`` is not one of the three supported values
    """
    # Fail fast: the original silently returned None for an unknown gram, and
    # only after loading and filtering the whole corpus.
    if gram not in ('unigram', 'bigram', 'trigram'):
        raise ValueError("gram must be 'unigram', 'bigram' or 'trigram', got {!r}".format(gram))

    print('Loading lemmatized documents')
    # NOTE: unpickling can execute arbitrary code; only load files produced by
    # this project. The with-block closes the handle the original leaked.
    with open("data/glassdoor_lemmatized.pickle", 'rb') as pickle_file:
        lemma_documents = pickle.load(pickle_file)
    print('Cleaninng')
    stop_words = stopwords.words('english') + list(additional_stop_words)
    clean_documents = filter_corpus(lemma_documents, words_to_keep, stop_words, n_most_common,
                                    rare_words_threshold, replace_rare)
    print('N_gram')
    if gram == 'unigram':
        return clean_documents
    if gram == 'bigram':
        bigram_transformer = gensim.models.Phrases(clean_documents, min_count=1, threshold=bigram)
        return list(bigram_transformer[clean_documents])
    # gram == 'trigram': apply phrase detection twice
    bigram_transformer = gensim.models.Phrases(clean_documents, min_count=1)
    bigram_documents = list(bigram_transformer[clean_documents])
    trigram_transformer = gensim.models.Phrases(bigram_documents, min_count=1)
    return list(trigram_transformer[bigram_documents])

def similar_words(word2vec_model, dimension, n=5):
    """
    Collect the words most similar to each seed of one dimension.

    Seeds are read from the module-level ``seed_words`` dictionary. A seed is
    either a single word, queried with ``similar_by_word``, or a list of
    words, queried together with ``most_similar`` after dropping the entries
    that are not in the model vocabulary.

    :param word2vec_model: gensim word2vec model
    :param dimension: key of ``seed_words`` whose seeds are expanded
    :type dimension: str
    :param n: the most n similar words per seed
    :type n: int
    :return: (similar_word, similarity) tuples, grouped in seed order
    :rtype: list of tuple
    """
    neighbours = []
    for seed in seed_words[dimension]:
        if isinstance(seed, list):
            # make sure every word in seed word list is in word2vec corpus
            in_vocab = [item for item in seed if item in word2vec_model.vocab]
        try:
            if isinstance(seed, list):
                pairs = word2vec_model.most_similar(in_vocab, topn=n)
            else:
                pairs = word2vec_model.similar_by_word(seed, topn=n)
        except (KeyError, ValueError):
            # Best effort, as before: a seed missing from the vocabulary
            # (KeyError) or a seed list with no known word (ValueError) is
            # skipped. Any other error now propagates instead of being hidden
            # by a bare ``except``.
            continue
        neighbours.extend(pairs)
    return neighbours

def expand_words(word2vec_model, n=50, restrict=None):
    """
    Expand every dimension of the module-level ``seed_words`` dictionary.

    For each dimension, the seeds present in the model vocabulary are passed
    together to ``most_similar`` and the ``n`` returned words are kept
    (similarity scores are discarded).

    :param word2vec_model: gensim word2vec model
    :param n: number of similar words requested per dimension
    :type n: int
    :param restrict: optional fraction of the vocabulary size; it is converted
        to a word count and forwarded as gensim's ``restrict_vocab``
    :type restrict: float or None
    :return: mapping of dimension name to its list of expanded words; a
        dimension with no seed in the vocabulary maps to an empty list
    :rtype: dict
    """
    restrict_vocab = None
    if restrict is not None:
        restrict_vocab = int(len(word2vec_model.vocab) * restrict)

    expanded_words = {}
    for dimension, seeds in seed_words.items():
        dimension_words = [word for word in seeds if word in word2vec_model.vocab]
        if not dimension_words:
            # Querying with no input raises in gensim; report "nothing found"
            # for this dimension instead of aborting the whole expansion.
            expanded_words[dimension] = []
            continue
        neighbours = word2vec_model.most_similar(dimension_words, topn=n,
                                                 restrict_vocab=restrict_vocab)
        expanded_words[dimension] = [word for word, _ in neighbours]
    return expanded_words
                
def train_word2vec_model(documents, fname, min_count=1, size=100, window=5, workers=16):
    """
    Fit a gensim Word2Vec model on tokenized documents and save it.

    :param documents: tokenized documents handed to ``gensim.models.Word2Vec``
    :param fname: path the model is saved to
    :param min_count,size,window,workers: forwarded unchanged to ``Word2Vec``
    :return: the model, after ``init_sims(replace=True)`` has been called on it
    """
    print('Building word2vec model')
    hyperparameters = dict(min_count=min_count, size=size, window=window, workers=workers)
    word2vec = gensim.models.Word2Vec(documents, **hyperparameters)
    word2vec.save(fname)
    # init_sims runs after save(), so it affects the returned object only
    word2vec.init_sims(replace=True)
    return word2vec


# def output(expanded_words_list, seed_words, n, fname):
#     combined_list = {}
#     for dimension in seed_words:
#         combined_list[dimension] = []
#     for item in expanded_words_list:
#         for k,v in item.items():
#             for expanded_word_tuple in v:
#                 current_expanded = [tup[0]for tup in combined_list[k]]
#                 if expanded_word_tuple[0] not in current_expanded:
#                     combined_list[k].append(expanded_word_tuple)
#     with open(fname, 'w') as text_file:
#         for k in combined_list:
#             combined_list[k].sort(key=lambda tup: tup[1], reverse=True)
#             combined_list[k] = [tup[0] for tup in combined_list[k]]
#             if len(combined_list[k]) >= n:
#                 combined_list[k] = combined_list[k][0:n]
#                 print("The dimension is: {} {} words".format(k, n), file=text_file)
#                 print("===========================================================================", file=text_file)
#                 print(combined_list[k], file=text_file)
#                 print('\n', file=text_file)
#             else:
#                 print("The dimension is: {}. Less than {}words, output {}.".format(k, n, len(combined_list[k])), file=text_file)
#                 print("===========================================================================", file=text_file)
#                 print(combined_list[k], file=text_file)
#                 print('\n', file=text_file)
#     return combined_list

def output_result(expanded_words, seed_words, n, fname):
    """
    Rank the expanded words of every dimension and write a text report.

    :param expanded_words: mapping of dimension to (word, similarity) tuples;
        every key must also be a key of ``seed_words``
    :param seed_words: mapping whose keys define the reported dimensions
    :param n: maximum number of words reported per dimension
    :param fname: path of the text file to write
    :return: mapping of dimension to its words, most similar first, cut to ``n``
    :rtype: dict
    """
    ranked = {dimension: [] for dimension in seed_words}
    for dimension, pairs in expanded_words.items():
        ranked[dimension].extend(pairs)
        ranked[dimension].sort(key=lambda pair: pair[1], reverse=True)

    separator = "==========================================================================="
    with open(fname, 'w') as report:
        for dimension in ranked:
            words = [pair[0] for pair in ranked[dimension]]
            if len(words) >= n:
                words = words[0:n]
                heading = "The dimension is: {} {} words".format(dimension, n)
            else:
                heading = "The dimension is: {}. Less than {}words, output {}.".format(
                    dimension, n, len(words))
            ranked[dimension] = words
            print(heading, file=report)
            print(separator, file=report)
            print(words, file=report)
            print('\n', file=report)
    return ranked

def output_dimension_result(expanded_words, n, fname):
    """
    Write one section per dimension to a text file.

    :param expanded_words: mapping of dimension to the value printed for it
    :param n: number shown in each section heading (not used to truncate)
    :param fname: path of the text file to write
    """
    separator = "==========================================================================="
    with open(fname, 'w') as report:
        for dimension, words in expanded_words.items():
            heading = "The dimension is: {} {} expanded words".format(dimension, n)
            # One print emits heading, rule, words and the blank-line padding.
            print(heading, separator, words, '\n', sep='\n', file=report)
    
            

### Combine `pros`, `cons`, and `feedback` into one review, lemmatize the reviews, and save the results as `glassdoor_lemmatized.pickle` and `glassdoor_index.pickle`

In [9]:
# raw_data = pd.read_csv('data/glassdoor_reviews_only.csv').fillna('')
# raw_data['review'] = raw_data.pros + ' ' + raw_data.cons + ' ' + raw_data.feedback
# corpus = raw_data['review'].tolist()
# lemmatized = test_multicore()
# index = raw_data['FK_reviewId'].tolist()
# pickle.dump(lemmatized, file=open("data/glassdoor_lemmatized.pickle", 'wb'))
# pickle.dump(index, file=open('data/glassdoor_index.pickle', 'wb'))

### Get expanded words

In [10]:
# Seed vocabulary: one list of seed words per dimension.
seed_words = {
    "integrity": [
        "integrity", "ethics", "accountability", "trust", "honesty",
        "responsibility", "fairness", "transparency",
        "ownership", "fair", "honest",
        "ethical", "transparent",
    ],
    "teamwork": [
        "teamwork", "collaboration", "cooperation", "collaborative", "cooperative",
    ],
    "innovation": [
        "innovation", "creativity", "excellence", "improvement", "passion",
        "pride", "leadership", "growth", "performance", "efficiency",
        "efficient", "results", "result", "innovative", "creative",
    ],
    "respect": [
        "respect", "diversity", "inclusion", "development",
        "talent", "employees", "employee",
        "dignity", "empowerment",
    ],
    "quality": [
        "quality", "customer", "meet_needs", "meet_need",
        "commitment", "make_a_difference", "dedication",
        "value", "exceed_expectations", "exceed_expectation",
    ],
    "safety": ["safety", "health", "healthy", "work_life_balance", "flexibility"],
    "community": ["community", "environment", "caring", "citizenship"],
    "communication": ["communication", "openness"],
    "hard_work": ["hard_work", "reward", "fun", "energy"],
}

# Every string seed, in dictionary order (non-string entries are skipped).
words_to_keep = [
    seed
    for seed_list in seed_words.values()
    for seed in seed_list
    if isinstance(seed, str)
]

# Extra tokens passed to text_processing() as additional stop words.
# NOTE(review): 'openness' is also a "communication" seed above -- confirm
# that removing it from the corpus is intended.
words_to_remove = [
    'rx', 'chmp', 'spse', '-results', 'cia', 'openness', 'and/or', 'apb', 'cpt',
    'qt', 'cte', 'mkg', 'nhs', '1350', 'rep', 'iplex', 'hap-tm-', 'hap',
]



In [11]:
# %%time
# # after adding words_to_keep unigram result
# documents = text_processing(additional_stop_words=words_to_remove, words_to_keep=words_to_keep)
# expanded_words_unigram = expand_seed_words(documents=documents, fname='data/glassdoor_unigram')

# pickle.dump(expanded_words_unigram, file=open("data/glassdoor_expanded_unigram.pickle", 'wb'))
# del documents

In [31]:
%%time
# after adding words_to_keep bigram result phrase threshold=10
#documents = text_processing(additional_stop_words=words_to_remove, words_to_keep=words_to_keep, n_most_common=100, 
#                            rare_words_threshold=100, replace_rare=False, gram='bigram', bigram=10)
#model = train_word2vec_model(documents=documents, fname='data/glassdoor_bigram_10')

expanded_words_100 = expand_words(word2vec_model=model, n=50, restrict=None)
expanded_words_10 = expand_words(word2vec_model=model, n=50, restrict=0.1)

output_dimension_result(expanded_words_100, fname='data/glassdoor_expanded_words_100.txt', n=50)
output_dimension_result(expanded_words_10, fname='data/glassdoor_expanded_words_10.txt', n=50)
#pickle.dump(expanded_words_bigram_10, file=open("data/glassdoor_expanded_bigram_10.pickle", 'wb'))
#del documents

CPU times: user 768 ms, sys: 1.32 s, total: 2.08 s
Wall time: 71.9 ms


In [21]:
# Histogram of corpus counts: frequency[c] is the number of vocabulary entries
# whose `count` attribute equals c. Counter replaces the hand-rolled
# "increment or initialise" loop and yields 0 for counts that never occur.
frequency = Counter(model.vocab[word].count for word in model.vocab)

In [22]:
print(len(model.vocab))
# Number of vocabulary entries with a count of 1 through 6. `.get` avoids the
# KeyError the chained frequency[1] + ... + frequency[6] raised whenever one
# of those counts was absent.
sum(frequency.get(count, 0) for count in range(1, 7))

200188


126085

In [32]:
# Per dimension, the words that appear in exactly one of the two expansions
# (restricted vs. unrestricted vocabulary).
symmetric_difference = {
    dimension: set(expanded_words_10[dimension]) ^ set(expanded_words_100[dimension])
    for dimension in seed_words
}
# NOTE(review): n=50 only feeds the section heading; the sets written here can
# hold a different number of words -- confirm the label is what you want.
output_dimension_result(symmetric_difference, fname='data/glassdoor_expanded_symmetric_difference.txt', n=50)

In [4]:
# %%time
# # after adding words_to_keep bigram result phrase threshold=1
# documents = text_processing(additional_stop_words=words_to_remove, words_to_keep=words_to_keep, n_most_common=100, 
#                     rare_words_threshold=100, replace_rare=False, gram='bigram', bigram=1)
# expanded_words_bigram_1 = expand_seed_words(documents=documents, fname='data/glassdoor_bigram_1')

# pickle.dump(expanded_words_bigram_1, file=open("data/glassdoor_expanded_bigram_1.pickle", 'wb'))
# del documents