In [5]:
# Standard library
import itertools
import pickle
from collections import Counter

# Third-party
import gensim
import nltk
import numpy as np
import pandas as pd
import six
import textblob
from nltk.corpus import stopwords
from textblob import Word, TextBlob

# Fetch the NLTK resources requested by this notebook, in the same order as before.
for nltk_resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(nltk_resource)


[nltk_data] Downloading package stopwords to /home/r1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/r1/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/r1/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/r1/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Helper Functions

In [7]:
def get_lemma_textblob(raw_text):
    """
    Lemmatize a raw text with TextBlob, feeding each token's POS tag to ``Word``.
    :param raw_text: text to tag and lemmatize
    :type raw_text: str
    :return: lower-cased lemma of every entry of ``TextBlob(raw_text).pos_tags``, in order
    :rtype: list of str
    """
    lemmas = []
    for tagged_token in TextBlob(raw_text).pos_tags:
        # each entry is indexed as (token text, POS tag)
        token_text, pos = tagged_token[0], tagged_token[1]
        lemmas.append(Word(token_text, pos).lemma.lower())
    return lemmas

def filter_corpus(text_corpus, words_to_keep=None, words_to_filter=None, n_most_common=0,
                  rare_words_threshold=10, replace_rare=False):
    """
    Filter out the most common, rare, and user supplied words.

    Tokens without any alphabetic character and empty tokens are always dropped.
    ``words_to_keep`` only protects a word from the "most common" and "rare"
    rules; a word that is also listed in ``words_to_filter`` is still removed.

    :param text_corpus: a list of list of lemmas as a corpus
    :param words_to_keep: words never treated as common or rare (default: none)
    :param words_to_filter: words to remove unconditionally (default: none)
    :param n_most_common: the top n most frequent words are removed; a kept word
        inside the top n is skipped, it is not replaced by the next most frequent word
    :param rare_words_threshold: a word is rare if its corpus frequency is less than
        or equal to this threshold
    :param replace_rare: if exactly ``True`` replace rare words with "UNK",
        otherwise remove rare words
    :return: a list of list of lemmas (as ``str``) as a corpus
    """
    print("Start filtering")
    words_to_filter = [] if words_to_filter is None else list(words_to_filter)
    try:
        # set membership instead of scanning a list once per vocabulary entry
        keep = set(() if words_to_keep is None else words_to_keep)
    except TypeError:
        # unhashable entries (e.g. nested lists): fall back to the list semantics
        keep = list(words_to_keep)

    # remove any token that contains only digits and punctuation
    text_corpus = [[word for word in doc if any(ch.isalpha() for ch in word)] for doc in text_corpus]
    counts = Counter(word for doc in text_corpus for word in doc)

    common_words = [word for word, _ in counts.most_common(n_most_common) if word not in keep]
    rare_words = {word for word, freq in counts.items()
                  if freq <= rare_words_threshold and word not in keep}

    # "" is always filtered, exactly like the common and user supplied words
    filtered = set(common_words) | set(words_to_filter) | {""}
    if replace_rare is True:
        # rare words are replaced first; only afterwards are common/stop words dropped
        text_corpus = [['UNK' if word in rare_words else word for word in doc] for doc in text_corpus]
    else:
        filtered |= rare_words

    texts = [[str(word) for word in doc if word not in filtered] for doc in text_corpus]
    print("Done filtering")
    return texts

def text_processing(additional_stop_words=None, words_to_keep=None, n_most_common=100,
                    rare_words_threshold=5, replace_rare=False, gram='bigram', bigram=10):
    """
    - load the already lemmatized documents from data/10k_lemmatized.pickle
    - remove stop words, common words and rare words
    - get n_gram tokens
    :param additional_stop_words: tailor-made words to remove for the corpus
    :type additional_stop_words: list of str
    :param words_to_keep: words protected from the common/rare filters of filter_corpus()
    :type words_to_keep: list of str
    :param n_most_common,rare_words_threshold,replace_rare: same with function filter_corpus()
    :param gram: 'unigram', 'bigram', 'trigram', phrases length using gensim
    :type gram: str
    :param bigram: threshold for gensim Phrases; only used when gram == 'bigram'
        (the 'trigram' path calls Phrases with min_count=1 and no explicit threshold)
    :type bigram: int
    :return: clean tokenized documents
    :rtype: list of list
    :raises ValueError: if ``gram`` is not one of the supported values
    """
    # Fail before the expensive load instead of silently returning None at the end.
    if gram not in ('unigram', 'bigram', 'trigram'):
        raise ValueError("gram must be 'unigram', 'bigram' or 'trigram', got {!r}".format(gram))
    additional_stop_words = [] if additional_stop_words is None else list(additional_stop_words)
    words_to_keep = [] if words_to_keep is None else list(words_to_keep)

    # The lemmatization step is kept for reference only; it is not run here:
    #     print('Loading raw text data')
    #     sec_10k = pd.read_pickle('data/10k_raw.pickle')
    #     raw_documents = sec_10k['mda_text'].tolist()
    #     print('Lemmatizating')
    #     lemma_documents = [get_lemma_textblob(document) for document in raw_documents]
    #     pickle.dump(lemma_documents, file=open("data/lemma_documents.pickle", 'wb'))
    # NOTE(review): that snippet wrote data/lemma_documents.pickle, while this
    # function reads data/10k_lemmatized.pickle — confirm they are the same artifact.
    print('Loading lemmatized documents')
    # SECURITY: pickle.load executes arbitrary code; only load files you produced yourself.
    with open("data/10k_lemmatized.pickle", 'rb') as pickle_file:
        lemma_documents = pickle.load(pickle_file)

    print('Cleaninng')
    stop_words = stopwords.words('english') + additional_stop_words
    clean_documents = filter_corpus(lemma_documents, words_to_keep, stop_words, n_most_common,
                                    rare_words_threshold, replace_rare)

    print('N_gram')
    if gram == 'unigram':
        return clean_documents
    if gram == 'bigram':
        bigram_transformer = gensim.models.Phrases(clean_documents, min_count=10, threshold=bigram)
        return list(bigram_transformer[clean_documents])
    # gram == 'trigram': apply Phrases twice, the second pass over the bigram output
    bigram_transformer = gensim.models.Phrases(clean_documents, min_count=1)
    bigram_documents = list(bigram_transformer[clean_documents])
    trigram_transformer = gensim.models.Phrases(bigram_documents, min_count=1)
    return list(trigram_transformer[bigram_documents])


def similar_words(word2vec_model, dimension, n=5):
    """
    Note: not used
    Given a dimension, look up its entries in the module-level ``seed_words`` and
    collect the most similar words from the word2vec model. An entry may be a
    single seed word or a list of seed words that is queried jointly.

    Seed words missing from the model vocabulary are skipped; a seed list with no
    in-vocabulary word contributes nothing. Any other error raised by the model
    now propagates instead of being silently swallowed.

    :param word2vec_model: gensim word2vec model (must expose ``vocab``,
        ``most_similar`` and ``similar_by_word``)
    :param dimension: key into ``seed_words``
    :type dimension: str
    :param n: the most n similar words per seed entry
    :type n: int
    :rtype: list of (similar_word, similarity) tuples
    """
    vocab = word2vec_model.vocab
    matches = []
    for seed in seed_words[dimension]:
        if isinstance(seed, list):
            # make sure every word in the seed word list is in the word2vec corpus
            in_vocab = [item for item in seed if item in vocab]
            if not in_vocab:
                continue  # none of the words in this seed list are in the corpus
            matches.extend(word2vec_model.most_similar(in_vocab, topn=n))
        elif seed in vocab:
            matches.extend(word2vec_model.similar_by_word(seed, topn=n))
        # else: seed word not found in corpus -> skipped
    return matches


def train_word2vec_model(documents, fname, min_count=5, size=100, window=5, workers=16):
    """
    Train a gensim Word2Vec model on the documents, save it and return it.
    :param documents: tokenized documents passed to ``gensim.models.Word2Vec``
    :param fname: path handed to ``model.save``
    :param min_count,size,window,workers: forwarded unchanged to ``Word2Vec``
    :return: the trained model, after ``init_sims(replace=True)`` has been called on it
    """
    print('Building word2vec model')
    hyperparameters = dict(min_count=min_count, size=size, window=window, workers=workers)
    w2v = gensim.models.Word2Vec(documents, **hyperparameters)
    # The model is written to disk before init_sims(replace=True) is applied.
    w2v.save(fname)
    w2v.init_sims(replace=True)
    return w2v


def expand_words(word2vec_model, n=50, restrict=None):
    """
    Expand every dimension of the module-level ``seed_words`` with the words most
    similar to its in-vocabulary seed words (queried jointly per dimension).

    :param word2vec_model: gensim word2vec model (must expose ``vocab`` and ``most_similar``)
    :param n: number of similar words requested per dimension (``topn``)
    :type n: int
    :param restrict: fraction of the vocabulary size; ``int(len(vocab) * restrict)`` is
        passed to ``most_similar`` as ``restrict_vocab``. ``None`` means no restriction.
    :type restrict: float or None
    :return: mapping dimension -> list of similar words (similarity scores dropped).
        A dimension with no seed word in the vocabulary maps to an empty list
        instead of making the whole expansion fail.
    :rtype: dict of str -> list of str
    """
    vocab = word2vec_model.vocab
    restrict_vocab = None if restrict is None else int(len(vocab) * restrict)
    expanded_words = {}
    for dimension, candidates in seed_words.items():
        in_vocab_seeds = [word for word in candidates if word in vocab]
        if not in_vocab_seeds:
            # most_similar cannot be queried without any input word
            expanded_words[dimension] = []
            continue
        neighbours = word2vec_model.most_similar(in_vocab_seeds, topn=n, restrict_vocab=restrict_vocab)
        expanded_words[dimension] = [pair[0] for pair in neighbours]
    return expanded_words
                

def output_result(expanded_words, seed_words, n, fname):
    """
    Rank the expanded words of every dimension by score, keep at most ``n`` of them,
    write a text report to ``fname`` and return the ranked words.
    :param expanded_words: mapping dimension -> iterable of (word, score) tuples;
        every key must also be a key of ``seed_words``
    :param seed_words: mapping whose keys define the reported dimensions (values unused)
    :param n: maximum number of words kept per dimension
    :param fname: path of the report file (overwritten)
    :return: mapping dimension -> list of words, highest score first
    """
    divider = "==========================================================================="
    result = {dimension: [] for dimension in seed_words}
    for dimension, scored_words in expanded_words.items():
        result[dimension].extend(scored_words)
        result[dimension].sort(key=lambda pair: pair[1], reverse=True)
    with open(fname, 'w') as text_file:
        for dimension in result:
            words = [pair[0] for pair in result[dimension]]
            if len(words) >= n:
                words = words[0:n]
                header = "The dimension is: {} {} words".format(dimension, n)
            else:
                header = "The dimension is: {}. Less than {}words, output {}.".format(dimension, n, len(words))
            result[dimension] = words
            # header, divider, the word list, then two blank lines
            print(header, divider, words, '\n', sep='\n', file=text_file)
    return result

def output_dimension_result(expanded_words, n, fname):
    """
    Write one report section per dimension to ``fname`` (overwritten).
    :param expanded_words: mapping dimension -> expanded words; each value is written via ``str()``
    :param n: number shown in each section header (the values are not truncated to it)
    :param fname: path of the report file
    """
    divider = "==========================================================================="
    with open(fname, 'w') as text_file:
        for dimension, words in expanded_words.items():
            header = "The dimension is: {} {} expanded words".format(dimension, n)
            # header, divider, the words, then two blank lines
            text_file.write("\n".join([header, divider, str(words), "\n"]) + "\n")
    
            

Construct the original seed words from the six culture dimensions: for each dimension, keep only the words with positive loadings greater than 0.4, then refine the list using our own judgement.

In [3]:
# Seed phrases for six culture dimensions, keyed by dimension name.
# Multi-word phrases are joined with underscores.
# NOTE(review): this dict is not referenced by any other cell visible in this notebook
# (the helpers read `seed_words`) — confirm whether it is still needed.
seed_words_oreilly = {"adaptability":
["being_innovative", "be_innovative", "risk_taking", "risk_taken","take_risk", "being_willing_to_experiment", "fast_moving",
"being_quick_to_take_advantage_of_opportunities", "not_being_constrained_by_many_rules", "adaptability"],
"integrity":
["having_integrity", 'have_integrity', 'be_honest', "having_high_ethical_standards", "being_honest", "respecting_individuals",
"being_fair", 'be_fair', 'be_supportive'],
"collaborative":["working_in_collaboration_with_others", "being_team_oriented", "cooperative", "being_supportive",
"avoiding_conflict", 'be_supportive', 'avoid_conflict'],
"results_oriented":['results_oriented', "being_results_oriented", "having_high_expectations_for_performance", "achievement_oriented"],
"customer_oriented":['customer_oriented', "being_customer_oriented", "listening_to_customers", "being_market_driven", 'market_driven'],
"detail_oriented":["paying_attention_to_detail", "emphasizing_quality", 'emphasize_quality', "being_precise", 'be_precise', 'detail', 'precise', 
                  'detail_oriented']}

In [2]:
# Seed words per culture dimension. This is the module-level dict read by
# similar_words() and expand_words(), and the source of `words_to_keep`.
# Multi-word entries are joined with underscores.
seed_words = {"integrity":["integrity", "ethics", "accountability", "trust", "honesty", 
                           "responsibility", "fairness", "transparency", 
                           "ownership", "fair", "honest", 
                           "ethical", "transparent"],
"teamwork":["teamwork", "collaboration", "cooperation", "collaborative", "cooperative"],
"innovation":["innovation", "creativity", "excellence", "improvement", "passion", 
              "pride", "leadership", "growth", "performance", "efficiency", 
              "efficient", "results", "result", 'innovative', 'creative'],
"respect":["respect", "diversity", "inclusion", "development", 
           "talent", "employees", "employee", 
           "dignity", "empowerment"],
"quality":["quality", "customer", "meet_needs", 'meet_need', 
           "commitment", "make_a_difference", "dedication", 
           "value", "exceed_expectations", 'exceed_expectation'],
"safety":["safety", "health", "healthy", "work_life_balance", "flexibility"],
"community":["community", "environment", "caring", "citizenship"],
"communication":["communication", "openness"],
"hard_work":["hard_work", "reward", "fun", "energy"]}

# Underscore-joined two-word phrases, stored under the single key "bigrams".
# NOTE(review): this dict is not referenced by any other cell visible in this notebook,
# and its name matches the `bigram` parameter of text_processing() — confirm it is still needed.
bigram = {"bigrams":['being_innovative', 'be_innovative', 'risk_taking', 'risk_taken', 'take_risk', 'fast_moving', 'move_fast', 
          'have_integrity', 'having_integrity', 'be_honest', 'being_honest', 'respecting_individuals', 'respecting_individual', 
          'respect_individuals', 'respect_individual', 'being_fair', 'be_fair', 'be_supportive', 'avoiding_conflict', 
          'avoid_conflict', 'results_oriented', 'result_oriented', 'achievement_oriented', 'customer_oriented', 
          'market_driven', 'detail_oriented', 'emphasizing_quality', 'emphasize_quality', 'being_precise', 
          'be_precise']}


In [3]:
# Every string seed word, flattened across dimensions, is protected from the
# common/rare filters.
words_to_keep = [
    seed
    for seed_list in seed_words.values()
    for seed in seed_list
    if isinstance(seed, str)
]
# Corpus-specific tokens passed to text_processing() as additional stop words.
# NOTE(review): 'openness' is also a seed word (seed_words["communication"]); stop words
# are filtered regardless of words_to_keep, so it is removed from the corpus — confirm intended.
words_to_remove = [
    'rx', 'chmp', 'spse', '-results', 'cia', 'openness', 'and/or', 'apb', 'cpt',
    'qt', 'cte', 'mkg', 'nhs', '1350', 'rep', 'iplex', 'hap-tm-', 'hap',
]

In [6]:
# before adding words_to_keep result
# documents = text_processing()
# expanded_words = expand_seed_words(documents=documents)
# for k, v in expanded_words.items():
#     print('Culture dimension: %s' %(k))
#     print('=========================================================================================================')
#     print(v)

In [16]:
# %%time
# # after adding words_to_keep unigram result
# documents = text_processing(additional_stop_words=words_to_remove, words_to_keep=words_to_keep)
# expanded_words_unigram = expand_seed_words(documents=documents, fname='data/10k_unigram')

# pickle.dump(expanded_words_unigram, file=open("data/10k_expanded_unigram.pickle", 'wb'))
# del documents

In [None]:
%%time
# after adding words_to_keep bigram result phrase threshold=10
documents = text_processing(additional_stop_words=words_to_remove, words_to_keep=words_to_keep, n_most_common=100, 
                            rare_words_threshold=5, replace_rare=False, gram='bigram', bigram=20)
model = train_word2vec_model(documents=documents, fname='model/10k_bigram_10')

expanded_words_100 = expand_words(word2vec_model=model, n=50, restrict=None)
expanded_words_80 = expand_words(word2vec_model=model, n=50, restrict=0.8)
expanded_words_50 = expand_words(word2vec_model=model, n=50, restrict=0.5)
expanded_words_10 = expand_words(word2vec_model=model, n=50, restrict=0.1)

output_dimension_result(expanded_words_100, fname='data/outputs/10k_expanded_words_100.txt', n=50)
output_dimension_result(expanded_words_80, fname='data/outputs/10k_expanded_words_80.txt', n=50)
output_dimension_result(expanded_words_50, fname='data/outputs/10k_expanded_words_50.txt', n=50)
output_dimension_result(expanded_words_10, fname='data/outputs/10k_expanded_words_10.txt', n=50)

#del documents

Loading lemmatized documents


In [13]:
# Pickle the expansion computed with restrict=0.5: a dict mapping each dimension
# to its list of expanded words.
with open("data/model/10k_w2v.pickle", "wb") as pickle_out:
    half_vocab_expansion = expand_words(word2vec_model=model, n=50, restrict=0.5)
    pickle.dump(half_vocab_expansion, pickle_out)

In [9]:
# Display the expansion obtained with restrict=0.5 (dimension -> expanded words).
expanded_words_50

{'communication': ['telecommunication',
  'connectivity',
  'networking',
  'wireless',
  'network',
  'mobile',
  'telephone',
  'internet',
  'wireless_wireline',
  'wired_wireless',
  'video_conferencing',
  'voice',
  'broadband',
  'piv',
  'machine-to-machine',
  'infrastructure',
  'high-speed',
  'electronic',
  'datacenter',
  'gps_tracking',
  'terrestrial-based',
  'hailing',
  'voice_message',
  'networked',
  'messaging',
  'message',
  'telephony',
  'digital',
  'instant_messaging',
  'encrypt',
  'satellite-based_communication',
  'intelligence',
  'telemetry',
  'wireline_wireless',
  'automation',
  'computing',
  'wirelessly',
  'server_router',
  'notebook_webcam',
  'router_switch',
  'chat',
  'solution',
  'intranet',
  'interface',
  'communications',
  'server',
  'telecom',
  'greeting',
  'telepresence',
  'in-building_wireless'],
 'community': ['foster',
  'well-being',
  'environmental_stewardship',
  'culture',
  'nonprofit_organization',
  'low-_moderate-

In [75]:
# Inspect the `count` stored on the vocab entry of the phrase token 'coffee_coffee'
# (presumably its corpus frequency — see gensim's vocab documentation).
model.vocab['coffee_coffee'].count

117

In [76]:
# Inspect the `index` stored on the same vocab entry
# (presumably its position in the model's vocabulary ordering — confirm against gensim docs).
model.vocab['coffee_coffee'].index

35890

In [40]:
# Histogram of the vocabulary: `count` value -> number of vocab entries having that count.
frequency = {}
for vocab_entry in model.vocab.values():
    frequency[vocab_entry.count] = frequency.get(vocab_entry.count, 0) + 1

In [42]:
# Number of vocab entries whose count is between 1 and 6 (inclusive).
sum(frequency[count] for count in range(1, 7))

209958

In [72]:
# Per dimension, the words that appear in only one of the two expansions:
# unrestricted vocabulary (expanded_words_100) vs. restrict=0.1 (expanded_words_10).
symmetric_difference = {
    dimension: set(expanded_words_100[dimension]).symmetric_difference(expanded_words_10[dimension])
    for dimension in seed_words
}
#output_dimension_result(symmetric_difference, fname='data/10k_expanded_symmetric_difference.txt', n=50)
symmetric_difference

{'communication': {'archive',
  'automation',
  'automotive_infotainment',
  'backplane_optical',
  'carrier-class',
  'communications',
  'cybersecurity',
  'e-mail_instant',
  'exterior_module',
  'gogo_biz',
  'gps_tracking',
  'high-speed_broadband',
  'internet_connectivity',
  'machine-to-machine',
  'message',
  'messaging',
  'notebook_webcam',
  'router_avionics',
  'satellite-based',
  'shenzhen_tianyin',
  'switch',
  'telecom_datacom',
  'telemetry',
  'voice_messaging',
  'web-enabled_phone',
  'wireline_wireless'},
 'community': {'addiction_on-line',
  'advocacy',
  'affordability',
  'animal_welfare',
  'at-risk_youth',
  'attitude_toward',
  'backdrop',
  'clientele',
  'community_low-',
  'counseling',
  'developmental_disability',
  'diagnose_hiv',
  'falls_neighboring',
  'for-profit',
  'foundation_non-profit',
  'gaming_sensory',
  'gynecologic',
  'harassment_discrimination',
  'health',
  'healthcare-related',
  'homeownership',
  'housing',
  'importantly',
  'i