In [1]:
import pandas as pd
import numpy as np
import sys, time
sys.path.append("../") # go to parent dir
from src.data.diversity_document import DiversityDocument

from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.word2vec import Word2Vec
import gensim.utils
import itertools
import pyLDAvis.gensim
from IPython.display import clear_output

df = pd.read_csv('../data/external/2017-18stats.csv')
with open('../data/external/download_whitelist.txt', 'r') as f:
    whitelist = [line.strip() for line in f]
df['CompanyLinkToGPGInfo'] = df['CompanyLinkToGPGInfo'].astype(str)

df.describe()


Unnamed: 0,DiffMeanHourlyPercent,DiffMedianHourlyPercent,DiffMeanBonusPercent,DiffMedianBonusPercent,MaleBonusPercent,FemaleBonusPercent,MaleLowerQuartile,FemaleLowerQuartile,MaleLowerMiddleQuartile,FemaleLowerMiddleQuartile,MaleUpperMiddleQuartile,FemaleUpperMiddleQuartile,MaleTopQuartile,FemaleTopQuartile
count,10159.0,10159.0,10159.0,10159.0,10159.0,10159.0,10159.0,10159.0,10159.0,10159.0,10159.0,10159.0,10159.0,10159.0
mean,14.470046,11.963943,15.259415,-13.79243,35.844,34.328861,46.427434,53.572566,50.636047,49.363953,54.995196,45.004804,60.959228,39.040772
std,15.087346,15.687637,242.144758,1258.967565,36.398861,36.102908,24.114317,24.114317,26.063711,26.063711,26.172304,26.172304,24.31617,24.31617
min,-400.0,-164.0,-21092.0,-120695.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.3,0.9,0.0,0.0,1.0,0.4,27.0,35.55,29.6,27.5,34.0,21.2,41.8,17.05
50%,13.5,9.6,22.5,5.7,20.5,18.0,44.4,55.6,48.5,51.5,54.0,46.0,63.0,37.0
75%,23.0,21.1,53.0,38.3,73.8,70.9,64.45,73.0,72.5,70.4,78.8,66.0,82.95,58.2
max,159.0,75.7,3848.2,5566.7,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [2]:
#split the corpus into quartiles by the "best" companies on diversity numbers. 
upper_q = df.DiffMeanHourlyPercent.quantile(0.80)
lower_q = df.DiffMeanHourlyPercent.quantile(0.20)
df_bad = df.loc[df.DiffMeanHourlyPercent > upper_q]
df_good = df.loc[df.DiffMeanHourlyPercent < lower_q]
print('Number of companies in worst quarter: %s' % df_bad.DiffMeanHourlyPercent.count())
print('Number of companies in best quarter: %s' % df_good.DiffMeanHourlyPercent.count())

Number of companies in worst quarter: 2026
Number of companies in best quarter: 2029


In [3]:
def iter_diversity_data(df_divers):
    number_rows = df_divers.shape[0]
    counter = 0
    for index, row in df_divers.iterrows():
        counter = counter + 1
        percent_complete = (counter / number_rows) * 100
    
        if counter % 100 == 0:
            #clear_output(wait=True)
            print('%.2f percent complete' % (percent_complete))        

        doc = DiversityDocument(row)

        if doc.url_hash in whitelist:
            continue

        if not doc.has_downloaded_file and not doc.company_link == 'nan':
            doc.download()
            if doc.raw_text is None:
                with open('../data/external/download_whitelist.txt', 'a') as whitelist_file:
                    whitelist_file.write(doc.url_hash + '\n')    

        if doc.raw_text is not None and doc.clean_text is not None:
            yield doc.clean_text

    print('Processing Finished: 100%')        
    

In [4]:
class CorpusDiversity(object):
    def __init__(self, df):
        self.df = df
        self.dictionary = Dictionary(iter_diversity_data(df))

    def __iter__(self):
        for doc in iter_diversity_data(self.df):
            # tokenize each message; simply lowercase & match alphabetic chars, for now
            yield self.dictionary.doc2bow(doc)
           
#    def __len__(self):
#        return len(self.dictionary)
   
class CorpusSentanceDiversity(object):
    def __init__(self, df):
        self.df = df
        self.dictionary = Dictionary(iter_diversity_data(df))

    def __iter__(self):
        for doc in iter_diversity_data(self.df):
            # tokenize each message; simply lowercase & match alphabetic chars, for now
            yield doc    

bad_tokenized_corpus = CorpusDiversity(df_bad)
good_tokenized_corpus = CorpusDiversity(df_good)

bad_sentance_corpus = CorpusSentanceDiversity(df_bad)
good_sentance_corpus = CorpusSentanceDiversity(df_good)

4.94 percent complete
Downloading (https://www.axa-im.com/en/empowering-female-talent)....
9.87 percent complete
14.81 percent complete
19.74 percent complete
24.68 percent complete
29.62 percent complete
34.55 percent complete
39.49 percent complete
Downloading (https://wolffkran.com)....
44.42 percent complete
49.36 percent complete
54.29 percent complete
Downloading (https://www.mizuho-emea.com/~/media/files/citizenship/gender-pay-gap-report.ashx?la=en)....
59.23 percent complete
64.17 percent complete
69.10 percent complete
74.04 percent complete
78.97 percent complete
83.91 percent complete
88.85 percent complete
93.78 percent complete
98.72 percent complete
Processing Finished: 100%
Downloading (https://www.adecco.co.uk/gender-pay-gap)....
4.93 percent complete
9.86 percent complete
14.79 percent complete
19.71 percent complete
24.64 percent complete
29.57 percent complete
34.50 percent complete
39.43 percent complete
44.36 percent complete
Downloading (https://uk.parkindigo.com/

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [5]:
bad_corpus = bad_tokenized_corpus
good_corpus = good_tokenized_corpus

# Add bigrams and trigrams to docs (only ones that appear 5 times or more).
#bigram = Phrases(docs, min_count=5)
#for idx in range(len(docs)):
#    for token in bigram[docs[idx]]:
#        if '_' in token:
            # Token is a bigram, add to document.
#            docs[idx].append(token)

# Create a dictionary representation of the documents, and filter out frequent and rare words.
bad_dictionary = bad_tokenized_corpus.dictionary
good_dictionary = good_tokenized_corpus.dictionary

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
max_freq = 0.5
min_wordcount = 5
bad_dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)
bad_dictionary.compactify()
good_dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)
good_dictionary.compactify()

# This sort of "initializes" dictionary.id2token.
_ = bad_dictionary[0]
_ = good_dictionary[0]


In [6]:
print('Number of unique good tokens: %d' % len(good_dictionary))
print('Number of unique bad tokens: %d' % len(bad_dictionary))

Number of unique good tokens: 7371
Number of unique bad tokens: 9175


In [7]:
bad_lda_model = LdaModel(corpus=bad_corpus,
               id2word=bad_dictionary,
               num_topics=12, 
               random_state=100,
               update_every=1,
               chunksize=100,
               passes=10,
               alpha='auto',
               per_word_topics=True)

good_lda_model = LdaModel(corpus=good_corpus,
               id2word=good_dictionary,
               num_topics=12, 
               random_state=100,
               update_every=1,
               chunksize=100,
               passes=10,
               alpha='auto',
               per_word_topics=True)

4.94 percent complete
Downloading (https://www.axa-im.com/en/empowering-female-talent)....
9.87 percent complete
14.81 percent complete
19.74 percent complete
24.68 percent complete
29.62 percent complete
34.55 percent complete
39.49 percent complete
Downloading (https://wolffkran.com)....
44.42 percent complete
49.36 percent complete
54.29 percent complete
Downloading (https://www.mizuho-emea.com/~/media/files/citizenship/gender-pay-gap-report.ashx?la=en)....
59.23 percent complete
64.17 percent complete
69.10 percent complete
74.04 percent complete
78.97 percent complete
83.91 percent complete
88.85 percent complete
93.78 percent complete
98.72 percent complete
Processing Finished: 100%
4.94 percent complete
Downloading (https://www.axa-im.com/en/empowering-female-talent)....
9.87 percent complete
14.81 percent complete
19.74 percent complete
24.68 percent complete
29.62 percent complete
34.55 percent complete
39.49 percent complete
Downloading (https://wolffkran.com)....
44.42 perce

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [8]:
# Compute Perplexity
bad_lda_corpus = [x for x in bad_corpus]
good_lda_corpus = [x for x in good_corpus]
# a measure of how good the model is. lower the better. -7.670575002821739
print('\nGood Perplexity: ', bad_lda_model.log_perplexity(bad_lda_corpus)) 
print('\nBad Perplexity: ', good_lda_model.log_perplexity(good_lda_corpus)) 

4.94 percent complete
Downloading (https://www.axa-im.com/en/empowering-female-talent)....
9.87 percent complete
14.81 percent complete
19.74 percent complete
24.68 percent complete
29.62 percent complete
34.55 percent complete
39.49 percent complete
Downloading (https://wolffkran.com)....
44.42 percent complete
49.36 percent complete
54.29 percent complete
Downloading (https://www.mizuho-emea.com/~/media/files/citizenship/gender-pay-gap-report.ashx?la=en)....
59.23 percent complete
64.17 percent complete
69.10 percent complete
74.04 percent complete
78.97 percent complete
83.91 percent complete
88.85 percent complete
93.78 percent complete
98.72 percent complete
Processing Finished: 100%
Downloading (https://www.adecco.co.uk/gender-pay-gap)....
4.93 percent complete
9.86 percent complete
14.79 percent complete
19.71 percent complete
24.64 percent complete
29.57 percent complete
34.50 percent complete
39.43 percent complete
44.36 percent complete
Downloading (https://uk.parkindigo.com/

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [18]:
good_documents = list(good_sentance_corpus) 
bad_documents = list(bad_sentance_corpus) 

good_w2vmodel = Word2Vec (good_documents, size=150, window=10, min_count=2, workers=10)
good_w2vmodel.train(good_documents,total_examples=len(good_documents),epochs=10)

bad_w2vmodel = Word2Vec (bad_documents, size=150, window=10, min_count=2, workers=10)
bad_w2vmodel.train(bad_documents,total_examples=len(bad_documents),epochs=10)

Downloading (https://www.adecco.co.uk/gender-pay-gap)....
4.93 percent complete
9.86 percent complete
14.79 percent complete
19.71 percent complete
24.64 percent complete
29.57 percent complete
34.50 percent complete
39.43 percent complete
44.36 percent complete
Downloading (https://uk.parkindigo.com/en/about-us)....
49.29 percent complete
54.21 percent complete
59.14 percent complete
64.07 percent complete
69.00 percent complete
73.93 percent complete
78.86 percent complete
83.79 percent complete
88.71 percent complete
93.64 percent complete
98.57 percent complete
Processing Finished: 100%
4.94 percent complete
Downloading (https://www.axa-im.com/en/empowering-female-talent)....
9.87 percent complete
14.81 percent complete
19.74 percent complete
24.68 percent complete
29.62 percent complete
34.55 percent complete
39.49 percent complete
Downloading (https://wolffkran.com)....
44.42 percent complete
49.36 percent complete
54.29 percent complete
Downloading (https://www.mizuho-emea.com/~

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(8064278, 10007960)

In [21]:
com = 'diversity'
print(good_w2vmodel.wv.most_similar(positive=com))
print(bad_w2vmodel.wv.most_similar(positive=com))

[('fund', 0.6929117441177368), ('local', 0.6766580939292908), ('partnership', 0.6601231098175049), ('futures', 0.653863787651062), ('communities', 0.646407961845398), ('wellbeing', 0.6353886723518372), ('customised', 0.6330523490905762), ('reinvest', 0.6305705904960632), ('patient', 0.627884566783905), ('county', 0.6169164776802063)]
[('charitable', 0.6746270656585693), ('partnership', 0.6643285155296326), ('serve', 0.6572934985160828), ('local', 0.6504461169242859), ('stakeholder', 0.6310160160064697), ('engagement', 0.6070657968521118), ('active', 0.6024819016456604), ('charity', 0.5930672883987427), ('volunteer', 0.5929437875747681), ('philanthropy', 0.5652163624763489)]


In [11]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(bad_lda_model, bad_lda_corpus, bad_dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [12]:
pyLDAvis.gensim.prepare(good_lda_model, good_lda_corpus, good_dictionary)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
