In [17]:
import pandas as pd
import numpy as np
import sys, time
sys.path.append("../") # go to parent dir
from src.data.diversity_document import DiversityDocument
from src.data.diversity_corpus import CorpusSentenceDiversity
import collections

from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.word2vec import Word2Vec
import gensim.utils
import itertools
import pyLDAvis.gensim
from IPython.display import clear_output

In [2]:
df = pd.read_csv('../data/external/2017-18stats.csv')
with open('../data/external/download_whitelist.txt', 'r') as f:
    whitelist = [line.strip() for line in f]
df['CompanyLinkToGPGInfo'] = df['CompanyLinkToGPGInfo'].astype(str)

In [3]:
#split the corpus into quartiles by the "best" companies on diversity numbers. 
upper_q = df.DiffMeanHourlyPercent.quantile(0.75)
lower_q = df.DiffMeanHourlyPercent.quantile(0.25)
df_bad = df.loc[df.DiffMeanHourlyPercent > upper_q]
df_good = df.loc[df.DiffMeanHourlyPercent < lower_q]
print('Number of companies in worst quarter: %s' % df_bad.DiffMeanHourlyPercent.count())
print('Number of companies in best quarter: %s' % df_good.DiffMeanHourlyPercent.count())

Number of companies in worst quarter: 2513
Number of companies in best quarter: 2537


In [4]:
df_bad.describe()

Unnamed: 0,DiffMeanHourlyPercent,DiffMedianHourlyPercent,DiffMeanBonusPercent,DiffMedianBonusPercent,MaleBonusPercent,FemaleBonusPercent,MaleLowerQuartile,FemaleLowerQuartile,MaleLowerMiddleQuartile,FemaleLowerMiddleQuartile,MaleUpperMiddleQuartile,FemaleUpperMiddleQuartile,MaleTopQuartile,FemaleTopQuartile
count,2513.0,2513.0,2513.0,2513.0,2513.0,2513.0,2513.0,2513.0,2513.0,2513.0,2513.0,2513.0,2513.0,2513.0
mean,32.697971,27.591683,42.495464,4.630362,47.113251,44.185953,39.020414,60.979586,47.175965,52.824035,56.019021,43.980979,67.542897,32.457103
std,9.962001,13.969276,71.987421,682.670466,36.510131,37.197742,22.276897,22.276897,26.303221,26.303221,27.675003,27.675003,24.675538,24.675538
min,23.1,-28.6,-2483.3,-31550.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,26.1,18.0,23.8,0.0,8.1,3.6,21.0,45.0,25.0,32.0,33.2,20.0,50.0,12.0
50%,30.0,27.9,53.0,33.3,46.6,39.5,38.2,61.8,47.0,53.0,58.7,41.3,74.0,26.0
75%,35.7,36.6,69.7,56.7,84.2,82.9,55.0,79.0,68.0,75.0,80.0,66.8,88.0,50.0
max,159.0,75.7,330.8,272.2,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [5]:
df_good.describe()

Unnamed: 0,DiffMeanHourlyPercent,DiffMedianHourlyPercent,DiffMeanBonusPercent,DiffMedianBonusPercent,MaleBonusPercent,FemaleBonusPercent,MaleLowerQuartile,FemaleLowerQuartile,MaleLowerMiddleQuartile,FemaleLowerMiddleQuartile,MaleUpperMiddleQuartile,FemaleUpperMiddleQuartile,MaleTopQuartile,FemaleTopQuartile
count,2537.0,2537.0,2537.0,2537.0,2537.0,2537.0,2537.0,2537.0,2537.0,2537.0,2537.0,2537.0,2537.0,2537.0
mean,-2.122546,-2.381671,-15.042333,-28.310406,26.72972,26.220536,55.149823,44.850177,55.333859,44.666141,54.645723,45.354277,54.408317,45.591683
std,12.541757,11.667925,195.432573,344.729166,33.940866,33.210092,26.207688,26.207688,27.503562,27.503562,27.144905,27.144905,25.641381,25.641381
min,-400.0,-164.0,-5614.0,-8267.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-3.8,-4.1,-5.0,-0.4,0.0,0.0,33.8,21.0,33.0,18.5,32.8,19.6,33.8,22.4
50%,0.2,0.0,0.0,0.0,7.8,8.0,54.7,45.3,52.5,47.5,52.0,48.0,53.9,46.1
75%,2.9,2.0,23.1,12.3,51.8,50.0,79.0,66.2,81.5,67.0,80.4,67.2,77.6,66.2
max,5.2,50.8,431.0,5566.7,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [6]:
bad_sentence_corpus = CorpusSentenceDiversity(df_bad)
good_sentence_corpus = CorpusSentenceDiversity(df_good)

3.98 percent complete
Downloading (https://www.axa-im.com/en/empowering-female-talent)....
7.96 percent complete
11.94 percent complete
15.92 percent complete
19.90 percent complete
23.88 percent complete
27.86 percent complete
31.83 percent complete
35.81 percent complete
39.79 percent complete
Downloading (https://wolffkran.com)....
43.77 percent complete
47.75 percent complete
51.73 percent complete
55.71 percent complete
Downloading (https://www.mizuho-emea.com/~/media/files/citizenship/gender-pay-gap-report.ashx?la=en)....
59.69 percent complete
63.67 percent complete
67.65 percent complete
71.63 percent complete
75.61 percent complete
79.59 percent complete
83.57 percent complete
87.54 percent complete
91.52 percent complete
95.50 percent complete
99.48 percent complete
Processing Finished: 100%
Downloading (https://www.adecco.co.uk/gender-pay-gap)....
3.94 percent complete
7.88 percent complete
11.82 percent complete
15.77 percent complete
19.71 percent complete
23.65 percent co

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


In [42]:
good_documents = list(good_sentence_corpus) 
bad_documents = list(bad_sentence_corpus) 

good_w2vmodel = Word2Vec (good_documents, size=150, window=10, min_count=3, workers=10, seed=100, sg=1)
good_w2vmodel.train(good_documents,total_examples=len(good_documents),epochs=10)

bad_w2vmodel = Word2Vec (bad_documents, size=150, window=10, min_count=3, workers=10, seed=100, sg=1)
bad_w2vmodel.train(bad_documents,total_examples=len(bad_documents),epochs=10)

Downloading (https://www.adecco.co.uk/gender-pay-gap)....
3.94 percent complete
7.88 percent complete
11.82 percent complete
15.77 percent complete
19.71 percent complete
23.65 percent complete
27.59 percent complete
31.53 percent complete
35.47 percent complete
39.42 percent complete
43.36 percent complete
Downloading (https://uk.parkindigo.com/en/about-us)....
47.30 percent complete
51.24 percent complete
55.18 percent complete
59.12 percent complete
63.07 percent complete
67.01 percent complete
70.95 percent complete
74.89 percent complete
78.83 percent complete
82.77 percent complete
86.72 percent complete
90.66 percent complete
94.60 percent complete
98.54 percent complete
Processing Finished: 100%
3.98 percent complete
Downloading (https://www.axa-im.com/en/empowering-female-talent)....
7.96 percent complete
11.94 percent complete
15.92 percent complete
19.90 percent complete
23.88 percent complete
27.86 percent complete
31.83 percent complete
35.81 percent complete
39.79 percent

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


(9265677, 12270810)

In [44]:
com = 'team'

good_words = dict(good_w2vmodel.wv.most_similar(positive=com, topn=10))
bad_words = dict(bad_w2vmodel.wv.most_similar(positive=com, topn=10))
print(good_words)
print(bad_words)

only_good = {x:good_words[x] for x in good_words if x not in bad_words}
only_bad = {x:bad_words[x] for x in bad_words if x not in good_words}

print('==================================================')
print('Only good words')
print(list(only_good.keys()))
print('Only bad words')
print(list(only_bad.keys()))


{'member': 0.6158281564712524, '-PRON-': 0.5957055687904358, 'leaders': 0.5900853872299194, 'couples': 0.5865137577056885, 'bitc': 0.5762864947319031, 'emily': 0.5695035457611084, 'dave': 0.5638670325279236, 'mackenzie': 0.5558559894561768, 'singles': 0.5541415810585022, 'pursuit': 0.549170970916748}
{'freelance': 0.568629801273346, 'member': 0.5617415308952332, 'voices': 0.5556679964065552, 'leadership': 0.5553327798843384, '-PRON-': 0.5509933829307556, 'gill': 0.5439556241035461, 'headteachers': 0.5427749156951904, 'milestones': 0.5256127119064331, 'experis': 0.5174738168716431, 'spokespeople': 0.5173295140266418}
Only good words
['leaders', 'couples', 'bitc', 'emily', 'dave', 'mackenzie', 'singles', 'pursuit']
Only bad words
['freelance', 'voices', 'leadership', 'gill', 'headteachers', 'milestones', 'experis', 'spokespeople']
