# Preprocess the dataset

In [None]:
import sys
import os
import pprint
import matplotlib.pyplot as plt
import nltk
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
import pandas as pd
import cleantext
import scipy
import gensim
from collections import defaultdict
from nltk.stem import PorterStemmer 
import mittens
import tensorflow as tf
from tensorflow import bert

In [None]:
import genderdecoder

In [None]:
# Load the job-postings table from CSV. Its "Job Description" column is the
# text analysed below. NOTE: the name `corpus` is rebound to a list of token
# lists further down, in the co-occurrence section.
corpus = pd.read_csv('DataAnalyst.csv')

In [None]:
# Raw job-description text, one entry per row of the loaded table.
files = corpus["Job Description"]


def f(e):
    """Return ``e`` normalised by ``cleantext.clean`` with a fixed option set.

    Args:
        e: one job-description value from the ``files`` column.

    Returns:
        Whatever ``cleantext.clean`` returns for ``e`` under the options below.
    """
    options = dict(
        fix_unicode=True,                     # fix various unicode errors
        to_ascii=True,                        # transliterate to closest ASCII representation
        lower=True,                           # lowercase text
        no_line_breaks=True,                  # fully strip line breaks instead of normalising them
        no_urls=True,                         # replace all URLs with a special token
        no_emails=True,                       # replace all email addresses with a special token
        no_phone_numbers=True,                # replace all phone numbers with a special token
        no_numbers=True,                      # replace all numbers with a special token
        no_digits=True,                       # replace all digits with a special token
        no_currency_symbols=True,             # replace all currency symbols with a special token
        no_punct=True,                        # remove punctuation
        replace_with_punct="",                # replacement used for removed punctuation
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en",                            # set to 'de' for German special handling
    )
    return cleantext.clean(e, **options)


# Cleaned description per posting; despite the name, each entry is a whole text.
words = files.map(f)
#files.map(str.split)


In [None]:
# Spot-check the scorer on the cleaned description stored under index label 0.
genderdecoder.assess(words[0])

# Dataset Plot

In [None]:
# Count how many cleaned descriptions receive each genderdecoder 'result' label.
result_counts = defaultdict(int)
for description in words:
    result_counts[genderdecoder.assess(description)['result']] += 1

# Publish the five totals under the names the following cells read.
# Any label other than these five is left unreported.
mas_stats = result_counts['masculine-coded']
s_mas_stats = result_counts['strongly masculine-coded']
fem_stats = result_counts['feminine-coded']
s_fem_stats = result_counts['strongly feminine-coded']
neu_stats = result_counts['neutral']

In [None]:
# Print the five tallies: masculine, strongly masculine, feminine,
# strongly feminine, neutral.
print(mas_stats, s_mas_stats, fem_stats, s_fem_stats, neu_stats)

In [None]:
#pie chart
# One wedge per genderdecoder result label.
labels = (
    'masculine-coded',
    'strongly masculine-coded',
    'feminine-coded',
    'strongly feminine-coded',
    'neutral',
)
# Take the wedge sizes from the tallies computed earlier (same order as
# `labels`) so the chart always reflects the dataset that was actually loaded,
# rather than numbers copied by hand from a previous run.
sizes = [mas_stats, s_mas_stats, fem_stats, s_fem_stats, neu_stats]
# Offset only the first wedge ('masculine-coded') from the centre.
explode = (0.1, 0, 0, 0, 0)

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()

# GloVe vectors: rank words by cosine similarity

In [None]:
#cosine similarity between two words
def cosine_sim(v1, v2):
    """Return the cosine similarity between two vectors.

    ``scipy.spatial.distance.cosine`` computes the cosine *distance*, i.e.
    ``1 - similarity``, so the value is converted back to a similarity here:
    1.0 for vectors pointing the same way, 0.0 for orthogonal vectors and
    -1.0 for opposite vectors.

    Args:
        v1: first 1-D array-like vector.
        v2: second 1-D array-like vector of the same length.

    Returns:
        The cosine similarity as a float.
    """
    # Imported locally so the submodule is loaded even when only the
    # top-level ``import scipy`` has been executed.
    from scipy.spatial.distance import cosine

    return 1.0 - cosine(v1, v2)


In [None]:
# Load the pretrained "glove-wiki-gigaword-200" vectors through gensim's
# downloader module; the result is used below for vocabulary lookups and
# nearest-neighbour queries.
import gensim.downloader as api

wv_from_bin = api.load("glove-wiki-gigaword-200")

In [None]:
# Report how many entries the pretrained vocabulary holds.
print(len(wv_from_bin.vocab))

In [None]:
# Seed word lists taken from genderdecoder; the cells below match vocabulary
# words against them with str.startswith, i.e. they are treated as prefixes.
mas_list = genderdecoder.masculine_coded_words
fem_list = genderdecoder.feminine_coded_words


In [None]:
#compute cosine_sim between all the glove vectors and mas/fem_list words
#order the words based on their similarity to either feminine/masculine words
# For every pretrained-vocabulary word that begins with a feminine-coded seed,
# look up its 10 most similar words and add each one's similarity to a running
# total (a sum, not an average). Neighbours that themselves begin with any
# feminine seed are skipped so the seeds do not rank themselves.
femwords = fem_list
fem_prefixes = tuple(femwords)  # str.startswith accepts a tuple of candidates
fem_results = dict.fromkeys(wv_from_bin.vocab, 0)
for femword in femwords:
    for vocab_word in wv_from_bin.vocab:
        if not vocab_word.startswith(femword):
            continue
        for neighbour, similarity in wv_from_bin.similar_by_word(vocab_word, 10):
            if neighbour.startswith(fem_prefixes):
                continue
            fem_results[neighbour] += similarity

# Rank every vocabulary word by accumulated similarity, largest first.
total_similarity = sorted(fem_results.items(), key=lambda item: item[1], reverse=True)


In [None]:
# Most similar words
# Display the first 50 entries of the ranking (highest accumulated similarity).
total_similarity[:50]



In [None]:
# Pool the feminine-side scores of words that share a Porter stem, then show
# the ten stems with the largest pooled score.
stemmer = PorterStemmer()
stemmed_new = defaultdict(int)
for token, total in fem_results.items():
    stemmed_new[stemmer.stem(token)] += total
total_similarity_per_stem = sorted(
    stemmed_new.items(), key=lambda item: item[1], reverse=True
)
total_similarity_per_stem[:10]

In [None]:
# Masculine counterpart of the feminine ranking: for every pretrained-vocabulary
# word that begins with a masculine-coded seed, add the similarity of each of
# its 10 nearest neighbours to that neighbour's running total. Neighbours that
# themselves begin with any masculine seed are skipped.
maswords = mas_list
mas_prefixes = tuple(maswords)  # str.startswith accepts a tuple of candidates
mas_results = dict.fromkeys(wv_from_bin.vocab, 0)
for masword in maswords:
    for vocab_word in wv_from_bin.vocab:
        if not vocab_word.startswith(masword):
            continue
        for neighbour, similarity in wv_from_bin.similar_by_word(vocab_word, 10):
            if neighbour.startswith(mas_prefixes):
                continue
            mas_results[neighbour] += similarity

# Rank by accumulated similarity, largest first, and show the first 50.
total_similarity = sorted(mas_results.items(), key=lambda item: item[1], reverse=True)
total_similarity[:50]

In [None]:
# Pool the masculine-side scores of words that share a Porter stem, then show
# the ten stems with the largest pooled score.
stemmer = PorterStemmer()
stemmed_new = defaultdict(int)
for token, total in mas_results.items():
    stemmed_new[stemmer.stem(token)] += total
total_similarity_per_stem = sorted(
    stemmed_new.items(), key=lambda item: item[1], reverse=True
)
total_similarity_per_stem[:10]

# Baseline: train GloVe vectors on the dataset

In [None]:
def distinct_words(words):
    """Return the sorted vocabulary of a corpus together with its size.

    The vocabulary is built from the ``words`` argument itself rather than
    from any module-level variable.

    Args:
        words: iterable of documents. A document is normally a sequence of
            string tokens; a plain string is split on whitespace first, so
            raw text yields words rather than single characters.

    Returns:
        A ``(corpus_words, num_corpus_words)`` tuple: the alphabetically
        sorted list of distinct tokens and its length.
    """
    vocabulary = set()
    for document in words:
        tokens = document.split() if isinstance(document, str) else document
        vocabulary.update(tokens)

    corpus_words = sorted(vocabulary)
    return corpus_words, len(corpus_words)

In [None]:
def compute_co_occurrence_matrix(corpus, window_size=4):
    """Build a symmetric word-by-word co-occurrence matrix for ``corpus``.

    ``M[a, b]`` is the number of (position of a, position of b) pairs, within
    one sentence, that lie at most ``window_size`` tokens apart. The window
    covers ``window_size`` tokens on *both* sides of the centre word, and each
    ordered pair is counted exactly once, which makes ``M`` symmetric.

    Args:
        corpus: re-iterable collection of tokenised sentences, each a
            sequence of string tokens.
        window_size: number of neighbours considered on each side.

    Returns:
        A ``(M, word2ind)`` tuple: the ``(V, V)`` float matrix of counts and
        the dict mapping each distinct token to its row/column index.
    """
    # The vocabulary must come from the corpus being counted, so pass the
    # function's own argument rather than an unrelated module-level variable.
    vocabulary, num_words = distinct_words(corpus)
    word2ind = {token: index for index, token in enumerate(vocabulary)}

    M = np.zeros((num_words, num_words))
    for sentence in corpus:
        indices = [word2ind[token] for token in sentence]
        length = len(indices)
        for i, row in enumerate(indices):
            start = max(i - window_size, 0)
            # +1 because range() excludes its upper bound; without it the
            # right-hand side of the window would be one token short.
            stop = min(i + window_size + 1, length)
            for j in range(start, stop):
                if j != i:
                    # The mirrored cell is filled when j is the centre word,
                    # so a single increment here keeps M symmetric.
                    M[row, indices[j]] += 1

    return M, word2ind

In [None]:
# Tokenise each cleaned description on whitespace, keep only tokens present in
# the pretrained vocabulary, then count co-occurrences over the result.
# NOTE: this rebinds `corpus`, which previously held the loaded CSV table.
allowed_words = wv_from_bin.vocab
corpus = [
    [token for token in sentence.split() if token in allowed_words]
    for sentence in words
]
M, word2ind = compute_co_occurrence_matrix(corpus, window_size=4)
print(word2ind)

In [None]:
from mittens import GloVe
# Fit a mittens GloVe model on the co-occurrence matrix M. `embeddings` is
# indexed by row below, one row per vocabulary word.
# n=32 is presumably the embedding dimension and max_iter=100 the training
# iteration cap -- confirm against the mittens documentation.
glove_model = GloVe(n=32, max_iter=100)
embeddings = glove_model.fit(M)


In [None]:
# Write the trained vectors as text, one "<word> <v1> <v2> ..." line per
# vocabulary entry.
# The handle is deliberately not called `f`: that name belongs to the
# text-cleaning function defined earlier and would be overwritten.
with open("gender-coded-glove.txt", "w", encoding="utf-8") as glove_file:
    for embed_word, row in word2ind.items():
        # Look the row up through the stored index instead of assuming the
        # dict's iteration order matches the matrix row order.
        print(embed_word, " ".join(map(str, embeddings[row])), file=glove_file)

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
# Convert the GloVe text file written in the previous cell into the word2vec
# text file that is loaded in the next cell.
glove_input_file = 'gender-coded-glove.txt'
word2vec_output_file = 'gender-coded-w2v.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

In [None]:
from gensim.models import KeyedVectors
# Load the dataset-trained vectors from the converted text file.
# `model` is a second name bound to the same object as `wv_from_dataset`.
wv_from_dataset = model = KeyedVectors.load_word2vec_format('gender-coded-w2v.txt', binary=False)

In [None]:
# Same ranking as for the pretrained vectors, but over the dataset-trained
# ones: for every vocabulary word that begins with a feminine-coded seed, add
# the similarity of each of its 10 nearest neighbours to that neighbour's
# running total. Neighbours that begin with any feminine seed are skipped.
femwords = fem_list
fem_prefixes = tuple(femwords)  # str.startswith accepts a tuple of candidates
fem_results = dict.fromkeys(wv_from_dataset.vocab, 0)
for femword in femwords:
    for vocab_word in wv_from_dataset.vocab:
        if not vocab_word.startswith(femword):
            continue
        for neighbour, similarity in wv_from_dataset.similar_by_word(vocab_word, 10):
            if neighbour.startswith(fem_prefixes):
                continue
            fem_results[neighbour] += similarity

# Rank every vocabulary word by accumulated similarity, largest first.
total_similarity = sorted(fem_results.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Display the complete ranking for the dataset-trained vectors.
total_similarity

In [None]:
# Pool the feminine-side scores of words that share a Porter stem, then show
# the ten stems with the largest pooled score.
stemmer = PorterStemmer()
stemmed_new = defaultdict(int)
# Each value in fem_results is a single accumulated score, not a
# (score, count) pair, so it is unpacked as one number.
for word, score in fem_results.items():
    stemmed_new[stemmer.stem(word)] += score
total_similarity_per_stem = sorted(stemmed_new.items(), key=lambda x: -x[1])
total_similarity_per_stem[:10]

In [None]:
# Masculine ranking over the dataset-trained vectors: for every vocabulary word
# that begins with a masculine-coded seed, add the similarity of each of its 10
# nearest neighbours to that neighbour's running total. Neighbours that begin
# with any masculine seed are skipped.
maswords = mas_list
mas_prefixes = tuple(maswords)  # str.startswith accepts a tuple of candidates
mas_results = dict.fromkeys(wv_from_dataset.vocab, 0)
for masword in maswords:
    for vocab_word in wv_from_dataset.vocab:
        if not vocab_word.startswith(masword):
            continue
        for neighbour, similarity in wv_from_dataset.similar_by_word(vocab_word, 10):
            if neighbour.startswith(mas_prefixes):
                continue
            mas_results[neighbour] += similarity

# Rank every vocabulary word by accumulated similarity, largest first.
total_similarity = sorted(mas_results.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Display the first 50 entries of the ranking (highest accumulated similarity).
total_similarity[:50]

In [None]:
# Pool the masculine-side scores of words that share a Porter stem, then show
# the ten stems with the largest pooled score.
stemmer = PorterStemmer()
stemmed_new = defaultdict(int)
# Each value in mas_results is a single accumulated score, not a
# (score, count) pair, so it is unpacked as one number.
for word, score in mas_results.items():
    stemmed_new[stemmer.stem(word)] += score
total_similarity_per_stem = sorted(stemmed_new.items(), key=lambda x: -x[1])
total_similarity_per_stem[:10]