In [None]:
import sys
import os
import pprint
import matplotlib.pyplot as plt
import nltk
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
import pandas as pd
from cleantext import clean
import scipy
import gensim
from collections import defaultdict
from nltk.stem import PorterStemmer  
import mittens
import tensorflow as tf
import bert
import genderdecoder
from bert import optimization
from bert import run_classifier
import tensorflow_hub as hub
from datetime import datetime
from sklearn import metrics
# Fetch TensorFlow's logger and stop its records from propagating to ancestor
# loggers (presumably to avoid duplicated log lines in the notebook output).
logger = tf.get_logger()
logger.propagate = False
from sklearn.model_selection import train_test_split
from bert import tokenization
# TF-Hub handle of the BERT module; it is loaded with hub.Module further down
# to obtain the vocabulary file and lower-casing flag for the tokenizer.
bert_model_hub = "https://tfhub.dev/google/small_bert/bert_uncased_L-4_H-256_A-4/1"
# Sequence length handed to the feature writer and used as the token budget in create_examples.
max_seq_len = 128  # this is relatively small and helps keep the compute cost down
# Class names handed to the feature writer. The labelling cell folds the
# 'strongly ...' results of genderdecoder into the first two entries.
label_list = ['masculine-coded', 'feminine-coded', 'neutral']  

In [None]:
# Load the job-posting corpus and pull out the free-text description column.
corpus = pd.read_csv("DataAnalyst.csv")
files = corpus.loc[:, "Job Description"]

def f(e):
    """Normalise one raw job description with the ``cleantext`` cleaner.

    The text is lower-cased, transliterated to ASCII, stripped of line breaks
    and punctuation, and URLs / e-mail addresses / phone numbers / numbers /
    currency symbols are replaced by the placeholder tokens given below.

    Parameters
    ----------
    e : object
        Raw description text; it is handed to the cleaner unchanged.

    Returns
    -------
    The cleaner's return value (the cleaned text).
    """
    # In the clean-text package (whose keyword arguments are used here),
    # `from cleantext import clean` binds the cleaning *function*, so the
    # original `clean.clean(...)` raised AttributeError. Resolve the callable
    # defensively: use `clean.clean` if `clean` happens to be a module-like
    # object exposing it, otherwise call `clean` directly.
    clean_fn = getattr(clean, "clean", clean)
    cleaned = clean_fn(
        e,
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=True,                     # lowercase text
        no_line_breaks=True,            # fully strip line breaks as opposed to only normalizing them
        no_urls=True,                   # replace all URLs with a special token
        no_emails=True,                 # replace all email addresses with a special token
        no_phone_numbers=True,          # replace all phone numbers with a special token
        no_numbers=True,                # replace all numbers with a special token
        no_digits=True,                 # replace all digits with a special token
        no_currency_symbols=True,       # replace all currency symbols with a special token
        no_punct=True,                  # remove punctuations
        replace_with_punct="",          # instead of removing punctuations you may replace them
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en",                      # set to 'de' for German special handling
    )
    return cleaned


# Clean every job description; `words` holds one cleaned text per posting.
words = files.map(f)

In [None]:
# label dataset
# Fold the "strongly ..." verdicts into their plain class; every other verdict
# is stored exactly as genderdecoder reports it.
strong_to_plain = {
    'strongly masculine-coded': 'masculine-coded',
    'strongly feminine-coded': 'feminine-coded',
}

# Keyed by the cleaned text, so identical descriptions share a single entry.
labeled_words = {}
for word in words:
    coding = genderdecoder.assess(word)['result']
    labeled_words[word] = strong_to_plain.get(coding, coding)

print(labeled_words.values())

In [None]:
# Turn the {text: label} mapping into a two-column dataframe.
all_text = [*labeled_words]
all_labels = [*labeled_words.values()]
# Each (text, label) pair becomes one row; the columns are named afterwards.
labeled_data = pd.DataFrame(list(labeled_words.items()))
labeled_data.columns = ['Job Description', 'Label']
print(labeled_data)


In [None]:
all_text = labeled_data['Job Description']
all_labels = labeled_data['Label']
print(f"Total dataset length is {len(all_text)} samples")

# Absolute sizes of the held-out sets; everything else is used for training.
n_test = 40
n_dev  = 200
# Fixed seed so that Restart & Run All reproduces the same train/dev/test
# membership (without it every run shuffles differently and the CSVs written
# below change between runs).
split_seed = 42

# First carve out the test set, then take the dev set from what remains.
train_text, test_text, train_scores, test_scores = train_test_split(
    all_text, all_labels, test_size=n_test, shuffle=True, random_state=split_seed
)
train_text, dev_text, train_scores, dev_scores = train_test_split(
    train_text, train_scores, test_size=n_dev, shuffle=True, random_state=split_seed
)
print(f"Train dataset has length {len(train_text)} samples")
print(f"Dev dataset has length    {len(dev_text)} samples")
print(f"Test dataset has length   {len(test_text)} samples")

# Save dataset splits to disk




In [None]:
def save_split(text, scores, split_name):
    """Write one dataset split to ``Jobposting_<split_name>.csv``.

    ``text`` and ``scores`` are placed side by side as the columns ``Text``
    and ``Score`` (they are combined with ``pd.concat``; this notebook passes
    pandas Series). The frame's index is written too, because ``to_csv`` is
    called with its defaults. Returns ``None``.
    """
    columns = {"Text": text, "Score": scores}
    frame = pd.concat(columns, axis=1)
    target_path = f"Jobposting_{split_name}.csv"
    frame.to_csv(target_path)

# Persist the three splits, in the order train, test, dev.
for _split_name, _split_text, _split_scores in (
    ("train", train_text, train_scores),
    ("test", test_text, test_scores),
    ("dev", dev_text, dev_scores),
):
    save_split(_split_text, _split_scores, _split_name)


In [None]:
def load_split(split_name):
    """Read ``Jobposting_<split_name>.csv`` and return its two data columns.

    Returns a ``(text, scores)`` tuple holding the ``Text`` and ``Score``
    columns of the file, in that order.
    """
    source_path = f"Jobposting_{split_name}.csv"
    frame = pd.read_csv(source_path)
    return frame["Text"], frame["Score"]

# Reload every split from its CSV so the following cells work from the files on disk.
_reloaded = {split: load_split(split) for split in ("train", "test", "dev")}
train_text, train_scores = _reloaded["train"]
test_text, test_scores = _reloaded["test"]
dev_text, dev_scores = _reloaded["dev"]


# Balance the label classes for training

In [None]:
# Count how many training samples carry each label.
count_f = sum(s == "feminine-coded" for s in train_scores)
count_m = sum(s == "masculine-coded" for s in train_scores)
count_n = sum(s == "neutral" for s in train_scores)
print(count_f, count_m, count_n)

# Oversample each class by an integer factor so that it roughly matches the
# largest class. The previous version always used the masculine-coded count as
# the target: a smaller masculine class produced a factor of 0 (silently
# dropping the other classes) and an empty class raised ZeroDivisionError.
# When masculine-coded is the largest class the result is unchanged.
largest_count = max(count_f, count_m, count_n)
repeat_f = largest_count // count_f if count_f else 0
repeat_m = largest_count // count_m if count_m else 0
repeat_n = largest_count // count_n if count_n else 0
repeats = {
    "feminine-coded": repeat_f,
    "masculine-coded": repeat_m,
    "neutral": repeat_n,
}

balanced_train_text = []
balanced_train_scores = []
for t, s in zip(train_text, train_scores):
    # Samples with any other label are left out, as before.
    copies = repeats.get(s, 0)
    balanced_train_text.extend([t] * copies)
    balanced_train_scores.extend([s] * copies)

# tracking numbers of samples in each set
new_count_f = balanced_train_scores.count("feminine-coded")
new_count_m = balanced_train_scores.count("masculine-coded")
new_count_n = balanced_train_scores.count("neutral")
print(new_count_f, new_count_m, new_count_n)
    
    
    



In [None]:
#get the tokenizer
# A separate graph is used only to query the hub module for its tokenizer
# assets (this relies on the TF1-style hub.Module / tf.Session APIs).
with tf.Graph().as_default():
    bert_module = hub.Module(bert_model_hub)
    # The "tokenization_info" signature is read as a dict; its "vocab_file" and
    # "do_lower_case" entries are evaluated in the session below.
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
        # Evaluate both entries in one run to get concrete Python values.
        vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], tokenization_info["do_lower_case"]])      
    # Build the tokenizer from the module's vocabulary and casing setting.
    tokenizer = bert.tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
    
# Example of tokenization
print("\nEXAMPLE:")
print(tokenizer.tokenize("apply to this job with your unreasonable talents"))

In [None]:
#data preprocessing
def create_examples(text, scores):
    """Build one BERT ``InputExample`` per (text, label) pair.

    Texts that tokenize to fewer than ``max_seq_len`` tokens are used as they
    are. Longer texts are shortened to a head part plus a tail part made of
    whole sentences, each aimed at ``max_seq_len // 2`` tokens, joined into a
    single string. Every example is single-segment (``text_b`` is ``None``).

    Parameters
    ----------
    text : iterable of str
        The texts to convert.
    scores : iterable
        The label for each text, in the same order.

    Returns
    -------
    list
        ``bert.run_classifier.InputExample`` objects, one per input pair.

    Relies on the notebook-level ``tokenizer`` and ``max_seq_len``.
    """
    # Token budget for each of the head and tail parts of a long text.
    target_length = max_seq_len // 2

    examples = []
    for t, s in zip(text, scores):
        t = t.replace("<br />", "")   # custom data clean-up
        tokens = tokenizer.tokenize(t)

        if len(tokens) < max_seq_len:
            # Short enough: keep the whole text. (Previously a literal 128,
            # which would silently disagree with max_seq_len if it changed.)
            text_a = t
        else:
            # Split once and reuse for both passes.
            # NOTE(review): the cleaning step removes punctuation, so there may
            # be few sentence boundaries left to find here - confirm.
            sentences = nltk.tokenize.sent_tokenize(t)

            # Head: append whole sentences while the head is still under
            # budget. The check happens before appending, so the head can end
            # up longer than target_length.
            head = ""
            for sentence in sentences:
                if len(tokenizer.tokenize(head)) < target_length:
                    head += sentence + " "
                else:
                    break
            head = head.strip()  # remove trailing whitespace

            # Tail: prepend whole sentences from the end as long as the tail
            # stays within budget.
            tail = ""
            for sentence in reversed(sentences):
                candidate = sentence + " " + tail
                if len(tokenizer.tokenize(candidate)) <= target_length:
                    tail = candidate
                else:
                    break
            tail = tail.strip()  # remove trailing whitespace

            # Join head and tail into one segment and collapse double spaces.
            text_a = (head + " " + tail).replace("  ", " ")

        example = bert.run_classifier.InputExample(guid=None, text_a=text_a, text_b=None, label=s)
        examples.append(example)
    return examples
# Build the training examples from the class-balanced lists produced by the
# balancing cell. They were previously built from the unbalanced split, which
# left balanced_train_text / balanced_train_scores unused even though the
# features are written to a file named "balanced_training".
train_examples = create_examples(balanced_train_text, balanced_train_scores)
# Dev and test keep their natural class distribution.
dev_examples = create_examples(dev_text, dev_scores)
test_examples = create_examples(test_text, test_scores)


In [None]:
# Convert each set of examples to features and write it to its own file.
writer = bert.run_classifier.file_based_convert_examples_to_features
for _examples, _output_file in (
    (train_examples, "balanced_training"),
    (dev_examples, "dev"),
    (test_examples, "test"),
):
    writer(_examples, label_list, max_seq_len, tokenizer, _output_file)