In [1]:
import discrimination
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dense, Activation, Flatten, Dropout
from keras.models import Sequential
from keras import regularizers
import pickle

INFO: {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'}
Using Theano backend.


In [2]:
# Initial Setup
# Load the pickled texts used to fit the tokenizer. The context manager closes
# the file handle even when unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("pickles/texts_keras.p", "rb") as texts_file:
    texts = pickle.load(texts_file)
embeddings_path = "glove/glove.42B.300d.txt"

# Build the vocabulary without lowercasing (lower=False) and turn every text
# into a list of integer word indices.
tokenizer = Tokenizer(lower = False)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

# Bring every sequence to length 256; this must stay equal to the
# input_length passed to the Embedding layer below.
data = pad_sequences(sequences, maxlen=256)

# Parse the GloVe word embeddings
# Each non-empty line is split on whitespace: the first field is the token and
# the remaining fields are parsed as its float32 vector.
embeddings_index = {}
# Read as UTF-8 explicitly instead of relying on the platform default encoding,
# and let the context manager close the file even if a line fails to parse.
with open(embeddings_path, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line (e.g. a trailing newline) carries no token; skip it
            # rather than failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs

# Create the embedding matrix and layer
# One row per tokenizer index plus one extra row (index 0); rows for words that
# have no GloVe vector are left as zeros.
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, 300))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

# The full GloVe lookup is no longer needed once the matrix is filled.
del embeddings_index

# Frozen embedding layer initialised from the matrix above.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=300,
    input_length=256,
    weights=[embedding_matrix],
    trainable=False,
)

# Model setup - Make sure to update this if you update the model used!
# Layer stack: frozen embeddings -> flatten -> dropout -> two L2-regularised
# ReLU dense layers -> single sigmoid output unit.
model = Sequential([
    embedding_layer,
    Flatten(),
    Dropout(0.5),
    Dense(256, activation="relu", kernel_regularizer=regularizers.l2(0.001)),
    Dense(16, activation="relu", kernel_regularizer=regularizers.l2(0.001)),
    Dense(1, activation="sigmoid"),
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 256, 300)          26588700  
_________________________________________________________________
flatten_1 (Flatten)          (None, 76800)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 76800)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               19661056  
_________________________________________________________________
dense_2 (Dense)              (None, 16)                4112      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 46,253,885
Trainable params: 19,665,185
Non-trainable params: 26,588,700
_________________________________

In [18]:
# Test the network
test = ["All women are beautiful"]

# Run the phrase through the project's preprocessing helpers, in order:
# lowercase, tokenize, spellcheck, remove stopwords.
test = discrimination.texts.lowercase(test)
test = discrimination.texts.tokenize(test)
test = discrimination.texts.spellcheck_tokens(test)
test = discrimination.texts.remove_stopwords(test)

# Convert the tokens back to text, sequence it, pad it, feed it into the model.
# `test` is iterated as a collection of items that are themselves iterated as
# words; all words are joined into one space-separated string.
text = " ".join(word for item in test for word in item)
test_sequence = tokenizer.texts_to_sequences([text])
x_test = pad_sequences(test_sequence, maxlen=256)

# The weights file must match the architecture built in the model-setup cell.
model.load_weights("pickles/model2.h5")

# Output
# Format with an explicit spec instead of slicing the repr of a rounded float,
# which only works while that repr happens to end in ".0".
probability = float(model.predict(x_test)[0, 0])
"{:.0f}% sexist".format(probability * 100)

'28% sexist'

In [7]:
# Load the pickled records; the context manager closes the file handle even
# when unpickling raises.
# NOTE(review): this rebinds `texts`, which the setup cell used for the corpus
# the tokenizer was fitted on - re-running cells out of order will mix the two.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("pickles/record.p", "rb") as record_file:
    texts = pickle.load(record_file)


In [13]:
# Keep only the records whose "truth" field equals 1.
# A comprehension keeps its loop variable local (Python 3), so it does not
# overwrite the notebook-level `text` string built by the prediction cell.
sexist = [record for record in texts if record["truth"] == 1]

In [14]:
# Display how many records were collected into `sexist`.
len(sexist)

97044