#### Libraries

In [1]:
%load_ext autoreload
%autoreload 2
import discrimination
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dense, Activation, Flatten, Dropout
from keras.models import Sequential
from keras import regularizers
import itertools
import pickle
import random
import re
import os

INFO: {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201'}
Using TensorFlow backend.


---
### Split texts into sentences, combine in two groups, and tokenize
---

In [2]:
# Load texts
nsex_txts = pickle.load(open("pickles/texts_diary.p", "rb"))
nsex_txts.extend(pickle.load(open("pickles/texts_mydiary.p", "rb")))
sex_txts = pickle.load(open("pickles/texts_everydaysexism.p", "rb"))

# Split each text into a list of sentences
nsex_temp = discrimination.texts.sentences_split(nsex_txts)
sex_temp = discrimination.texts.sentences_split(sex_txts)

# Combine all lists in one
nsex_sentences = []
for item in nsex_temp:
    for sentence in item:
        nsex_sentences.append(sentence)
sex_sentences = []
for item in sex_temp:
    for sentence in item:
        sex_sentences.append(sentence)

In [3]:
# Tokenize sentences and remove stop-words
sex_tokens = discrimination.texts.tokenize(sex_sentences)
nsex_tokens = discrimination.texts.tokenize(nsex_sentences)
# Spell-check tokens. This actually takes some time (not too much) so there's a timer every 20.000 tokens checked.
sex_tokens = discrimination.texts.spellcheck_tokens(sex_tokens)
nsex_tokens = discrimination.texts.spellcheck_tokens(nsex_tokens)
# Remove stop-words a second time, in case some stopwords where misspelled.
sex_tokens = discrimination.texts.remove_stopwords(sex_tokens)
nsex_tokens = discrimination.texts.remove_stopwords(nsex_tokens)

100000 tokens spell-checked.


KeyboardInterrupt: 

In [None]:
for 

In [8]:
print(len(sex_tokens),"sexist sentences\tokens and",len(nsex_tokens),"non-sexist sentences\tokens.")

375849 sexist sentences	okens and 1257041 non-sexist sentences	okens.


In [None]:
# Save
pickle.dump(nsex_tokens, open("pickles4/nsex_tokens.p", "wb"))
pickle.dump(sex_tokens, open("pickles4/sex_tokens.p", "wb"))
pickle.dump(sex_sentences, open("pickles4/sex_sentences.p", "wb"))
pickle.dump(nsex_sentences, open("pickles4/nsex_sentences.p", "wb"))

---
### Convert tokens back to text. Label the old texts. Save.
---

In [4]:
# Load
sex_tokens = pickle.load(open("pickles4/sex_tokens.p", "rb"))
nsex_tokens = pickle.load(open("pickles4/nsex_tokens.p", "rb"))

In [7]:
# Remove tokens with less than 3 words.
temp = []
for token in sex_tokens:
    if len(token) >= 3:
        temp.append(token)
sex_tokens = temp.copy()
temp.clear()
for token in nsex_tokens:
    if len(token) >= 3:
        temp.append(token)
nsex_tokens = temp.copy()

In [9]:
# Randomly keep only as many non-sexist tokens as sexist ones.
nsex_tokens = random.sample(nsex_tokens, len(sex_tokens))

In [10]:
# Convert tokens back to text for Keras to be happy
keras_sentences = []
for token in itertools.chain(sex_tokens, nsex_tokens):
    sentence = " ".join(token)    
    keras_sentences.append(sentence)

# Create labels
keras_labels = np.zeros(len(keras_sentences))
keras_labels[:len(sex_tokens)] = 1

In [11]:
# Save
pickle.dump(keras_sentences, open("pickles4/keras_sentences.p", "wb"))
pickle.dump(keras_labels, open("pickles4/keras_labels.p", "wb"))

___
### NN preparation
___

In [12]:
# Load
keras_sentences = pickle.load(open("pickles4/keras_sentences.p", "rb"))
keras_labels = pickle.load(open("pickles4/keras_labels.p", "rb"))

In [15]:
# Tokenizing - Sequencing
tokenizer = Tokenizer(lower = False)
tokenizer.fit_on_texts(keras_sentences)
sequences = tokenizer.texts_to_sequences(keras_sentences)
word_index = tokenizer.word_index

# Create and shuffle data and labels
data = pad_sequences(sequences, maxlen=50)

keras_labels = np.zeros(len(keras_sentences))
keras_labels[:len(sex_tokens)] = 1

labels = np.asarray(keras_labels)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Split 80-20
nb_validation_samples = int(0.2 * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [16]:
# Parse the GloVe word embeddings
glove_dir = "glove/"
embeddings_index = {}
f = open(os.path.join(glove_dir, "glove.42B.300d.txt"))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype="float32")
    embeddings_index[word] = coefs
f.close()

In [17]:
# Create the embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
# Delete the embeddings index as it's no longer needed.
del embeddings_index
# Create the embedding layer
embedding_layer = Embedding(len(word_index) + 1, 300, input_length=50,
                            weights=[embedding_matrix],
                            trainable=False)

---
### NN setup and compilation
---

In [18]:
# Setup
model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
# model.add(Dropout(0.1))
model.add(Dense(128, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
model.add(Dense(16, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 300)           30264900  
_________________________________________________________________
flatten_1 (Flatten)          (None, 15000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1920128   
_________________________________________________________________
dense_2 (Dense)              (None, 16)                2064      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 32,187,109
Trainable params: 1,922,209
Non-trainable params: 30,264,900
_________________________________________________________________


In [19]:
# Compilation
model.compile(optimizer = "Adam",
              loss = "binary_crossentropy",
              metrics = ["acc"])
history = model.fit(x_train, y_train,
                    epochs = 10,
                    batch_size = 512,
                    validation_data = (x_val, y_val))

# Save model weights
model.save_weights("pickles4/model4.h5")

Train on 601359 samples, validate on 150339 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
# Predictions
predictions = model.predict(data)
# Save
pickle.dump(predictions, open("pickles4/predictions.p", "wb"))

In [21]:
# Load labels and predictions
keras_labels = pickle.load(open("pickles4/keras_labels.p", "rb"))
predictions = pickle.load(open("pickles4/predictions.p", "rb"))

# Create a predicted labels list
labels_predicted = []
for prediction in predictions:
    labels_predicted.append( round(prediction[0]) )
# Calculate the confusion matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
CF = confusion_matrix(keras_labels, labels_predicted)
#"Disentangle" the matrix
TN = round((CF[0,0] / sum(CF[0,:])) * 100, 1)
FN = round((CF[0,1] / sum(CF[0,:])) * 100, 1)
TP = round((CF[1,1] / sum(CF[1,:])) * 100, 1)
FP = round((CF[1,0] / sum(CF[1,:])) * 100, 1)
GTN = round((CF[0,0] / (sum(CF[0,:]) + sum(CF[1,:]))) * 100, 1)
GFN = round((CF[0,1] / (sum(CF[0,:]) + sum(CF[1,:]))) * 100, 1)
GTP = round((CF[1,1] / (sum(CF[0,:]) + sum(CF[1,:]))) * 100, 1)
GFP = round((CF[1,0] / (sum(CF[0,:]) + sum(CF[1,:]))) * 100, 1)
# Print the results
print("True positives account for "+str(TP)+"% or "+str(GTP)+"% of the total (sexist texts labelled as sexist).")
print("True negatives account for "+str(TN)+"% or "+str(GTN)+"% of the total (non-sexist texts labelled as non-sexist).")
print("False positives account for "+str(FP)+"% or "+str(GFP)+"% of the total (sexist texts labelled as non-sexist).")
print("False negatives account for "+str(FN)+"% or "+str(GFN)+"% of the total (non-sexist texts labelled as sexist).")

True positives account for 47.3% or 23.7% of the total (sexist texts labelled as sexist).
True negatives account for 52.7% or 26.3% of the total (non-sexist texts labelled as non-sexist).
False positives account for 52.7% or 26.3% of the total (sexist texts labelled as non-sexist).
False negatives account for 47.3% or 23.7% of the total (non-sexist texts labelled as sexist).


## Test the model!

In [23]:
# Test the network
test = ['''I don't have an issue with anything except women.''']

# Convert the test phrase to lowercase, tokenize, spellcheck, remove stopwords. 
test = discrimination.texts.lowercase(test)
test = discrimination.texts.tokenize(test)
test = discrimination.texts.spellcheck_tokens(test)
test = discrimination.texts.remove_stopwords(test)

# Convert the token back to text, sequence it, pad it, feed it into the model.
text = ""
for item in test:
    for word in item:
        text += word + " "   
test_sequence = tokenizer.texts_to_sequences([text])

x_test = pad_sequences(test_sequence, maxlen=50)
model.load_weights("pickles4/model4.h5")
# Make the output look pretty... because it deserves it.
str(round(model.predict(x_test)[0,0]*100,0))[:-2] + "% sexist"

'96% sexist'