In [1]:
%load_ext autoreload
%autoreload 2
import discrimination
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dense, Activation, Flatten, Dropout
from keras.models import Sequential
from keras import regularizers
import itertools
import pickle
import random
import re
import os

INFO: {'User-Agent': 'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16'}
Using TensorFlow backend.


---
### Get all texts and split into sexist and non-sexist. For each set, make a list with all sentences.
---

In [None]:
# Get all texts and combine
sex_txts = pickle.load(open("pickles2/sexist_new.p", "rb"))
nsex_txts = pickle.load(open("pickles2/nonsexist_new.p", "rb"))
texts_diary = pickle.load(open("pickles/texts_diary.p", "rb"))
texts_mydiary = pickle.load(open("pickles/texts_mydiary.p", "rb"))
texts_everydaysexism = pickle.load(open("pickles/texts_everydaysexism.p", "rb"))
# New texts come in tuples, with source info, not strings. Fix this
sex_txts = [item[0] for item in sex_txts]
nsex_txts = [item[0] for item in nsex_txts]
# Remove /n from texts (as much as possible) 
sex_txts = [item.replace(" /n ", "") for item in sex_txts]
nsex_txts = [item.replace(" /n ", "") for item in nsex_txts]
# Add the old texts
sex_txts.extend(texts_everydaysexism)
nsex_txts.extend(texts_diary)
nsex_txts.extend(texts_mydiary)

In [None]:
# Split into sentences
sex_sentences = discrimination.texts.sentences_split(sex_txts)
nsex_sentences = discrimination.texts.sentences_split(nsex_txts)
# Save
pickle.dump(sex_txts, open("pickles3/sex_txts.p", "wb"))
pickle.dump(nsex_txts, open("pickles3/nsex_txts.p", "wb"))
pickle.dump(sex_sentences, open("pickles3/sex_sentences.p", "wb"))
pickle.dump(nsex_sentences, open("pickles3/nsex_sentences.p", "wb"))

---
### Tokenize and check how the length distribution looks across the two groups.
---

In [None]:
# Tokenize texts and remove stop-words
sex_tkns = discrimination.texts.tokenize(sex_txts)
nsex_tkns = discrimination.texts.tokenize(nsex_txts)
# Remove stop-words a second time, in case some stopwords where misspelled.
sex_tkns = discrimination.texts.remove_stopwords(sex_tkns)
nsex_tkns = discrimination.texts.remove_stopwords(nsex_tkns)

In [None]:
# Check the average token length (words per token) in each group
sex_wd_cnt = 0
for tkn in sex_tkns:
    sex_wd_cnt += len(tkn)
nsex_wd_cnt = 0
for tkn in nsex_tkns:
    nsex_wd_cnt += len(tkn)

print("Av. number of words-per-token in non-sexist texts is", round(nsex_wd_cnt/len(nsex_tkns),1))
print("Av. number of words-per-token in sexist texts is", round(sex_wd_cnt/len(sex_tkns),1))

In [None]:
# Make some graphs of the number of words per token for more information
list_of_tokens = [sex_tkns, nsex_tkns]
legend = ["Sexist", "Non-Sexist"]
discrimination.texts.tokens_plot(list_of_tokens, (0,0.07), 135, [10,5], 90, legend)

First of all, the reason it appears that sexist texts have more words per token, when in reality they don't is due to the fatter right-tail of non-sexist tokens, for lengths larger than 100.

However, since extremely few tokens have lengths larger than 128 words, will pad tokens at 128. At the same time I will randomly discard non-sexist tokens with very few words and with more than 100, in an effort to match the distribution.

---
### Match the distribution of the two groups by discarding and splitting non-sexist texts, that are anyway more than the sexist ones.
---

In [None]:
# Load
sex_txts = pickle.load(open("pickles3/sex_txts.p", "rb"))
nsex_txts = pickle.load(open("pickles3/nsex_txts.p", "rb"))
sex_sentences = pickle.load(open("pickles3/sex_sentences.p", "rb"))
nsex_sentences = pickle.load(open("pickles3/nsex_sentences.p", "rb"))

In [None]:
# Split non-sexist sentences into two groups. Those with 50 or more sentneces, and the rest.
a = []
b = []
for item in nsex_sentences:
    if len(item) > 100:
        a.append(item)
    else:
        b.append(item)
# For the group with 40 or more sentences, split all items into 2 sentences.
temp = []
for item in a:
    for i in range(0, len(item), 2):
        temp.append(item[i: i+2])
# Join the group to form the "new" non-sexist sentences
nsex_sentences = b
nsex_sentences.extend(temp)
# Make the sentences into texts again
texts = []
for item in nsex_sentences:
    text = " ".join(item)
    texts.append(text)
# These are the "new" non-sexist texts
nsex_txts = texts

In [None]:
# Tokenize texts and remove stop-words
sex_tkns = discrimination.texts.tokenize(sex_txts)
nsex_tkns = discrimination.texts.tokenize(nsex_txts)
# Spell-check tokens. (Notificaiton every 20.000 tokens)
sex_tkns = discrimination.texts.spellcheck_tokens(sex_tkns)
nsex_tkns = discrimination.texts.spellcheck_tokens(nsex_tkns)
# Remove stop-words a second time, in case some stopwords where misspelled.
sex_tkns = discrimination.texts.remove_stopwords(sex_tkns)
nsex_tkns = discrimination.texts.remove_stopwords(nsex_tkns)

In [None]:
# Discard all tokens with length larger than 128 from both groups
sex_tkns = [tkn for tkn in sex_tkns if len(tkn) <=128]
nsex_tkns = [tkn for tkn in nsex_tkns if len(tkn) <=128]

# With a probability of 3% keep non-sexist tokens with a length of 1,2,3, and 4
a = [tkn for tkn in nsex_tkns if len(tkn) <= 3 and random.random() < 0.03]
# With a probability of 10% keep non-sexist tokens with a length of 4
b = [tkn for tkn in nsex_tkns if len(tkn) == 4 and random.random() < 0.1]
# With a probability of 20% keep non-sexist tokens with a length of 5
c = [tkn for tkn in nsex_tkns if len(tkn) == 5 and random.random() < 0.2]
# With a probability of 40% keep non-sexist tokens with a length of 6
d = [tkn for tkn in nsex_tkns if len(tkn) == 6 and random.random() < 0.4]
# With a probability of 60% keep non-sexist tokens with a length of 7
e = [tkn for tkn in nsex_tkns if len(tkn) == 7 and random.random() < 0.6]
# With a probability of 70% keep non-sexist tokens with a length of 8
f = [tkn for tkn in nsex_tkns if len(tkn) == 8 and random.random() < 0.7]
# Keep all non-sexist tokens with a length larger than 8
nsex_tkns = [tkn for tkn in nsex_tkns if len(tkn) > 8]

# Combine all
nsex_tkns.extend(a)
nsex_tkns.extend(b)
nsex_tkns.extend(c)
nsex_tkns.extend(d)
nsex_tkns.extend(e)
nsex_tkns.extend(f)

In [None]:
# With a probability of 30% separate non-sexist tokens with a length between 15 and 25
sep1 = []
sep2 = []
for tkn in nsex_tkns:
    if 15 <= len(tkn) <= 25 and random.random() < 0.3:
        sep1.append(tkn)
    else:
        sep2.append(tkn)
a = sep2
# Join each pair of the separated tokens into one larger token.
b = []
for i in range(0, len(sep1) - 1, 2):
    tkn = sep1[i]
    tkn.extend(sep1[i+1])
    b.append(tkn)
# Form the new non-sexist tokens
nsex_tkns = a
nsex_tkns.extend(b)

In [None]:
# With a probability of 20% separate non-sexist tokens with a length between 26 and 42
sep1 = []
sep2 = []
for tkn in nsex_tkns:
    if 26 <= len(tkn) <= 42 and random.random() < 0.2:
        sep1.append(tkn)
    else:
        sep2.append(tkn)
a = sep2
# Join each pair of the separated tokens into one larger token.
b = []
for i in range(0, len(sep1) - 1, 2):
    tkn = sep1[i]
    tkn.extend(sep1[i+1])
    b.append(tkn)
# Form the new non-sexist tokens
nsex_tkns = a
nsex_tkns.extend(b)

In [None]:
# With a probability of 22% separate non-sexist tokens with a length of 7
sep1 = []
sep2 = []
for tkn in nsex_tkns:
    if len(tkn) == 7 and random.random() < 0.22:
        sep2.append(tkn)
    else:
        sep1.append(tkn)
nsex_tkns = sep1
# With a probability of 18% separate non-sexist tokens with a length of 8
sep1 = []
sep3 = []
for tkn in nsex_tkns:
    if len(tkn) == 8 and random.random() < 0.18:
        sep3.append(tkn)
    else:
        sep1.append(tkn)
nsex_tkns = sep1
# With a probability of 30% separate non-sexist tokens with a length of 9
sep1 = []
sep4 = []
for tkn in nsex_tkns:
    if len(tkn) == 9 and random.random() < 0.3:
        sep4.append(tkn)
    else:
        sep1.append(tkn)
nsex_tkns = sep1
# With a probability of 14% separate non-sexist tokens with a length of 10
sep1 = []
sep5 = []
for tkn in nsex_tkns:
    if len(tkn) == 10 and random.random() < 0.14:
        sep5.append(tkn)
    else:
        sep1.append(tkn)
nsex_tkns = sep1

In [None]:
# Join all the separated tokens in a new variable and shuffle it
separated = sep2.copy()
separated.extend(sep3)
separated.extend(sep4)
separated.extend(sep5)
# Average token length is 8.5 so after randomizing join every 6 tokens together and add to the nsex_tkns
random.shuffle(separated)
new = []
for i in range(0, len(separated) - 5, 6):
    tkn = separated[i]
    tkn.extend(separated[i+1])
    tkn.extend(separated[i+2])
    tkn.extend(separated[i+3])
    tkn.extend(separated[i+4])
    tkn.extend(separated[i+5])
    new.append(tkn)
nsex_tkns.extend(new)

In [None]:
# Make some graphs of the number of words per token for more information
list_of_tokens = [sex_tkns, nsex_tkns]
legend = ["Sexist", "Non-Sexist"]
discrimination.texts.tokens_plot(list_of_tokens, (0,0.06), 128, [10,5], 90, legend)

In [None]:
# Save
pickle.dump(sex_tkns, open("pickles3/sex_tkns.p", "wb"))
pickle.dump(nsex_tkns, open("pickles3/nsex_tkns.p", "wb"))

___
### NN preparation
___

In [3]:
# Load tokens
sex_tkns = pickle.load(open("pickles3/sex_tkns.p", "rb"))
nsex_tkns = pickle.load(open("pickles3/nsex_tkns.p", "rb"))

In [4]:
# Match the length of sex_tkns by randomly dropping nsex_tkns
nsex_tkns = random.sample(nsex_tkns, len(sex_tkns))

In [5]:
# Convert tokens back to text for Keras
keras_txts = []
for tkn in itertools.chain(sex_tkns, nsex_tkns):
    txt = " ".join(tkn)     
    keras_txts.append(txt)
    
# Create labels
keras_labels = np.zeros(len(keras_txts))
keras_labels[:len(sex_tkns)] = 1

In [6]:
# Tokenizing - Sequencing
tokenizer = Tokenizer(lower = False)
tokenizer.fit_on_texts(keras_txts)
sequences = tokenizer.texts_to_sequences(keras_txts)
word_index = tokenizer.word_index

# Create and shuffle data and labels. MAXLEN = 128 
data = pad_sequences(sequences, maxlen=128)
labels = np.asarray(keras_labels)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Split 80-20
nb_validation_samples = int(0.2 * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [7]:
# Parse the GloVe word embeddings
glove_dir = "glove/"
embeddings_index = {}
f = open(os.path.join(glove_dir, "glove.42B.300d.txt"))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype="float32")
    embeddings_index[word] = coefs
f.close()

In [8]:
# Create the embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
# Delete the embeddings index as it's no longer needed.
del embeddings_index
# Create the embedding layer. INPUT LENGTH 128
embedding_layer = Embedding(len(word_index) + 1, 300, input_length=128,
                            weights=[embedding_matrix],
                            trainable=False)

---
### NN setup and compilation
---

In [9]:
# Setup
model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(256, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
model.add(Dense(128, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
model.add(Dense(32, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
model.add(Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 128, 300)          43193700  
_________________________________________________________________
flatten_1 (Flatten)          (None, 38400)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 38400)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               9830656   
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_3 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                

In [11]:
# Compilation
model.compile(optimizer = "Adam",
              loss = "binary_crossentropy",
              metrics = ["acc"])
history = model.fit(x_train, y_train,
                    epochs = 3,
                    batch_size = 256,
                    validation_data = (x_val, y_val))

# Save model weights
model.save_weights("pickles3/model3b2.h5")

Train on 325957 samples, validate on 81489 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [12]:
# Load weights
model.load_weights("pickles3/model3b2.h5")
# Predictions
predictions = model.predict(data)
# Save
pickle.dump(predictions, open("pickles3/predictions2.p", "wb"))

In [13]:
# Create a predicted labels list
labels_predicted = []
for prediction in predictions:
    labels_predicted.append( round(prediction[0]) )
# Calculate the confusion matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
CF = confusion_matrix(labels, labels_predicted)
#"Disentangle" the matrix
TN = round((CF[0,0] / sum(CF[0,:])) * 100, 1)
FN = round((CF[0,1] / sum(CF[0,:])) * 100, 1)
TP = round((CF[1,1] / sum(CF[1,:])) * 100, 1)
FP = round((CF[1,0] / sum(CF[1,:])) * 100, 1)
GTN = round((CF[0,0] / (sum(CF[0,:]) + sum(CF[1,:]))) * 100, 1)
GFN = round((CF[0,1] / (sum(CF[0,:]) + sum(CF[1,:]))) * 100, 1)
GTP = round((CF[1,1] / (sum(CF[0,:]) + sum(CF[1,:]))) * 100, 1)
GFP = round((CF[1,0] / (sum(CF[0,:]) + sum(CF[1,:]))) * 100, 1)
# Print the results
print("True positives account for "+str(TP)+"% or "+str(GTP)+"% of the total (sexist texts labelled as sexist).")
print("False positives account for "+str(FP)+"% or "+str(GFP)+"% of the total (sexist texts labelled as non-sexist).")
print("True negatives account for "+str(TN)+"% or "+str(GTN)+"% of the total (non-sexist texts labelled as non-sexist).")
print("False negatives account for "+str(FN)+"% or "+str(GFN)+"% of the total (non-sexist texts labelled as sexist).")

True positives account for 94.9% or 47.5% of the total (sexist texts labelled as sexist).
False positives account for 5.1% or 2.5% of the total (sexist texts labelled as non-sexist).
True negatives account for 95.8% or 47.9% of the total (non-sexist texts labelled as non-sexist).
False negatives account for 4.2% or 2.1% of the total (non-sexist texts labelled as sexist).


## Test the model!

In [2]:
# Test the network
test = ['''Women''']

# Convert the test phrase to lowercase, tokenize, spellcheck, remove stopwords. 
test = discrimination.texts.lowercase(test)
test = discrimination.texts.tokenize(test)
test = discrimination.texts.spellcheck_tokens(test)
test = discrimination.texts.remove_stopwords(test)

# Convert the token back to text, sequence it, pad it, feed it into the model.
text = ""
for item in test:
    for word in item:
        text += word + " "   
test_sequence = tokenizer.texts_to_sequences([text])

x_test = pad_sequences(test_sequence, maxlen=128)
model.load_weights("pickles3/model3b2.h5")
# Make the output look pretty... because it deserves it.
str(round(model.predict(x_test)[0,0]*100,0))[:-2] + "% sexist"

NameError: name 'tokenizer' is not defined