#### Libraries

In [1]:
%load_ext autoreload
%autoreload 2
import discrimination
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dense, Activation, Flatten, Dropout
from keras.models import Sequential
from keras import regularizers
import itertools
import pickle
import random
import re
import os

INFO: {'User-Agent': 'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko'}
Using TensorFlow backend.


---
### Get the *first* batch of data and make some statistics about them
---

In [None]:
texts_diary = pickle.load(open("pickles/texts_diary.p", "rb"))
texts_mydiary = pickle.load(open("pickles/texts_mydiary.p", "rb"))
texts_everydaysexism = pickle.load(open("pickles/texts_everydaysexism.p", "rb"))
# split into sentences
sentences_diary = discrimination.texts.sentences_split(texts_diary)
sentences_mydiary = discrimination.texts.sentences_split(texts_mydiary)
sentences_everydaysexism = discrimination.texts.sentences_split(texts_everydaysexism)

In [None]:
# Number of sentences per set
nsd = []
nsmd = []
nses = []
for s in sentences_diary:
    nsd.append(len(s))
for s in sentences_mydiary:
    nsmd.append(len(s))
for s in sentences_everydaysexism:
    nses.append(len(s))

In [None]:
# Number of words per sentence per set
nwd = []
nwmd = []
nwes = []
for s in sentences_diary:
    item = []
    for i in s:
        n = len(re.findall('\w+', i)) 
        item.append(n)
    nwd.append(item)
for s in sentences_mydiary:
    item = []
    for i in s:
        n = len(re.findall('\w+', i)) 
        item.append(n)
    nwmd.append(item)
for s in sentences_everydaysexism:
    item = []
    for i in s:
        n = len(re.findall('\w+', i)) 
        item.append(n)
    nwes.append(item)

In [None]:
# Average sentences
print("Average number of sentences in diary is", sum(nsd)/len(nsd))
print("Average number of sentences in my-diary is", sum(nsmd)/len(nsmd))
print("Average number of sentences in everydaysexism is", sum(nses)/len(nses))

In [None]:
# Average number of words-per-sentence (each text carries an equal weight)
wpsewd = []
wpsewmd = []
wpsewes = []
for i, n in enumerate(nwd):
    if nsd[i] == 0:
        wpsewd.append(0)
    else:
        wpsewd.append(sum(n)/nsd[i])
for i, n in enumerate(nwmd):
    if nsmd[i] == 0:
        wpsewmd.append(0)
    else:
        wpsewmd.append(sum(n)/nsmd[i])
for i, n in enumerate(nwes):
    if nses[i] == 0:
        wpsewes.append(0)
    else:
        wpsewes.append(sum(n)/nses[i])
        
print("Av. number of words-per-sentence (each text has an equal weight) in diary is", round(sum(wpsewd)/len(wpsewd),2))
print("Av. number of words-per-sentence (each text has an equal weight) in my diary is", round(sum(wpsewmd)/len(wpsewmd),2))
print("Av. number of words-per-sentence (each text has an equal weight) in everyday sexism is", round(sum(wpsewes)/len(wpsewes), 2))

In [None]:
# Average number of words-per-sentence (each sentence carries an equal weight)
twd = []
twmd = []
twes = []
for n in nwd:
    twd.append(sum(n))
for n in nwmd:
    twmd.append(sum(n))
for n in nwes:
    twes.append(sum(n))
        
print("Av. number of words-per-sentence (each sentence has an equal weight) in diary is", round(sum(twd)/sum(nsd),2))
print("Av. number of words-per-sentence (each sentence has an equal weight) in my diary is", round(sum(twmd)/sum(nsmd),2))
print("Av. number of words-per-sentence (each sentence has an equal weight) in everyday sexism is", round(sum(twes)/sum(nses),2))

---
### Split my-diary texts in sentences and tokenize
---

This step is (re)done in order to split up the non-sexist texts from my-diary.org (since they are too long) in order to match the length of the sexist texts from everydaysexism. This greatly increases the number of texts obtained from my-diary.org. 

In [None]:
# Reload and resplit into sentences
texts_diary = pickle.load(open("pickles/texts_diary.p", "rb"))
texts_mydiary = pickle.load(open("pickles/texts_mydiary.p", "rb"))
texts_everydaysexism = pickle.load(open("pickles/texts_everydaysexism.p", "rb"))

sentences_diary = discrimination.texts.sentences_split(texts_diary)
sentences_mydiary = discrimination.texts.sentences_split(texts_mydiary)
sentences_everydaysexism = discrimination.texts.sentences_split(texts_everydaysexism)

Texts from everydaysexism.com and diary.com have way less sentences than my-diary.org. Use a hard limit and split all texts of my-diary.org until they are up to 20 sentences long at most.

In [None]:
texts = []
for sentences in sentences_mydiary:
    division = round( len(sentences) / 20 )
    for i in range(division + 1):
        text = "".join(sentences[20*i : 20*(i+1)])
        if len(text) >= 20:
            texts.append(text)
            
sentences_mydiary2 = discrimination.texts.sentences_split(texts)

This has created obviously a very large spike at 20 sentences (~45%) of the texts. What we are interested to do is to have a similar average token length between the sexist and the non-sexist tokens. Will need to draw values from a normal distribution, discard negative values, and split the texts collected from my-diary according to this number until a similar average is "generated". It will take some fiddling around..

..well it is slightly more complex than expected but it does an ok job in mimicking the distribution of words-per-token in the sexist texts **assuming** one disregards the spike observed in said text for tokens containing around 10 words.

In [None]:
# Split the texts according to a normal with:
mean = 7
st_dev = 8

texts.clear()

for sentences in sentences_mydiary2:   
    draw = -1
    while draw <= 0:
        draw = int(round(random.normalvariate(mean,st_dev),0))
        while draw > 10 and random.random() < 0.3:
            draw = int(round(random.normalvariate(mean,st_dev),0))
        while draw == 1 and random.random() < 0.9:
            draw = int(round(random.normalvariate(mean,st_dev),0))
        while draw >=20 and random.random() < 0.75:
            draw = int(round(random.normalvariate(mean,st_dev),0))
        while 2 <= draw <= 4 and random.random() < 0.2:
            draw = int(round(random.normalvariate(mean,st_dev),0))
    
    division = round( len(sentences) / draw )
    
    for i in range(division + 1):   
        text = "".join(sentences[draw*i : draw*(i+1)])
        if len(text) >= 20:
            texts.append(text)
texts_mydiary = texts

In [None]:
# Tokenize texts and remove stop-words
tokens_diary = discrimination.texts.tokenize(texts_diary)
tokens_mydiary = discrimination.texts.tokenize(texts_mydiary)
tokens_everydaysexism = discrimination.texts.tokenize(texts_everydaysexism)

# Spell-check tokens. This actually takes some time (not too much) so there's a timer every 20.000 tokens checked.
tokens_diary = discrimination.texts.spellcheck_tokens(tokens_diary)
tokens_mydiary = discrimination.texts.spellcheck_tokens(tokens_mydiary)
tokens_everydaysexism = discrimination.texts.spellcheck_tokens(tokens_everydaysexism)

# Remove stop-words a second time, in case some stopwords where misspelled.
tokens_diary = discrimination.texts.remove_stopwords(tokens_diary)
tokens_mydiary = discrimination.texts.remove_stopwords(tokens_mydiary)
tokens_everydaysexism = discrimination.texts.remove_stopwords(tokens_everydaysexism)

In [None]:
nw_diary = 0
nw_mydiary = 0
nw_everydaysexism = 0
for token in tokens_diary:
    nw_diary += len(token)
for token in tokens_mydiary:
    nw_mydiary += len(token)
for token in tokens_everydaysexism:
    nw_everydaysexism += len(token)
    
isit33 = (nw_diary + nw_mydiary ) / (len(tokens_diary)+len(tokens_mydiary))
    
print("Av. number of words-per-token in non-sexist is", round(isit33,1))
print("Av. number of words-per-token in sexist is", round(nw_everydaysexism/len(tokens_everydaysexism),1))

In [None]:
tokens_mydiary.extend(tokens_diary)
list_of_texts = [tokens_everydaysexism, tokens_mydiary]
legend = ["Sexist", "Non-Sexist"]
discrimination.texts.sentences_plot(list_of_texts, (0,0.06), 100, [10,5], 90, legend)

In [None]:
print("37K of English texts collected resulted in", len(texts_mydiary), "after splitting")

In [None]:
texts_mydiary.extend(texts_diary)

In [None]:
# Save
pickle.dump(tokens_everydaysexism, open("pickles2/tkn_sexist.p", "wb"))
pickle.dump(tokens_mydiary, open("pickles2/tkn_notsexist.p", "wb"))
pickle.dump(texts_mydiary, open("pickles2/txts_sexist.p", "wb"))
pickle.dump(texts_everydaysexism, open("pickles2/txts_notsexist.p", "wb"))

---
### Load and clean new texts
---

In [None]:
txts_mgtow = []
table = discrimination.mongo.collection(collection="mgtow")
for x in table.find():
    txts_mgtow.append(x["text"]) 
    
txts_breitbart = []   
table = discrimination.mongo.collection(collection="breitbart")
for x in table.find():
    txts_breitbart.append(x["text"])

txts_9gag = []
table = discrimination.mongo.collection(collection="9gag")
for x in table.find():
    txts_9gag.append(x["text"]) 

txts_misc = []
table = discrimination.mongo.collection(collection="misc_texts")
for x in table.find():
    txts_misc.append(x["text"]) 
    
txts_youtube = []   
table = discrimination.mongo.collection(collection="youtube")
for x in table.find():
    txts_youtube.append(x["text"])

In [None]:
# Keep English, clean, save
txts_mgtow = discrimination.texts.keep_english(txts_mgtow, notify = 25000)
txts_mgtow = discrimination.texts.clean(txts_mgtow)
txts_mgtow = discrimination.texts.lowercase(txts_mgtow)
pickle.dump(txts_mgtow, open("pickles2/txts_mgtow.p", "wb"))

txts_breitbart = discrimination.texts.keep_english(txts_breitbart, notify = 25000)
txts_breitbart = discrimination.texts.clean(txts_breitbart)
txts_mgtow = discrimination.texts.lowercase(txts_breitbart)
pickle.dump(txts_breitbart, open("pickles2/txts_breitbart.p", "wb"))

txts_9gag = discrimination.texts.keep_english(txts_9gag, notify = 25000)
txts_9gag = discrimination.texts.clean(txts_9gag)
txts_mgtow = discrimination.texts.lowercase(txts_9gag)
pickle.dump(txts_9gag, open("pickles2/txts_9gag.p", "wb"))

txts_misc = discrimination.texts.keep_english(txts_misc, notify = 25000)
txts_misc = discrimination.texts.clean(txts_misc)
txts_mgtow = discrimination.texts.lowercase(txts_misc)
pickle.dump(txts_misc, open("pickles2/txts_misc.p", "wb"))

txts_youtube = discrimination.texts.keep_english(txts_youtube, notify = 25000)
txts_youtube = discrimination.texts.clean(txts_youtube)
txts_mgtow = discrimination.texts.lowercase(txts_youtube)
pickle.dump(txts_youtube, open("pickles2/txts_youtube.p", "wb"))

---
### Tokenize new texts
---

In [None]:
# Load
txts_mgtow = pickle.load(open("pickles2/txts_mgtow.p", "rb"))
txts_breitbart = pickle.load(open("pickles2/txts_breitbart.p", "rb"))
txts_9gag = pickle.load(open("pickles2/txts_9gag.p", "rb"))
txts_misc = pickle.load(open("pickles2/txts_misc.p", "rb"))
txts_youtube = pickle.load(open("pickles2/txts_youtube.p", "rb"))

In [None]:
# Tokenize, remove stopwords, spellcheck
tkn_mgtow = discrimination.texts.tokenize(txts_mgtow)
tkn_mgtow = discrimination.texts.spellcheck_tokens(tkn_mgtow)
tkn_mgtow = discrimination.texts.remove_stopwords(tkn_mgtow)

tkn_breitbart = discrimination.texts.tokenize(txts_breitbart)
tkn_breitbart = discrimination.texts.spellcheck_tokens(tkn_breitbart)
tkn_breitbart = discrimination.texts.remove_stopwords(tkn_breitbart)

tkn_9gag = discrimination.texts.tokenize(txts_9gag)
tkn_9gag = discrimination.texts.spellcheck_tokens(tkn_9gag)
tkn_9gag = discrimination.texts.remove_stopwords(tkn_9gag)

tkn_misc = discrimination.texts.tokenize(txts_misc)
tkn_misc = discrimination.texts.spellcheck_tokens(tkn_misc)
tkn_misc = discrimination.texts.remove_stopwords(tkn_misc)

tkn_youtube = discrimination.texts.tokenize(txts_youtube)
tkn_youtube = discrimination.texts.spellcheck_tokens(tkn_youtube)
tkn_youtube = discrimination.texts.remove_stopwords(tkn_youtube)

In [None]:
# Save
pickle.dump(tkn_mgtow, open("pickles2/tkn_mgtow.p", "wb"))
pickle.dump(tkn_breitbart, open("pickles2/tkn_breitbart.p", "wb"))
pickle.dump(tkn_9gag, open("pickles2/tkn_9gag.p", "wb"))
pickle.dump(tkn_misc, open("pickles2/tkn_misc.p", "wb"))
pickle.dump(tkn_youtube, open("pickles2/tkn_youtube.p", "wb"))

---
### Convert all (new and old) tokens back to text. Label the old texts. Save.
---

In [None]:
# Load new tokens
tkn_mgtow = pickle.load(open("pickles2/tkn_mgtow.p", "rb"))
tkn_breitbart = pickle.load(open("pickles2/tkn_breitbart.p", "rb"))
tkn_9gag = pickle.load(open("pickles2/tkn_9gag.p", "rb"))
tkn_misc = pickle.load(open("pickles2/tkn_misc.p", "rb"))
tkn_youtube = pickle.load(open("pickles2/tkn_youtube.p", "rb"))

In [None]:
# Load labelled tokens
tkn_sexist = pickle.load(open("pickles2/tkn_sexist.p", "rb"))
tkn_notsexist = pickle.load(open("pickles2/tkn_notsexist.p", "rb"))

# Load labelled texts
txts_sexist = pickle.load(open("pickles2/txts_sexist.p", "rb"))
txts_notsexist = pickle.load(open("pickles2/txts_notsexist.p", "rb"))

In [None]:
# Convert tokens back to text for Keras
txts_keras_label = []
for tkn in itertools.chain(tkn_sexist, tkn_notsexist):
    txt = ""
    for wd in tkn:
        txt += wd + " "        
    txts_keras_label.append(txt)
    
txts_keras_nolabel = []
for tkn in itertools.chain(tkn_mgtow, tkn_breitbart, tkn_9gag, tkn_misc, tkn_youtube):
    txt = ""
    for wd in tkn:
        txt += wd + " "        
    txts_keras_nolabel.append(txt)
    
# Create labels
label_keras = np.zeros(len(txts_keras_label))
label_keras[:len(tkn_sexist)] = 1

In [None]:
# Create a single text list
txts_new = []
for txt in txts_mgtow:
    txts_new.append((txt, "mgtow"))
for txt in txts_breitbart:
    txts_new.append((txt, "breitbart"))
for txt in txts_9gag:
    txts_new.append((txt, "9gag"))
for txt in txts_misc:
    txts_new.append((txt, "misc"))
for txt in txts_youtube:
    txts_new.append((txt, "youtube"))

In [None]:
# Save
pickle.dump(txts_keras_label, open("pickles2/txts_keras_label.p", "wb"))
pickle.dump(txts_keras_nolabel, open("pickles2/txts_keras_nolabel.p", "wb"))
pickle.dump(label_keras, open("pickles2/label_keras.p", "wb"))
pickle.dump(txts_new, open("pickles2/txts_new.p", "wb"))

___
### NN preparation
___

In [None]:
# Load
txts_keras_label = pickle.load(open("pickles2/txts_keras_label.p", "rb"))
txts_keras_nolabel = pickle.load(open("pickles2/txts_keras_nolabel.p", "rb"))
label_keras = pickle.load(open("pickles2/label_keras.p", "rb"))

In [None]:
txts4fit = txts_keras_label.copy()
txts4fit.extend(txts_keras_nolabel) 
# Tokenizing - Sequencing
tokenizer = Tokenizer(lower = False)
tokenizer.fit_on_texts(txts4fit)
sequences = tokenizer.texts_to_sequences(txts_keras_label)
word_index = tokenizer.word_index

# Create and shuffle data and labels
data = pad_sequences(sequences, maxlen=256)
labels = np.asarray(label_keras)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# Split 80-20
nb_validation_samples = int(0.2 * data.shape[0])
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [None]:
# Parse the GloVe word embeddings
glove_dir = "glove/"
embeddings_index = {}
f = open(os.path.join(glove_dir, "glove.42B.300d.txt"))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype="float32")
    embeddings_index[word] = coefs
f.close()

In [None]:
# Create the embedding matrix
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
# Delete the embeddings index as it's no longer needed.
del embeddings_index
# Create the embedding layer
embedding_layer = Embedding(len(word_index) + 1, 300, input_length=256,
                            weights=[embedding_matrix],
                            trainable=False)

---
### NN setup and compilation
---

In [None]:
# Setup
model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(256, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
model.add(Dense(16, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
model.add(Dense(1, activation="sigmoid"))
model.summary()

In [None]:
# Compilation
model.compile(optimizer = "Adam",
              loss = "binary_crossentropy",
              metrics = ["acc"])
history = model.fit(x_train, y_train,
                    epochs = 3,
                    batch_size = 256,
                    validation_data = (x_val, y_val))

# Save model weights
model.save_weights("pickles2/model3.h5")

---
### Confusion Matrix
---

In [None]:
# Load
txts_keras_nolabel = pickle.load(open("pickles2/txts_keras_nolabel.p", "rb"))
txts_keras_label = pickle.load(open("pickles2/txts_keras_label.p", "rb"))
label_keras = pickle.load(open("pickles2/label_keras.p", "rb"))

In [None]:
# Setup Tokenizer, sequence, etcρίξει την αγνωστικιστική εκκλ
txts4fit = txts_keras_label.copy()
txts4fit.extend(txts_keras_nolabel) 
tokenizer = Tokenizer(lower = False)
tokenizer.fit_on_texts(txts4fit)
del txts_keras_nolabel
sequences = tokenizer.texts_to_sequences(txts_keras_label)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=256)

In [None]:
# Parse the GloVe word embeddings
glove_dir = "glove/"
embeddings_index = {}
f = open(os.path.join(glove_dir, "glove.42B.300d.txt"))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype="float32")
    embeddings_index[word] = coefs
f.close()

In [None]:
# Embedding layer
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
del embeddings_index
embedding_layer = Embedding(len(word_index) + 1, 300, input_length=256,
                            weights=[embedding_matrix],
                            trainable=False)

In [None]:
# Load model and weights
model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(256, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
model.add(Dense(16, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
model.add(Dense(1, activation="sigmoid"))
model.summary()
model.load_weights("pickles2/model3.h5")

In [None]:
# Predictions
predictions_CF = model.predict(data)
# Save
pickle.dump(predictions_CF, open("pickles2/predictions_CF.p", "wb"))

In [13]:
# Load labels and predictions
label_keras = pickle.load(open("pickles2/label_keras.p", "rb"))
predictions_CF = pickle.load(open("pickles2/predictions_CF.p", "rb"))

# Create a predicted labels list
labels_predicted = []
for prediction in predictions_CF:
    labels_predicted.append( round(prediction[0]) )
# Calculate the confusion matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
CF = confusion_matrix(label_keras, labels_predicted)
#"Disentangle" the matrix
TN = round((CF[0,0] / sum(CF[0,:])) * 100, 1)
FN = round((CF[0,1] / sum(CF[0,:])) * 100, 1)
TP = round((CF[1,1] / sum(CF[1,:])) * 100, 1)
FP = round((CF[1,0] / sum(CF[1,:])) * 100, 1)
GTN = round((CF[0,0] / (sum(CF[0,:]) + sum(CF[1,:]))) * 100, 1)
GFN = round((CF[0,1] / (sum(CF[0,:]) + sum(CF[1,:]))) * 100, 1)
GTP = round((CF[1,1] / (sum(CF[0,:]) + sum(CF[1,:]))) * 100, 1)
GFP = round((CF[1,0] / (sum(CF[0,:]) + sum(CF[1,:]))) * 100, 1)
# Print the results
print("True positives account for "+str(TP)+"% or "+str(GTP)+"% of the total (sexist texts labelled as sexist).")
print("True negatives account for "+str(TN)+"% or "+str(GTN)+"% of the total (non-sexist texts labelled as non-sexist).")
print("False positives account for "+str(FP)+"% or "+str(GFP)+"% of the total (sexist texts labelled as non-sexist).")
print("False negatives account for "+str(FN)+"% or "+str(GFN)+"% of the total (non-sexist texts labelled as sexist).")

True positives account for 83.9% or 22.8% of the total (sexist texts labelled as sexist).
True negatives account for 97.0% or 70.7% of the total (non-sexist texts labelled as non-sexist).
False positives account for 16.1% or 4.4% of the total (sexist texts labelled as non-sexist).
False negatives account for 3.0% or 2.2% of the total (non-sexist texts labelled as sexist).


___
### Check new texts for sexism
___

In [3]:
# Load
txts_keras_nolabel = pickle.load(open("pickles2/txts_keras_nolabel.p", "rb"))
txts_keras_label = pickle.load(open("pickles2/txts_keras_label.p", "rb"))

In [4]:
# Setup Tokenizer, sequence, etc
txts4fit = txts_keras_label.copy()
txts4fit.extend(txts_keras_nolabel) 
tokenizer = Tokenizer(lower = False)
tokenizer.fit_on_texts(txts4fit)
del txts_keras_label
sequences = tokenizer.texts_to_sequences(txts_keras_nolabel)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=256)

In [6]:
# Parse the GloVe word embeddings
glove_dir = "glove/"
embeddings_index = {}
f = open(os.path.join(glove_dir, "glove.42B.300d.txt"))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype="float32")
    embeddings_index[word] = coefs
f.close()

In [7]:
# Embedding layer
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
del embeddings_index
embedding_layer = Embedding(len(word_index) + 1, 300, input_length=256,
                            weights=[embedding_matrix],
                            trainable=False)

In [8]:
# Load model and weights
model = Sequential()
model.add(embedding_layer)
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(256, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
model.add(Dense(16, activation="relu", kernel_regularizer = regularizers.l2(0.001)))
model.add(Dense(1, activation="sigmoid"))
model.summary()
model.load_weights("pickles2/model3.h5")

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 256, 300)          81110400  
_________________________________________________________________
flatten_1 (Flatten)          (None, 76800)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 76800)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               19661056  
_________________________________________________________________
dense_2 (Dense)              (None, 16)                4112      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 100,775,585
Trainable params: 19,665,185
Non-trainable params: 81,110,400
________________________________

In [None]:
# Predictions
predictions = model.predict(data)
# Save
pickle.dump(predictions, open("pickles2/predictions.p", "wb"))

---
### Texts predicted to be 90%, or more, sexist are kept as sexist. Texts predicted to be 5%, or less, sexist are kept as non-sexist.
---

In [2]:
# Load
predictions = pickle.load(open("pickles2/predictions.p", "rb"))
txts_new = pickle.load(open("pickles2/txts_new.p", "rb"))

In [5]:
high_cut = 0.95
low_cut = 0.1

sexist_new = []
nonsexist_new = []
for p, t in zip(predictions, txts_new):
    if p > high_cut:
        sexist_new.append(t)
    elif p < low_cut:
        nonsexist_new.append(t)

In [6]:
print("There are", len(sexist_new), "new sexist texts and", len(nonsexist_new), "new non-sexist texts.")

There are 72318 new sexist texts and 408254 new non-sexist texts.


In [None]:
texts = [] 
for item in sexist_new:
    if not re.search('woman|women|man|men', item[0]):
        texts.append(item[0])

In [None]:
len(texts)

In [None]:
random.sample(texts, 10)

In [7]:
# Save
pickle.dump(sexist_new, open("pickles2/sexist_new.p", "wb"))
pickle.dump(nonsexist_new, open("pickles2/nonsexist_new.p", "wb"))

## Test the model!

In [12]:
# Test the network
test = ['''She looks like a slut''']

# Convert the test phrase to lowercase, tokenize, spellcheck, remove stopwords. 
test = discrimination.texts.lowercase(test)
test = discrimination.texts.tokenize(test)
test = discrimination.texts.spellcheck_tokens(test)
test = discrimination.texts.remove_stopwords(test)

# Convert the token back to text, sequence it, pad it, feed it into the model.
text = ""
for item in test:
    for word in item:
        text += word + " "   
test_sequence = tokenizer.texts_to_sequences([text])

x_test = pad_sequences(test_sequence, maxlen=256)
model.load_weights("pickles2/model3.h5")
# Make the output look pretty... because it deserves it.
str(round(model.predict(x_test)[0,0]*100,0))[:-2] + "% sexist"

'27% sexist'