In [1]:
# Importing the dependencies

import glob
import numpy as np
import os
import re

import os.path
import xml.etree.ElementTree as ET                  # necessary to process .xml files

from random import shuffle
from keras.preprocessing import sequence            # necessary for padding
from keras.models import Sequential                 # Base Keras NN model
from keras.layers import Conv1D, GlobalMaxPooling1D # Convolution layer and pooling
from keras.layers import Dense, Dropout, Activation # The objects for each layer
from nltk.tokenize import TreebankWordTokenizer     # Tokenizer
from nltk.stem.porter import PorterStemmer          # Stemmer
from gensim.models.keyedvectors import KeyedVectors

In [2]:
# Let's load this data

def pre_process_data(filepath):
    male_path = os.path.join(filepath, 'male')
    female_path = os.path.join(filepath, 'female')
    male_label = 1
    female_label = 0
    dataset = []
    for filename in glob.glob(os.path.join(male_path, '*.txt')):
        with open(filename, 'r', encoding="utf-8") as f:
            dataset.append((male_label, f.read()))
    for filename in glob.glob(os.path.join(female_path, '*.txt')):
        with open(filename, 'r', encoding="utf-8") as f:
            dataset.append((female_label, f.read()))
    shuffle(dataset)
    return dataset

def collect_expected(dataset):
    return [sample[0] for sample in dataset]

In [3]:
corpus_path = "C:\\Users\\morzm\\jup_txts\\merged_corpus\\"
#google_vectors = "C:\\Users\\morzm\\jup_txts\\GoogleNews-vectors-negative300.bin.gz"

In [4]:
# Preprocessing the data
dataset = pre_process_data(corpus_path)
#dataset[0]

In [5]:
# Loading the fasttext embeddings

fasttext_vectors = 'crawl-300d-2M.vec'
word_vectors = KeyedVectors.load_word2vec_format(fasttext_vectors, binary=False, limit=400000)

In [6]:
# NO STEMMING AND STOPWORDS

my_word_stop = ['the', 'in', 'of', 'is', 'a', 'to', 'an', 'be']

def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []

    for sample in dataset:
        lowercase = sample[1].lower()
        tokens = tokenizer.tokenize(lowercase)
        sample_vecs = []
        for token in tokens:
            if token not in my_word_stop:
                #print(token)
                try:
                    sample_vecs.append(word_vectors[token])
                except KeyError:
                    pass # No matching token in the Google w2v vocab
        vectorized_data.append(sample_vecs)

    return vectorized_data

In [7]:
# Method to get the target labels
def collect_expected(dataset):
    """ Peel off the target values from the dataset """
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected

In [8]:
# Loading the dataset

vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

#vectorized_data[0]

In [207]:
# Creating partitions
# 70% train | 15% validate | 15% test

split_point = int(len(vectorized_data)*.7)
further_split = int(split_point + len(vectorized_data)*.15)
print(len(vectorized_data))
print(split_point)
print(further_split)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_val = vectorized_data[split_point:further_split]
y_val = expected[split_point:further_split]
x_test = vectorized_data[further_split:]
y_test = expected[further_split:]
# should work. go from 0 to a then from a to b then from b to end

# FOR REFERENCE
# split_point = int(len(vectorized_data)*.7)
# x_train = vectorized_data[:split_point] # there's a typo in this line, if copying from the book
# y_train = expected[:split_point]
# x_test = vectorized_data[split_point:]
# y_test = expected[split_point:]

3001
2100
2550


In [208]:
# Network parameters

maxlen = 600          # 600
batch_size = 32       # no difference on accuracy
embedding_dims = 300  # Same as Google's
filters = 250
kernel_size = 3       # 3 or 5, no difference on accuracy
hidden_dims = 250
epochs = 4            # 2 or 3



In [209]:
# Method to pad or truncate the input
# (notice that this code is quite verbose)
def pad_trunc(data, maxlen):
    """
    For a given dataset pad with zero vectors or truncate to maxlen
    """
    new_data = []
    # Create a vector of 0s the length of our word vectors
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)

    for sample in data:
        if len(sample) > maxlen:
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            temp = sample
            # Append the appropriate number 0 vectors to the list
            additional_elems = maxlen - len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)

    return new_data

In [210]:
# Padding/truncating the data (if necessary)

x_train = pad_trunc(x_train, maxlen)
x_val = pad_trunc(x_val, maxlen)
x_test = pad_trunc(x_test, maxlen)

# The shape is [number of samples, sequence length, word vector]  CUBE 
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_val = np.reshape(x_val, (len(x_val), maxlen, embedding_dims))
y_val = np.array(y_val)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [211]:
print('Building model...')
model = Sequential()   # The standard NN model
model.add(Conv1D(      # Adding a convolutional layer
    filters,
    kernel_size,
    padding='valid',   # in this example the output is going to be lightly smaller
    activation='relu',
    strides=1,         # the shift
    input_shape=(maxlen, embedding_dims))
    )

#model.summary()
# Formulation: max (0, dot(filter, 3-gram))

Building model...


In [212]:
# Adding the max pooling. What is max pooling? For each filter one new version of the instance is produced, Pooling evenly divides the output of each filter into subsections
#It selects (or computes) a representative value for each subsection.
# Alternatives 
# - GlobalMaxPooling1D() (the max for the entire filter's output)
# - MaxPooling1D(n)  (the max for a specific area of n; default n=2)
# - AvgPooling1D(n)

model.add(GlobalMaxPooling1D())

In [213]:
# Adding dropout (20% of the data will be "cancelled")
model.add(Dense(hidden_dims))
model.add(Dropout(0.5))
model.add(Activation('relu'))

In [214]:
# The classification layer!
# sigmoid range: [0,1]
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [215]:
# Compiling the CNN
model.compile(loss='binary_crossentropy',
        optimizer='adam',   # don't question it, just use adam
        metrics=['accuracy']
        )

model.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_18 (Conv1D)          (None, 598, 250)          225250    
                                                                 
 global_max_pooling1d_18 (Gl  (None, 250)              0         
 obalMaxPooling1D)                                               
                                                                 
 dense_36 (Dense)            (None, 250)               62750     
                                                                 
 dropout_18 (Dropout)        (None, 250)               0         
                                                                 
 activation_36 (Activation)  (None, 250)               0         
                                                                 
 dense_37 (Dense)            (None, 1)                 251       
                                                     

In [216]:
# Fitting (training) the model
model.fit(x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val)
    )

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x2588597bd60>

In [217]:
# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(x_test, y_test, batch_size=128)
print("test loss, test acc:", results)

# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
print("Generate predictions for 3 samples")
predictions = model.predict(x_test[:3])
print("predictions shape:", predictions.shape)


Evaluate on test data
test loss, test acc: [0.45132648944854736, 0.8181818127632141]
Generate predictions for 3 samples
predictions shape: (3, 1)


In [144]:
from sklearn.metrics import confusion_matrix

y_pred = model.predict(x_test)
result = confusion_matrix = confusion_matrix(y_test, np.rint(y_pred))
result

array([[164,  52],
       [ 29, 205]], dtype=int64)

In [219]:
# Saving the model
model_structure = model.to_json()
with open("fasttext_cnn_model.json", "w") as json_file:
    json_file.write(model_structure)  # saves just the architecture
model.save_weights("fasttext_cnn_weights.h5")  # saves the weights

In [220]:
# Predicting a new instance

sample_1 = "C:\\Users\\morzm\\jup_txts\\corpus\\male\\aa4b605f6679148ff186c46a616bfe8a.txt"

vec_list = tokenize_and_vectorize([(1, sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen,\
        embedding_dims))
model.predict(test_vec)

array([[0.5532177]], dtype=float32)

In [103]:
# Get the class
print((model.predict(test_vec) > 0.5).astype("int32"))

if model.predict(test_vec) > 0.5:
    print("Male")
else:
    print("Female")

[[1]]
Male
