In [1]:
# Importing the dependencies

import glob
import numpy as np
import os
import re

import os.path
import xml.etree.ElementTree as ET

from random import shuffle
from keras.preprocessing import sequence   # necessary for padding
from keras.models import Sequential        # Base Keras NN model
from keras.layers import Conv1D, GlobalMaxPooling1D # Convolution layer and pooling
from keras.layers import Dense, Dropout, Activation # The objects for each layer
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors


In [2]:
xmls_directory = "C:\\Users\\morzm\\jup_txts\\text\\" # This is the path to the .xml files of the PAN2018 twitter corpus

truth_path = "C:\\Users\\morzm\\jup_txts\\en.txt" # This is the path to the .txt file containing the ids and genders of each twitter user

In [4]:
# This bit of code transforms the .xml files containing the tweets
# on the base of a truth file into .txts, depending on gender
# thus creating a corpus

with open(truth_path, 'r') as f:
    for line in f.readlines():
        string = line.partition(":::") # This line divides the author_id from the gender of the author
        if "male\n" in string:
            save_path = "C:\\Users\\morzm\\jup_txts\\merged_corpus\\male\\"
            author_id = string[0]
            for file in os.listdir(xmls_directory):
                if file.endswith(author_id+".xml"):
                    os.chdir("C:\\Users\\morzm\\jup_txts\\text\\")
                    xml_file = file # so that the file can be parsed by ElementTree
                    tree = ET.parse(xml_file)
                    root = tree.getroot()
                    n = 0
                    
                    with open(os.path.join("C:\\Users\\morzm\\jup_txts\\merged_corpus\\male\\", author_id+".txt"), "w"):
                        for tweets in root.find('documents'):
                            tweet = tweets.text
                            text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', tweet)
                            line_to_write = text + '\n'
                            with open(os.path.join(save_path, author_id+".txt"), 'a', encoding="utf-8") as f:
                              f.write(line_to_write)
                              
        elif "female\n" in string: # same for female authors
            save_path = "C:\\Users\\morzm\\jup_txts\\merged_corpus\\female\\"
            author_id = string[0]
            for file in os.listdir(xmls_directory):
                if file.endswith(author_id+".xml"):
                    os.chdir("C:\\Users\\morzm\\jup_txts\\text\\")
                    xml_file = file
                    tree = ET.parse(xml_file)
                    root = tree.getroot()
                    n = 0
                    
                    with open(os.path.join("C:\\Users\\morzm\\jup_txts\\merged_corpus\\female\\", author_id+".txt"), "w"):
                        for tweets in root.find('documents'):
                            tweet = tweets.text
                            text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', tweet)
                            line_to_write = text + '\n'
                            with open(os.path.join(save_path, author_id+".txt"), 'a', encoding="utf-8") as f:
                              f.write(line_to_write)

In [3]:
# Let's load this data

def pre_process_data(filepath):
    male_path = os.path.join(filepath, 'male')
    female_path = os.path.join(filepath, 'female')
    male_label = 1
    female_label = 0
    dataset = []
    for filename in glob.glob(os.path.join(male_path, '*.txt')):
        with open(filename, 'r', encoding="utf-8") as f:
            dataset.append((male_label, f.read()))
    for filename in glob.glob(os.path.join(female_path, '*.txt')):
        with open(filename, 'r', encoding="utf-8") as f:
            dataset.append((female_label, f.read()))
    shuffle(dataset)
    return dataset

def collect_expected(dataset):
    return [sample[0] for sample in dataset]

In [4]:
corpus_path = "C:\\Users\\morzm\\jup_txts\\merged_corpus\\"
google_vectors = "C:\\Users\\morzm\\jup_txts\\GoogleNews-vectors-negative300.bin.gz"

In [5]:
# Preprocessing the data
dataset = pre_process_data(corpus_path)
dataset[0]

(0,
 '@water_io @Yoav_Hoshen do you sell the smart cap for disposable bottles to the general public? Such a great idea - would love to have one.\n@chellemcquaid save me a spot - keen to join you Nov 2016 but just waiting on time travel ;) #autotweet?\nGood morning #camping #castlepoint #kiwisummer \nProlonging the agony for #ncea results a bit further, the NZQA website appears to be overloaded ...\n@antoniam no issues at all - they came out great. I downloaded design from @canva as "PDF-Print" which I think is max quality?\nMy personalized greeting cards arrived today - designed on @canva and printed by @Vistaprint ... those two are match made in heaven.\n@NZAA Just used the Roadservice app for the first time - really great! #flatbattery\nLoving @edsheeran\'s stories about New Zealand with @scott_mills and @Chris_Stark on bbcradio1 while listening from Wellington, New Zealand\nThe only reason I prune the rose bush #hategardening #lovefreshroses \n@catalystpacific you might be intereste

In [6]:
# Loading the word2vec embeddings

word_vectors = KeyedVectors.load_word2vec_format(google_vectors,
    binary=True, limit=400000)

In [7]:
# Method to tokenise and vectorise all the training data

def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
#    expected = [] this line appears in the book, but it's not necessary here!
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass # No matching token in the Google w2v vocab
        vectorized_data.append(sample_vecs)

    return vectorized_data

In [8]:
# Method to get the target labels
def collect_expected(dataset):
    """ Peel off the target values from the dataset """
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected

In [9]:
# Loading the dataset
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

vectorized_data[0]

[array([-0.21289062, -0.09814453,  0.16015625,  0.18457031,  0.19726562,
         0.09521484,  0.04370117, -0.14550781,  0.01361084, -0.04370117,
        -0.26171875, -0.15039062, -0.16503906, -0.26367188,  0.00662231,
         0.01519775,  0.26367188,  0.20019531, -0.16210938,  0.02429199,
        -0.24023438, -0.08056641,  0.49804688, -0.12353516, -0.24804688,
        -0.01153564, -0.3984375 ,  0.08837891,  0.05957031,  0.09619141,
         0.12890625,  0.0111084 , -0.16503906, -0.45703125, -0.06835938,
        -0.17871094, -0.04345703,  0.234375  ,  0.140625  ,  0.00726318,
        -0.078125  , -0.25195312, -0.02087402,  0.17773438,  0.17382812,
        -0.1328125 , -0.15917969, -0.13867188,  0.08837891,  0.18457031,
        -0.21289062,  0.3359375 ,  0.18554688,  0.51171875,  0.14453125,
         0.05834961, -0.06347656,  0.19921875, -0.04931641, -0.34375   ,
        -0.01794434, -0.06445312, -0.02062988,  0.00976562, -0.21875   ,
        -0.37695312, -0.09472656, -0.00567627,  0.2

In [18]:
# Creating training and validation partitions
# 70% train | 15% validate | 15% test

split_point = int(len(vectorized_data)*.7)
further_split = int(split_point + len(vectorized_data)*.15)
print(len(vectorized_data))
print(split_point)
print(further_split)
x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_val = vectorized_data[split_point:further_split]
y_val = expected[split_point:further_split]
x_test = vectorized_data[further_split:]
y_test = expected[further_split:]
# should work? go from 0 to a then from a to b then from b to end

# FOR REFERENCE
# split_point = int(len(vectorized_data)*.7)
# x_train = vectorized_data[:split_point] # there's a typo in this line, if copying from the book
# y_train = expected[:split_point]
# x_test = vectorized_data[split_point:]
# y_test = expected[split_point:]

3000
2100
2550


In [20]:
# Network parameters

maxlen = 400          # think some more about this
batch_size = 32
embedding_dims = 300  # Same as Google's
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 3            # 2 or 3



In [21]:
# Method to pad or truncate the input
# (notice that this code is quite verbose)
def pad_trunc(data, maxlen):
    """
    For a given dataset pad with zero vectors or truncate to maxlen
    """
    new_data = []
    # Create a vector of 0s the length of our word vectors
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)

    for sample in data:
        if len(sample) > maxlen:
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            temp = sample
            # Append the appropriate number 0 vectors to the list
            additional_elems = maxlen - len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)

    return new_data

In [22]:
# Padding/truncating the data (if necessary)

x_train = pad_trunc(x_train, maxlen)
x_val = pad_trunc(x_val, maxlen)
x_test = pad_trunc(x_test, maxlen)

# The shape is [number of samples, sequence length, word vector]  CUBE 
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_val = np.reshape(x_val, (len(x_val), maxlen, embedding_dims))
y_val = np.array(y_val)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [23]:
print('Building model...')
model = Sequential()   # The standard NN model
model.add(Conv1D(      # Adding a convolutional layer
    filters,
    kernel_size,
    padding='valid',   # in this example the output is going to be lightly smaller
    activation='relu',
    strides=1,         # the shift
    input_shape=(maxlen, embedding_dims))
    )

model.summary()
# Formulation: max (0, dot(filter, 3-gram))

Building model...
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 398, 250)          225250    
                                                                 
Total params: 225,250
Trainable params: 225,250
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Adding the max pooling. What is max pooling? For each filter one new version of the instance is produced, Pooling evenly divides the output of each filter into subsections
#It selects (or computes) a representative value for each subsection.
# Alternatives 
# - GlobalMaxPooling1D() (the max for the entire filter's output)
# - MaxPooling1D(n)  (the max for a specific area of n; default n=2)
# - AvgPooling1D(n)

model.add(GlobalMaxPooling1D())

In [25]:
# Adding dropout (20% of the data will be "cancelled")
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

In [26]:
# The classification layer!
# sigmoid range: [0,1]
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [28]:
# Compiling the CNN
model.compile(loss='binary_crossentropy',
        optimizer='adam',   # don't question it, just use adam
        metrics=['accuracy']
        )
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 398, 250)          225250    
                                                                 
 global_max_pooling1d (Globa  (None, 250)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 250)               62750     
                                                                 
 dropout (Dropout)           (None, 250)               0         
                                                                 
 activation (Activation)     (None, 250)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 251       
                                                      

In [29]:
# Fitting (training) the model
model.fit(x_train, y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val)
    )

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1d9b005d510>

In [30]:
# Evaluate the model on the test data using `evaluate`
print("Evaluate on test data")
results = model.evaluate(x_test, y_test, batch_size=128)
print("test loss, test acc:", results)

# Generate predictions (probabilities -- the output of the last layer)
# on new data using `predict`
print("Generate predictions for 3 samples")
predictions = model.predict(x_test[:3])
print("predictions shape:", predictions.shape)


Evaluate on test data
test loss, test acc: [0.5137109160423279, 0.7333333492279053]
Generate predictions for 3 samples
predictions shape: (3, 1)


In [25]:
# Saving the model
model_structure = model.to_json()
with open("cnn_model.json", "w") as json_file:
    json_file.write(model_structure)  # saves just the architecture
model.save_weights("cnn_weights.h5")  # saves the weights

In [23]:
# Predicting a new instance

# Notice we have both positive and negative words here
sample_1 = "C:\\Users\\morzm\\jup_txts\\corpus\\male\\aa4b605f6679148ff186c46a616bfe8a.txt"

# The first value is a "fake" class (this is the expected input)
vec_list = tokenize_and_vectorize([(1, sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen,\
        embedding_dims))
model.predict(test_vec)

array([[0.5629348]], dtype=float32)

In [24]:
# Get the class
(model.predict(test_vec) > 0.5).astype("int32")

array([[1]])