In [3]:
# Importing the dependencies

import glob
import numpy as np
import os
import re

import os.path
import xml.etree.ElementTree as ET

from random import shuffle
from keras.preprocessing import sequence   # necessary for padding
from keras.models import Sequential        # Base Keras NN model
from keras.layers import Conv1D, GlobalMaxPooling1D # Convolution layer and pooling
from keras.layers import Dense, Dropout, Activation # The objects for each layer
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors


In [4]:
xmls_directory = "C:\\Users\\morzm\\jup_txts\\test_text\\" # This is the path to the .xml files of the PAN2018 twitter corpus

truth_path = "C:\\Users\\morzm\\jup_txts\\testc_en.txt" # This is the path to the .txt file containing the ids and genders of each twitter user

In [3]:
# This bit of code transforms the .xml files containing the tweets
# on the base of a truth file into .txts, depending on gender
# thus creating a corpus

with open(truth_path, 'r') as f:
    for line in f.readlines():
        string = line.partition(":::") # This line divides the author_id from the gender of the author
        if "male\n" in string:
            save_path = "C:\\Users\\morzm\\jup_txts\\testing_corpus\\male\\"
            author_id = string[0]
            for file in os.listdir(xmls_directory):
                if file.endswith(author_id+".xml"):
                    os.chdir("C:\\Users\\morzm\\jup_txts\\test_text\\")
                    xml_file = file # so that the file can be parsed by ElementTree
                    tree = ET.parse(xml_file)
                    root = tree.getroot()
                    n = 0
                    
                    with open(os.path.join("C:\\Users\\morzm\\jup_txts\\testing_corpus\\male\\", author_id+".txt"), "w"):
                        for tweets in root.find('documents'):
                            tweet = tweets.text
                            text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', tweet)
                            line_to_write = text + '\n'
                            with open(os.path.join(save_path, author_id+".txt"), 'a', encoding="utf-8") as f:
                              f.write(line_to_write)
                              
        elif "female\n" in string: # same for female authors
            save_path = "C:\\Users\\morzm\\jup_txts\\testing_corpus\\female\\"
            author_id = string[0]
            for file in os.listdir(xmls_directory):
                if file.endswith(author_id+".xml"):
                    os.chdir("C:\\Users\\morzm\\jup_txts\\test_text\\")
                    xml_file = file
                    tree = ET.parse(xml_file)
                    root = tree.getroot()
                    n = 0
                    
                    with open(os.path.join("C:\\Users\\morzm\\jup_txts\\testing_corpus\\female\\", author_id+".txt"), "w"):
                        for tweets in root.find('documents'):
                            tweet = tweets.text
                            text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', tweet)
                            line_to_write = text + '\n'
                            with open(os.path.join(save_path, author_id+".txt"), 'a', encoding="utf-8") as f:
                              f.write(line_to_write)

In [5]:
import tensorflow as tf
from tensorflow import keras
from keras.models import model_from_json

json_file = open("C:\\Users\\morzm\\Jupyter\\model\\cnn_model.json", 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("C:\\Users\\morzm\\Jupyter\\model\\cnn_weights.h5")

In [6]:
loaded_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 398, 250)          225250    
                                                                 
 global_max_pooling1d (Globa  (None, 250)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 250)               62750     
                                                                 
 dropout (Dropout)           (None, 250)               0         
                                                                 
 activation (Activation)     (None, 250)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 251       
                                                        

In [7]:
# Method to tokenise and vectorise all the training data

def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
#    expected = [] this line appears in the book, but it's not necessary here!
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                pass # No matching token in the Google w2v vocab
        vectorized_data.append(sample_vecs)

    return vectorized_data

In [8]:
# Method to pad or truncate the input
# (notice that this code is quite verbose)
def pad_trunc(data, maxlen):
    """
    For a given dataset pad with zero vectors or truncate to maxlen
    """
    new_data = []
    # Create a vector of 0s the length of our word vectors
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)

    for sample in data:
        if len(sample) > maxlen:
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            temp = sample
            # Append the appropriate number 0 vectors to the list
            additional_elems = maxlen - len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)

    return new_data

In [9]:
google_vectors = "C:\\Users\\morzm\\jup_txts\\GoogleNews-vectors-negative300.bin.gz"
# Loading the word2vec embeddings

word_vectors = KeyedVectors.load_word2vec_format(google_vectors,
    binary=True, limit=400000)

In [10]:
maxlen = 400
embedding_dims = 300

In [11]:
# Predicting a new instance

# Notice we have both positive and negative words here
sample_1 = "C:\\Users\\morzm\\jup_txts\\corpus\\male\\aa4b605f6679148ff186c46a616bfe8a.txt"

# The first value is a "fake" class (this is the expected input)
vec_list = tokenize_and_vectorize([(1, sample_1)])
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen,\
        embedding_dims))
loaded_model.predict(test_vec)

array([[0.5629348]], dtype=float32)

In [12]:
# Get the class
if loaded_model.predict(test_vec) > 0.5:
    print("Male")
else:
    print("Female")

Male


In [13]:
corpus_male_path = "C:\\Users\\morzm\\jup_txts\\testing_corpus\\male\\"
corpus_female_path = "C:\\Users\\morzm\\jup_txts\\testing_corpus\\female\\"

In [14]:
# This code tests the model on the pre-existing 'testing' subset of the PAN2018 database
# Can be tweaked to test on any data

n_total = 0 # total instances
pred_n = 0 # correctly predicted instances

with open(truth_path, 'r') as f:
    for line in f.readlines():
        string = line.partition(":::")
        if "male\n" in string:
            author_id = string[0] # need this to open files
            author_gender = "male"
            for file in os.listdir(corpus_male_path):
                if file.endswith(author_id+".txt"):
                    with open("C:\\Users\\morzm\\jup_txts\\testing_corpus\\male\\"+author_id+".txt", "r", encoding="utf-8") as f:
                        testfile = f.readlines()
                        vec_list = tokenize_and_vectorize([(1, str(testfile))])
                        test_vec_list = pad_trunc(vec_list, maxlen)
                        test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen,\
                                embedding_dims))
                        loaded_model.predict(test_vec)                   
   
                        if loaded_model.predict(test_vec) > 0.5:
                            predicted_gender = "male"
                        else:
                            predicted_gender = "female"
                
                        if predicted_gender == author_gender:
                            n_total += 1
                            pred_n += 1
                        else:
                            n_total += 1
            

                              
        elif "female\n" in string:
            author_id = string[0]
            author_gender = "female"
            for file in os.listdir(corpus_female_path):
                if file.endswith(author_id+".txt"):
                    with open("C:\\Users\\morzm\\jup_txts\\testing_corpus\\female\\"+author_id+".txt", "r", encoding="utf-8") as f:
                        testfile = f.readlines()
                        vec_list = tokenize_and_vectorize([(1, str(testfile))])
                        test_vec_list = pad_trunc(vec_list, maxlen)
                        test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen,\
                                embedding_dims))
                        loaded_model.predict(test_vec)
                    
                        if loaded_model.predict(test_vec) > 0.5:
                            predicted_gender = "male"
                        else:
                            predicted_gender = "female"
                
                        if predicted_gender == author_gender:
                            n_total += 1
                            pred_n += 1

                        else:
                            n_total += 1
                    
    sum = pred_n/n_total
    print(sum*100)   # accuracy %


0  out of 1900
1  out of 1900
2  out of 1900
3  out of 1900
4  out of 1900
5  out of 1900
6  out of 1900
7  out of 1900
8  out of 1900
9  out of 1900
10  out of 1900
11  out of 1900
12  out of 1900
13  out of 1900
14  out of 1900
15  out of 1900
16  out of 1900
17  out of 1900
18  out of 1900
19  out of 1900
20  out of 1900
21  out of 1900
22  out of 1900
23  out of 1900
24  out of 1900
25  out of 1900
26  out of 1900
27  out of 1900
28  out of 1900
29  out of 1900
30  out of 1900
31  out of 1900
32  out of 1900
33  out of 1900
34  out of 1900
35  out of 1900
36  out of 1900
37  out of 1900
38  out of 1900
39  out of 1900
40  out of 1900
41  out of 1900
42  out of 1900
43  out of 1900
44  out of 1900
45  out of 1900
46  out of 1900
47  out of 1900
48  out of 1900
49  out of 1900
50  out of 1900
51  out of 1900
52  out of 1900
53  out of 1900
54  out of 1900
55  out of 1900
56  out of 1900
57  out of 1900
58  out of 1900
59  out of 1900
60  out of 1900
61  out of 1900
62  out of 1900
63

491  out of 1900
492  out of 1900
493  out of 1900
494  out of 1900
495  out of 1900
496  out of 1900
497  out of 1900
498  out of 1900
499  out of 1900
500  out of 1900
501  out of 1900
502  out of 1900
503  out of 1900
504  out of 1900
505  out of 1900
506  out of 1900
507  out of 1900
508  out of 1900
509  out of 1900
510  out of 1900
511  out of 1900
512  out of 1900
513  out of 1900
514  out of 1900
515  out of 1900
516  out of 1900
517  out of 1900
518  out of 1900
519  out of 1900
520  out of 1900
521  out of 1900
522  out of 1900
523  out of 1900
524  out of 1900
525  out of 1900
526  out of 1900
527  out of 1900
528  out of 1900
529  out of 1900
530  out of 1900
531  out of 1900
532  out of 1900
533  out of 1900
534  out of 1900
535  out of 1900
536  out of 1900
537  out of 1900
538  out of 1900
539  out of 1900
540  out of 1900
541  out of 1900
542  out of 1900
543  out of 1900
544  out of 1900
545  out of 1900
546  out of 1900
547  out of 1900
548  out of 1900
549  out of 19

975  out of 1900
976  out of 1900
977  out of 1900
978  out of 1900
979  out of 1900
980  out of 1900
981  out of 1900
982  out of 1900
983  out of 1900
984  out of 1900
985  out of 1900
986  out of 1900
987  out of 1900
988  out of 1900
989  out of 1900
990  out of 1900
991  out of 1900
992  out of 1900
993  out of 1900
994  out of 1900
995  out of 1900
996  out of 1900
997  out of 1900
998  out of 1900
999  out of 1900
1000  out of 1900
1001  out of 1900
1002  out of 1900
1003  out of 1900
1004  out of 1900
1005  out of 1900
1006  out of 1900
1007  out of 1900
1008  out of 1900
1009  out of 1900
1010  out of 1900
1011  out of 1900
1012  out of 1900
1013  out of 1900
1014  out of 1900
1015  out of 1900
1016  out of 1900
1017  out of 1900
1018  out of 1900
1019  out of 1900
1020  out of 1900
1021  out of 1900
1022  out of 1900
1023  out of 1900
1024  out of 1900
1025  out of 1900
1026  out of 1900
1027  out of 1900
1028  out of 1900
1029  out of 1900
1030  out of 1900
1031  out of 1900

1433  out of 1900
1434  out of 1900
1435  out of 1900
1436  out of 1900
1437  out of 1900
1438  out of 1900
1439  out of 1900
1440  out of 1900
1441  out of 1900
1442  out of 1900
1443  out of 1900
1444  out of 1900
1445  out of 1900
1446  out of 1900
1447  out of 1900
1448  out of 1900
1449  out of 1900
1450  out of 1900
1451  out of 1900
1452  out of 1900
1453  out of 1900
1454  out of 1900
1455  out of 1900
1456  out of 1900
1457  out of 1900
1458  out of 1900
1459  out of 1900
1460  out of 1900
1461  out of 1900
1462  out of 1900
1463  out of 1900
1464  out of 1900
1465  out of 1900
1466  out of 1900
1467  out of 1900
1468  out of 1900
1469  out of 1900
1470  out of 1900
1471  out of 1900
1472  out of 1900
1473  out of 1900
1474  out of 1900
1475  out of 1900
1476  out of 1900
1477  out of 1900
1478  out of 1900
1479  out of 1900
1480  out of 1900
1481  out of 1900
1482  out of 1900
1483  out of 1900
1484  out of 1900
1485  out of 1900
1486  out of 1900
1487  out of 1900
1488  out 

1890  out of 1900
1891  out of 1900
1892  out of 1900
1893  out of 1900
1894  out of 1900
1895  out of 1900
1896  out of 1900
1897  out of 1900
1898  out of 1900
1899  out of 1900
71.42105263157895
