In [11]:
import os
import pandas as pd
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
import numpy as np
import matplotlib
matplotlib.use("Qt5Agg")
import matplotlib.pyplot as plt
import random
import PySide6

In [18]:
# import data
df = pd.read_csv("texts/text_data_train.csv")
df['text'] = df['text'].astype(str)

# separate into texts (paragraphs) and labels (authorship)
texts = list(df['text'])
labels = list(df['author_is_TW'])

# parameters
maxlen = 250

# train on 5000 samples
training_samples = 5000

# validate on 2000 samples
validation_samples = 2000

# consider only the top 10k words in the dataset
max_words = 10000

# embedding dimension
embedding_dim = 100

# helper function to add spaces before common punctuation, so that these symbols will be read as their own tokens
def separate_punctuation(txts):
    for t in range(len(txts)):
        txts[t] = txts[t].replace(".", " .")
        txts[t] = txts[t].replace("!", " !")
        txts[t] = txts[t].replace("?", " ?")
        txts[t] = txts[t].replace(":", " :")
        txts[t] = txts[t].replace(";", " ;")
        txts[t] = txts[t].replace(",", " ,")
        txts[t] = txts[t].replace("(", "( ")
        txts[t] = txts[t].replace(")", " )")
        txts[t] = txts[t].replace('"', ' " ')
    return txts


texts = separate_punctuation(texts)

# tokenize using keras built-in tokenizer
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

# convert to sequences
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index   # save index mapping numbers to words
print('Found %s unique tokens.' % len(word_index))

# pad sequences so that each sequence has the same length
data = pad_sequences(sequences, maxlen=maxlen)

# convert labels to numpy array
labels = np.asarray(labels)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

# training data
x_train = data[:training_samples]
y_train = labels[:training_samples]

# validation data
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

Found 25474 unique tokens.


In [19]:
model = Sequential()

# add an embedding layer
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
# flatten the 3D tensor of embeddings into a 2D tensor of shape (samples, maxlen*embedding_dim)
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 250, 100)          1000000   
                                                                 
 flatten_2 (Flatten)         (None, 25000)             0         
                                                                 
 dense_4 (Dense)             (None, 32)                800032    
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 1800065 (6.87 MB)
Trainable params: 1800065 (6.87 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc', 'Precision', 'Recall'])

In [21]:
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
# read in holdout data
df_holdout = pd.read_csv("texts/text_data_holdout.csv")
texts_holdout = list(df_holdout['text'])
labels_holdout = list(df_holdout['author_is_TW'])

# separate punctuation
texts_holdout = separate_punctuation(texts_holdout)

# convert to sequences
holdout_sequences = tokenizer.texts_to_sequences(texts_holdout)

# "pad" sequences so that each sequence has the same length
holdout_data = pad_sequences(holdout_sequences, maxlen=maxlen)

# convert labels (list) to a numpy array
holdout_labels = np.asarray(labels_holdout)

# shuffle the data
indices = np.arange(holdout_data.shape[0])
np.random.shuffle(indices)
x_holdout = holdout_data[indices]
y_holdout = holdout_labels[indices]

In [23]:
# Display sample predictions
for i in range(5):
    indx = random.randint(0, len(x_holdout))
    print('text:', tokenizer.sequences_to_texts([x_holdout[indx]]))
    print('predicted label:', round(model.predict(np.array([x_holdout[indx]]))[0][0], 2))
    print('actual label:', y_holdout[indx])

text: ['happen at the ” “i didn’t see it all one man was badly ” “where ” “here ”']
predicted label: 0.0
actual label: 0
text: ["and he would say gosh miss edith i didn't mean to do nothin' later as the golden sun was waning and there was nothing in the room but the smell of chalk and the heavy of the old october flies they would prepare to depart as he twisted carelessly into his overcoat she would him call him to her arrange the lapels and his necktie and smooth out his hair saying you're a good looking boy"]
predicted label: 0.99
actual label: 1
text: ['and now the city was left behind those familiar faces forms and voices of just six minutes past now seemed as remote as dreams imprisoned there as in another world a world of massive brick and stone and pavements a world of four million lives of hope and fear and hatred of anguish and despair of love of cruelty and devotion that was called berlin and now the land was stroking past the level land of the lonely of the north which he ha

In [24]:
# Calculate model accuracy and loss

holdout_results = model.evaluate(x_holdout, y_holdout)
print('loss on hold-out data:', holdout_results[0])
print('accuracy on hold-out data:', holdout_results[1])

loss on hold-out data: 0.2304934710264206
accuracy on hold-out data: 0.9597146511077881
