In [1]:
import os
import pandas as pd
import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
import numpy as np
import random
import PySide6

In [2]:
# hyperparameters for the first (dense) model
maxlen = 250                # length every sequence is padded/truncated to
training_samples = 5000     # number of rows used for training
validation_samples = 2000   # number of rows used for validation
max_words = 10000           # vocabulary cap: only the 10k most frequent words
embedding_dim = 100         # size of each embedding vector

# load the training CSV and force the text column to str
df = pd.read_csv("text_data/text_data_train.csv")
df['text'] = df['text'].astype(str)

# texts are the paragraphs, labels the authorship flag
texts, labels = list(df['text']), list(df['author_is_TW'])

# helper: pad common punctuation with spaces so each mark is read as its own token
def separate_punctuation(txts):
    """Insert spaces around punctuation in every string of ``txts``.

    A space is added before ``. ! ? : ; , )``, after ``(``, and on both
    sides of ``"``. The sequence is rewritten in place and the very same
    object is returned.

    Args:
        txts: mutable sequence of strings.

    Returns:
        ``txts`` itself, with each element replaced by its spaced-out form.
    """
    # (punctuation mark, replacement) pairs, applied in this order
    spacing_rules = (
        (".", " ."),
        ("!", " !"),
        ("?", " ?"),
        (":", " :"),
        (";", " ;"),
        (",", " ,"),
        ("(", "( "),
        (")", " )"),
        ('"', ' " '),
    )
    for pos, passage in enumerate(txts):
        for mark, spaced in spacing_rules:
            passage = passage.replace(mark, spaced)
        txts[pos] = passage
    return txts


# space out punctuation in the training texts
texts = separate_punctuation(texts)

# fit the keras built-in tokenizer, keeping only the max_words most frequent words
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

# convert each text to a list of integer word ids
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index   # dict mapping each word to its integer id
print('Found %s unique tokens.' % len(word_index))

# pad sequences so that each sequence has the same length (maxlen)
data = pad_sequences(sequences, maxlen=maxlen)

# convert labels to numpy array
labels = np.asarray(labels)

# Shuffle rows and labels together before splitting. A dedicated, seeded
# generator makes the train/validation split identical on every run and
# leaves NumPy's global random state untouched.
split_rng = np.random.default_rng(42)
indices = split_rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

# training data: the first training_samples shuffled rows
x_train = data[:training_samples]
y_train = labels[:training_samples]

# validation data: the next validation_samples shuffled rows
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

Found 25474 unique tokens.


In [4]:
# dense baseline: embedding -> flatten -> one hidden layer -> sigmoid output
model = Sequential([
    # one embedding_dim-sized vector per token id, for sequences of length maxlen
    Embedding(max_words, embedding_dim, input_length=maxlen),
    # flatten the 3D tensor of embeddings into a 2D tensor of shape (samples, maxlen*embedding_dim)
    Flatten(),
    Dense(32, activation='relu'),
    # single sigmoid unit for the binary label
    Dense(1, activation='sigmoid'),
])

In [5]:
# binary cross-entropy loss with RMSprop; track accuracy, precision and recall
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc', 'Precision', 'Recall'],
)

In [6]:
# train for 10 epochs in batches of 32, scoring the validation split as well
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=32,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# read in holdout data
df_holdout = pd.read_csv("text_data/text_data_holdout.csv")
# Cast to str, as is done for the training texts: a missing value is read
# as a float NaN, which has no .replace() and would crash separate_punctuation.
texts_holdout = list(df_holdout['text'].astype(str))
labels_holdout = list(df_holdout['author_is_TW'])

# separate punctuation
texts_holdout = separate_punctuation(texts_holdout)

# convert to sequences with the tokenizer that was fitted on the training texts
holdout_sequences = tokenizer.texts_to_sequences(texts_holdout)

# "pad" sequences so that each sequence has the same length
holdout_data = pad_sequences(holdout_sequences, maxlen=maxlen)

# convert labels (list) to a numpy array
holdout_labels = np.asarray(labels_holdout)

# shuffle the data (rows and labels with the same permutation)
indices = np.arange(holdout_data.shape[0])
np.random.shuffle(indices)
x_holdout = holdout_data[indices]
y_holdout = holdout_labels[indices]

In [8]:
# Display sample predictions for five randomly chosen holdout rows
for i in range(5):
    # randrange excludes its upper bound; random.randint(0, n) can return n
    # itself, which would index one past the end of x_holdout
    indx = random.randrange(len(x_holdout))
    # decode the padded id sequence back into text for display
    print('text:', tokenizer.sequences_to_texts([x_holdout[indx]]))
    # predict on a batch of one and round the single sigmoid output to 2 decimals
    print('predicted label:', round(model.predict(np.array([x_holdout[indx]]))[0][0], 2))
    print('actual label:', y_holdout[indx])

text: ["the editor of the local paper heard of it and sent a reporter to interview him and printed a story about it so you've written a book said the reporter what kind of a book is it what's it about why i i hardly know how to tell you george stammered"]
predicted label: 1.0
actual label: 1
text: ['who are you he said hoarsely holding a hairy hand carefully beside his mouth prince hal said eugene likewise hoarsely and behind his hand']
predicted label: 1.0
actual label: 1
text: ["these jews she cried such things would never happen if it were not for them they make all the trouble germany has had to protect herself the jews were taking all the money from the country thousands of them escaped taking millions of marks with them and now when it's too late we wake up to it it's too bad that foreigners must see these things that they've got to go through these painful experiences it makes a bad impression they don't understand the reason but it's the jews she whispered"]
predicted label: 1.

In [9]:
# Score the model on the holdout set; the first two entries of the result
# are reported as loss and accuracy

holdout_results = model.evaluate(x_holdout, y_holdout)
holdout_loss, holdout_acc = holdout_results[0], holdout_results[1]
print('loss on hold-out data:', holdout_loss)
print('accuracy on hold-out data:', holdout_acc)

loss on hold-out data: 0.2253234088420868
accuracy on hold-out data: 0.9605539441108704


In [11]:
# second experiment: larger settings than the first model
maxlen = 300          # length every sequence is padded/truncated to
max_words = 20000     # vocabulary cap: only the 20k most frequent words
embedding_dim = 150   # higher embedding dimension

# re-read the training CSV (the earlier `texts` list was already rewritten
# in place by separate_punctuation) and force the text column to str
df = pd.read_csv("text_data/text_data_train.csv")
df['text'] = df['text'].astype(str)

# texts are the paragraphs, labels the authorship flag
texts, labels = list(df['text']), list(df['author_is_TW'])

# tokenize the text and convert it to numerical sequences using the keras built-in tokenizer,
# but this time don't filter out punctuation (potentially informative)

# helper: pad common punctuation with spaces so each mark is read as its own token
# (re-declares the helper of the same name from an earlier cell)
def separate_punctuation(txts):
    """Insert spaces around punctuation in every string of ``txts``.

    A space is added before ``. ! ? : ; , )``, after ``(``, and on both
    sides of ``"``. The sequence is rewritten in place and the very same
    object is returned.

    Args:
        txts: mutable sequence of strings.

    Returns:
        ``txts`` itself, with each element replaced by its spaced-out form.
    """
    # Each mark maps to itself plus spaces only, so one translate pass gives
    # the same text as replacing the marks one after another.
    spaced = str.maketrans({
        ".": " .",
        "!": " !",
        "?": " ?",
        ":": " :",
        ";": " ;",
        ",": " ,",
        "(": "( ",
        ")": " )",
        '"': ' " ',
    })
    for pos in range(len(txts)):
        txts[pos] = txts[pos].translate(spaced)
    return txts


# space out punctuation in the training texts
texts = separate_punctuation(texts)

# Custom filter list: compared with the tokenizer used for the first model,
# punctuation is no longer filtered out, so the spaced-out marks stay as tokens.
tokenizer = Tokenizer(num_words=max_words,
                      filters='#$%*+-/<=>@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(texts)

# convert each text to a list of integer word ids
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index   # dict mapping each word to its integer id
print('Found %s unique tokens.' % len(word_index))

# "pad" sequences so that each sequence has the same length (maxlen)
data = pad_sequences(sequences, maxlen=maxlen)

# convert labels (list) to a numpy array
labels = np.asarray(labels)

# Shuffle rows and labels together before splitting. A dedicated, seeded
# generator makes the train/validation split identical on every run and
# leaves NumPy's global random state untouched.
split_rng = np.random.default_rng(42)
indices = split_rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

# training data: the first training_samples shuffled rows
x_train = data[:training_samples]
y_train = labels[:training_samples]

# validation data: the next validation_samples shuffled rows
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

Found 25616 unique tokens.


In [16]:
from keras.layers import LSTM
from keras.layers import Dropout

# recurrent model: embedding -> dropout -> LSTM -> dropout -> sigmoid output
model = Sequential([
    # one embedding_dim-sized vector per token id, for sequences of length maxlen
    Embedding(max_words, embedding_dim, input_length=maxlen),
    # dropout between layers (to help slow overfitting by introducing random noise)
    Dropout(0.15),
    # an LSTM ("Long Short-Term Memory") layer rather than a Dense layer; this gives
    # the model memory, turning it into an "RNN" (Recurrent Neural Network). Another
    # common layer type for this purpose is the "GRU" (Gated Recurrent Unit).
    LSTM(32),
    Dropout(0.15),
    # single sigmoid unit for the binary label
    Dense(1, activation='sigmoid'),
])


# compile with binary cross-entropy and Keras' Adam optimizer (learning rate 1e-3);
# track accuracy, precision and recall
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['acc', 'Precision', 'Recall'],
)

# early stopping rule: end training once validation accuracy ('val_acc') has
# gone 5 epochs without improving, then restore the best weights seen
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_acc',
    mode='max',
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

# train for up to 25 epochs in batches of 32; the early-stopping callback may end the run sooner
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    callbacks=[early_stopping],
    batch_size=32,
    epochs=25,
)



Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 8: early stopping


In [17]:
# read in holdout data
df_holdout = pd.read_csv("text_data/text_data_holdout.csv")
# Cast to str, as is done for the training texts: a missing value is read
# as a float NaN, which has no .replace() and would crash separate_punctuation.
texts_holdout = list(df_holdout['text'].astype(str))
labels_holdout = list(df_holdout['author_is_TW'])

# separate punctuation
texts_holdout = separate_punctuation(texts_holdout)

# convert to sequences with the tokenizer that was fitted on the training texts
holdout_sequences = tokenizer.texts_to_sequences(texts_holdout)

# "pad" sequences so that each sequence has the same length
holdout_data = pad_sequences(holdout_sequences, maxlen=maxlen)

# convert labels (list) to a numpy array
holdout_labels = np.asarray(labels_holdout)

# shuffle the data (rows and labels with the same permutation)
indices = np.arange(holdout_data.shape[0])
np.random.shuffle(indices)
x_holdout = holdout_data[indices]
y_holdout = holdout_labels[indices]

# Show predictions for a random sample of five holdout rows
for i in range(5):
    # randrange excludes its upper bound; random.randint(0, n) can return n
    # itself, which would index one past the end of x_holdout
    indx = random.randrange(len(x_holdout))
    # decode the padded id sequence back into text for display
    print('text:', tokenizer.sequences_to_texts([x_holdout[indx]]))
    # predict on a batch of one and round the single sigmoid output to 2 decimals
    print('predicted label:', round(model.predict(np.array([x_holdout[indx]]))[0][0], 2))
    print('actual label:', y_holdout[indx])

# Score the model on the holdout set; the first two entries of the result
# are reported as loss and accuracy
holdout_results = model.evaluate(x_holdout, y_holdout)
holdout_loss, holdout_acc = holdout_results[0], holdout_results[1]
print('loss on hold-out data:', holdout_loss)
print('accuracy on hold-out data:', holdout_acc)

text: ['“how are yours ?” .” “let’s see them .” “they’re packed .” “how big are they really ?” “they’re all about the size of your smallest .” “you’re not holding out on me ?” “i wish i were .” “get them all on worms ?” “yes .” “you lazy bum !”']
predicted label: 0.0
actual label: 0
text: ["'i been thinking a lot tonight ,' her dad said . he poured out his beer and sprinkled a few of salt on the back of his hand . then he licked up the salt and took a swallow out of the glass ."]
predicted label: 0.0
actual label: 0
text: ['it was a photograph of half a dozen young men in loafing in an through which were visible a host of spires . there was gatsby , looking a little , not much , a bat in his hand . then it was all true . i saw the skins of flaming in his palace on the grand canal ; i saw him opening a chest of rubies to ease , with their crimson lighted depths , the of his broken heart .']
predicted label: 0.09
actual label: 0
text: ['“it’s all right ,” i said . “i don’t give a damn an