# <center>Thesis text generator</center>

A notebook that trains a word-level LSTM model on the text of my thesis on the rhenium dichalcogenides and uses it to generate new text.
- <a href='#Open'>Open</a>
- <a href='#Preprocess'>Preprocess</a>
- <a href='#Tokenize'>Tokenize</a>
- <a href='#Createmodel'>Create model</a>
- <a href='#Plot'>Plot results</a>
- <a href='#Generate'>Generate text</a>

In [1]:
import tensorflow as tf
import numpy as np 
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os

## Open <a id="Open"></a>

In [2]:
# The downloaded thesis is stored in the current working directory.
here = os.getcwd()
# os.path.join picks the platform's separator and does not double it when the
# working directory is the filesystem root (where `here + '/...'` gave '//thesis.pdf').
file = os.path.join(here, 'thesis.pdf')

In [3]:
import urllib.request

# URL of the thesis PDF to download (despite its name, `pdf_path` holds a URL, not a local path)
pdf_path = r"https://purehost.bath.ac.uk/ws/portalfiles/portal/187941844/Lewis_Hart_thesis_final.pdf"
def download_file(download_url, filename):
    """Download `download_url` and write the response body to `filename`.

    Args:
        download_url: URL passed to `urllib.request.urlopen`.
        filename: Path of the file to create or overwrite (opened in binary mode).

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        OSError: If `filename` cannot be opened or written.
    """
    # The context managers close both the response and the output file, even
    # when reading or writing fails part-way through. The URL is opened first,
    # so no output file is created if the request itself fails.
    with urllib.request.urlopen(download_url) as response, open(filename, 'wb') as out_file:
        out_file.write(response.read())
 
# Fetch the thesis PDF from `pdf_path` and save it to the local path in `file`
download_file(pdf_path, file)

In [4]:
# for decoding
import codecs

# using Textract
import textract

# Run textract on the downloaded file; the extracted text comes back as bytes
textract_text = textract.process(file)

# Decode those bytes into a string (UTF-8 with strict error handling,
# which are the defaults of codecs.decode)
texts = codecs.decode(textract_text, 'utf-8', 'strict')

## Preprocessing <a id="Preprocess"></a>

In [5]:
import re
import string

# Index of the first sentence to keep and the number of sentences to keep;
# together they select the introduction section of the thesis.
INTRO_START = 1710
INTRO_LENGTH = 1100

# Make all text lowercase
texts = texts.lower()

# Remove special characters, units and other items that are not words.
# `.` does not match a newline, so each `X.*` alternative deletes from X to the
# end of the line it occurs on.
texts = re.sub(r'cid.*|φ.*|µ.*|ω.*|ν.*|δ.*|χ.*|˚.*|γ.*|ψ.*|π.*|θ.*|±.*|“|”|†.*|∂.*|α.*|¯.*', '', texts)
texts = re.sub(r'.*[0-9](a.*|.*b.*|c.*|.*d.*|e.*|f.*|i.*|d.*)', '', texts)
texts = re.sub(r'(?:res)*([0-9]+)','',texts)
texts = re.sub('cm.*', '', texts)

# Drop leftover tokens that are not words. After split() no token contains
# whitespace, so the stopwords must be bare tokens: space-padded entries such
# as ' c ' could never match and were silently ignored.
texts = texts.split()
stopwords = ['cid', 's', 'c', 'd', 'j', 'h', 'f', 'g']
texts = [w for w in texts if w not in stopwords]

# Re-join the words and split the text into sentences at the full stops
texts = ' '.join(texts)
corpus = texts.split(".")

# Keep only the introduction section of the thesis. A slice builds a new list,
# so re-running this line does not trim the corpus a second time.
corpus = corpus[INTRO_START:INTRO_START + INTRO_LENGTH]

## Tokenize <a id="Tokenize"></a>

In [6]:
# Create the tokenizer; words outside the vocabulary are mapped to '<oov>'
tokenizer = Tokenizer(oov_token='<oov>', num_words=2026)

# Build the word -> index dictionary from the corpus sentences
tokenizer.fit_on_texts(corpus)

# Vocabulary size, plus one because index `0` is reserved for the padding token
total_words = 1 + len(tokenizer.word_index)

print(f'word index dictionary: {tokenizer.word_index}')
print(f'total words: {total_words}')

word index dictionary: {'<oov>': 1, 'the': 2, 'of': 3, 'a': 4, 'in': 5, 'is': 6, 'to': 7, 'and': 8, 'this': 9, 'be': 10, 'that': 11, 'are': 12, 'band': 13, 'can': 14, 'an': 15, 'from': 16, 'as': 17, 'by': 18, 'for': 19, 'these': 20, 'rhenium': 21, 'with': 22, 'monolayer': 23, 'it': 24, 'dichalcogenides': 25, 'raman': 26, 'which': 27, 'there': 28, 'gap': 29, 'crystal': 30, 'using': 31, 'at': 32, 'not': 33, 'arpes': 34, 'paper': 35, 'was': 36, 'energy': 37, 'electronic': 38, 'plane': 39, 'will': 40, 'on': 41, 'has': 42, 'have': 43, 'used': 44, 'electron': 45, 'bulk': 46, 'been': 47, 'i': 48, 'two': 49, 'along': 50, 'et': 51, 'al': 52, 'dft': 53, 'valence': 54, 'n': 55, 'structure': 56, 'state': 57, 'modes': 58, 'chapter': 59, 'rese': 60, 'sample': 61, 'light': 62, 'material': 63, 'also': 64, 'results': 65, 'indirect': 66, 'high': 67, 'work': 68, 'spectroscopy': 69, 'bands': 70, 'crystals': 71, 'materials': 72, 'transition': 73, 'therefore': 74, 'electrons': 75, 'ﬂake': 76, 'calculations'

In [7]:
# Collect every prefix of length >= 2 of every tokenized sentence
input_sequences = []

for line in corpus:

    # Convert the sentence into its list of token ids
    token_list = tokenizer.texts_to_sequences([line])[0]

    # Add the subphrases token_list[:2], token_list[:3], ..., token_list[:len(token_list)]
    input_sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

# Length of the longest subphrase
max_sequence_len = max(map(len, input_sequences))

# Left-pad every subphrase to that common length
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# The last token of each row is the label; everything before it is the input
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the labels
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [8]:
# Take one sentence from the corpus and split it into words
sentence = corpus[1].split()
print(f'sample sentence: {sentence}')

# Map each word to its index in the tokenizer's word index
token_list = [tokenizer.word_index[word] for word in sentence]

# Show the resulting token ids
print(token_list)

sample sentence: ['the', 'reasoning', 'for', 'this', 'name', 'is', 'due', 'to', 'its', 'remarkable', 'properties']
[2, 1195, 19, 9, 1196, 6, 203, 7, 491, 866, 88]


## Create model <a id="Createmodel"></a>

In [9]:
# Hyperparameters
embedding_dim = 100
lstm_units = 150
learning_rate = 0.01

# Assemble the network layer by layer:
# embedding -> bidirectional LSTM -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(total_words, embedding_dim, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(lstm_units)))
model.add(Dense(total_words, activation='softmax'))

# Next-word prediction is a multi-class problem, hence categorical crossentropy
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the model summary
model.summary()

Metal device set to: Apple M1


2022-09-07 13:25:30.105651: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-09-07 13:25:30.105983: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 88, 100)           200500    
                                                                 
 bidirectional (Bidirectiona  (None, 300)              301200    
 l)                                                              
                                                                 
 dense (Dense)               (None, 2005)              603505    
                                                                 
Total params: 1,105,205
Trainable params: 1,105,205
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Number of passes over the training data
epochs = 30

# Fit the model; the returned object records the metrics of every epoch
history = model.fit(x=xs, y=ys, epochs=epochs)

Epoch 1/30


2022-09-07 13:25:30.741770: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-09-07 13:25:31.811111: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-07 13:25:32.141636: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-07 13:25:32.158135: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-07 13:25:32.530723: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-07 13:25:32.555211: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
 94/467 [=====>........................] - ETA: 23s - loss: 2.1367 - accuracy: 0.4977

## Plot results <a id="Plot"></a>

In [None]:
import matplotlib.pyplot as plt

# Plot utility
def plot_graphs(history, string):
    """Plot one recorded training metric against the epoch number.

    Args:
        history: Object whose `.history` dict holds the per-epoch metric
            values (here, the value returned by `model.fit`).
        string: Key of the metric to plot; also used as the y-axis label.
    """
    metric_values = history.history[string]
    plt.plot(metric_values)
    plt.ylabel(string)
    plt.xlabel("Epochs")
    plt.show()

# Show the training curves for accuracy and loss, in that order
for metric in ('accuracy', 'loss'):
    plot_graphs(history, metric)

## Generate text <a id="Generate"></a>

In [None]:
# Define seed text
seed_text = "rese is a"

# Define total words to predict
next_words = 5

# Greedily append the most likely next word, one word per iteration
for _ in range(next_words):

    # Convert the current text to a token sequence
    token_list = tokenizer.texts_to_sequences([seed_text])[0]

    # Left-pad (or truncate) to the input length the model was built with
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # Feed to the model and get the probabilities for each index
    probabilities = model.predict(token_list)

    # Get the index with the highest probability
    predicted = np.argmax(probabilities, axis=-1)[0]

    # Index 0 is the padding token, not a word. Nothing would be appended, so
    # the input stays the same and every remaining iteration would repeat this
    # exact prediction; stop here instead of re-running the model.
    if predicted == 0:
        break

    # Look up the word associated with the index and append it to the text
    output_word = tokenizer.index_word[predicted]
    seed_text += " " + output_word

# Print the result
print(seed_text)