In [1]:
import pandas as pd

## Load and clean data

In [12]:
df = pd.read_csv("adventure_game.csv")
df = df.drop(columns=["Consists of", "[hide]Images"])
df = df.drop([0])
df = df.dropna(how='all')
df.head(10)

Unnamed: 0,Structure name,Description
1,1x1_a1,A room that is empty except for a bench on whi...
2,"(""Flower room"")",
9,1x1_a2,A room with a cobblestone ring around the wall...
15,"(""Rails and ladder room"")",
16,1x1_a3,A room with two types of tables made out of co...
22,"(""Office"")",
23,1x1_a4,A room with a checkerboard pattern on the floo...
29,"(""Checkerboard room"")",
30,1x1_a5,A room with cobblestone and a single flower po...
36,"(""White tulip sanctuary room"")",


In [14]:
df["Description"] = df["Description"].shift(periods=1)
df = df.dropna()

Unnamed: 0,Structure name,Description
1,1x1_a1,
2,"(""Flower room"")",A room that is empty except for a bench on whi...
9,1x1_a2,
15,"(""Rails and ladder room"")",A room with a cobblestone ring around the wall...
16,1x1_a3,
22,"(""Office"")",A room with two types of tables made out of co...
23,1x1_a4,
29,"(""Checkerboard room"")",A room with a checkerboard pattern on the floo...
30,1x1_a5,
36,"(""White tulip sanctuary room"")",A room with cobblestone and a single flower po...


In [16]:
df['Structure name'] = df['Structure name'].map(lambda x: x.lstrip('("').rstrip('")'))
df

Unnamed: 0,Structure name,Description
2,Flower room,A room that is empty except for a bench on whi...
15,Rails and ladder room,A room with a cobblestone ring around the wall...
22,Office,A room with two types of tables made out of co...
29,Checkerboard room,A room with a checkerboard pattern on the floo...
36,White tulip sanctuary room,A room with cobblestone and a single flower po...
43,X room,A secret room with walls lined with cobbleston...
50,Spider room,A secret room filled with many cobwebs and a s...
57,Obsidian room,A secret room with an octahedron-shaped mass o...
64,Birch pillar room,A secret room with 4 birch plank pillars that ...
71,Birch arch,An office-like room with a desk with a flower ...


In [17]:
df.shape

(52, 2)

## Now that data is clean and ready, work on initial LSTM model

In [18]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

Using TensorFlow backend.


In [29]:
# load the data

text = df["Description"].str.cat(sep=' ')
text = text.lower()

In [30]:
# creating character/word mappings

characters = sorted(list(set(text)))
n_to_char = {n:char for n, char in enumerate(characters)}
char_to_n = {char:n for n, char in enumerate(characters)}

In [32]:
# preprocessing

X = []
Y = []
length = len(text)
seq_length = 100
for i in range(0, length-seq_length, 1):
    sequence = text[i:i + seq_length]
    label =text[i + seq_length]
    X.append([char_to_n[char] for char in sequence])
    Y.append(char_to_n[label])

In [33]:
X_modified = np.reshape(X, (len(X), seq_length, 1))
X_modified = X_modified / float(len(characters))
Y_modified = np_utils.to_categorical(Y)

In [36]:
# modeling

model = Sequential()
model.add(LSTM(400, input_shape=(X_modified.shape[1], X_modified.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(400, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(400))
model.add(Dropout(0.2))
model.add(Dense(Y_modified.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

In [37]:
model.fit(X_modified, Y_modified, epochs=100, batch_size=50)

model.save_weights('text_generator_400_0.2_400_0.2_400_0.2_100.h5')

W1023 22:55:58.471351 4350342592 deprecation.py:323] From /Users/ljohnson/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [39]:
string_mapped = X[98]
full_string = [n_to_char[value] for value in string_mapped]
# generating characters
for i in range(200):
    x = np.reshape(string_mapped,(1,len(string_mapped), 1))
    x = x / float(len(characters))

    pred_index = np.argmax(model.predict(x, verbose=0))
    seq = [n_to_char[value] for value in string_mapped]
    full_string.append(n_to_char[pred_index])

    string_mapped.append(pred_index)
    string_mapped = string_mapped[1:len(string_mapped)]

In [91]:
#combining text

txt=""
for char in full_string:
    txt = txt+char
    
txt    

'ifferent flowers. a room with a cobblestone ring around the walls, with rails on top of it, and ladd of the cenler ald a chest and a chest and a chest and a chest and a chest and a chest and a chest and a chest and a chest and a chest and a chest and a chest and a chest and a chest and a chest and a'

## Trying a different method

In [92]:
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [93]:
# preprocess function
# make all lowercase
# tokenize
# remove stop words

def tokenize_words(input):
    # lowercase everything to standardize it
    input = input.lower()

    # instantiate the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # if the created token isn't in the stop words, make it part of "filtered"
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return " ".join(filtered)

In [119]:
# load data
text = df["Description"].str.cat(sep=' ')

# preprocess the input data using tokenize function
processed_text = tokenize_words(text)

In [120]:
# sort unique characters, then use enumerate to get numbers
# create a dictionary for the characters and their equivalent numbers

text_chars = sorted(list(set(processed_text)))
char_to_num = dict((c, i) for i, c in enumerate(text_chars))

In [121]:
# store variables for use later

input_len_text = len(processed_text)
vocab_len_text = len(text_chars)
print ("Total number of characters in text:", input_len_text)
print ("Total vocab in text:", vocab_len_text)

Total number of characters in text: 5228
Total vocab in text: 31


In [123]:
# set sequence length (one complete mapping of inputs characters as integers)

seq_length = 100
x_data = []
y_data = []

In [124]:
# convert all characters in input to numbers

# loop through inputs, start at the beginning and go until we hit the final character 
# we can create a sequence out of

for i in range(0, input_len_text - seq_length, 1):
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = processed_text[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = processed_text[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [125]:
# check total number of sequences

n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 5128


In [126]:
# convert to numpy array, then make sure values are floats

X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len_text)

In [127]:
# one hot encode the labeled data

y = np_utils.to_categorical(y_data)

In [152]:
# create LSTM model and add layers, adding dropout to prevent overfitting

model = Sequential()
model.add(LSTM(400, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(400, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(400))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [153]:
# compile model, now ready for training

model.compile(loss='categorical_crossentropy', optimizer='adam')

In [154]:
# create checkpoints 

filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [155]:
# fit model and let it train

model.fit(X, y, epochs=50, batch_size=256, callbacks=desired_callbacks)

Epoch 1/50

Epoch 00001: loss improved from inf to 3.06922, saving model to model_weights_saved.hdf5
Epoch 2/50

Epoch 00002: loss improved from 3.06922 to 2.92133, saving model to model_weights_saved.hdf5
Epoch 3/50

Epoch 00003: loss improved from 2.92133 to 2.90568, saving model to model_weights_saved.hdf5
Epoch 4/50

Epoch 00004: loss improved from 2.90568 to 2.89864, saving model to model_weights_saved.hdf5
Epoch 5/50

Epoch 00005: loss did not improve from 2.89864
Epoch 6/50

Epoch 00006: loss improved from 2.89864 to 2.89524, saving model to model_weights_saved.hdf5
Epoch 7/50

Epoch 00007: loss improved from 2.89524 to 2.89315, saving model to model_weights_saved.hdf5
Epoch 8/50

Epoch 00008: loss did not improve from 2.89315
Epoch 9/50

Epoch 00009: loss improved from 2.89315 to 2.89114, saving model to model_weights_saved.hdf5
Epoch 10/50

Epoch 00010: loss improved from 2.89114 to 2.88980, saving model to model_weights_saved.hdf5
Epoch 11/50

Epoch 00011: loss improved from 

<keras.callbacks.History at 0xb3f6386a0>

In [156]:
# load weights back in and recompile model

filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [157]:
# convert output back to characters

num_to_char = dict((i, c) for i, c in enumerate(text_chars))

In [158]:
# start with random seed to generate sequence of characters from
# create 150 room descriptions from random seed

descriptions = []
for _ in range(150):
    start = numpy.random.randint(0, len(x_data) - 1)
    pattern = x_data[start]
    description = ''.join([num_to_char[value] for value in pattern])
    descriptions.append(description)  

In [159]:
# check out results

print(descriptions[0:10])

['ccess top wool ring trapped chest surrounded two tnt blocks containing two ender pearls trap sprung ', 'irs two chandeliers hang ceiling two vindicators evoker always generate large library study room thr', 'enerates room bunch arches made dark oak logs chest exists room end containing loot vindicator alway', 'eon otherwise empty room containing illager pixel art stairs leading decorated illager head made woo', 'h 5 flower pots placed containing different flowers room cobblestone ring around walls rails top lad', ' vindicator always generates room spiral staircase 1 wide dead end sloping curving hallway leads sin', 'potted alliums full cauldron chest containing alliums sit nearby roof made oak fences altar like roo', 'rner chandelier hangs ceiling long bedroom pink purple beds towards rear table flower pot corner two', 'op ladders placed walls ring single carved pumpkin sits wall facing door room two types tables made ', 'fountain surrounded andesite opposite corner dark oak tree lar

In [161]:
# save list of descriptions as text file to use in adventure game

with open("descriptions.txt", "w") as output:
    output.write(str(descriptions))
    
import json  

with open("desc_json.txt", "w") as output:
    json.dump(descriptions, output)    

# Now repeat process with names

In [162]:
# load data
names = df["Structure name"].str.cat(sep=' ')

# preprocess data using tokenize function from above
processed_names = tokenize_words(names)

In [163]:
# sort unique characters, then use enumerate to get numbers
# create a dictionary for the characters and their equivalent numbers

name_chars = sorted(list(set(processed_names)))
char_to_num_name = dict((c, i) for i, c in enumerate(name_chars))

In [164]:
# store variables for use later

input_len_names = len(processed_names)
vocab_len_names = len(name_chars)
print ("Total number of characters in names:", input_len_names)
print ("Total vocab in names:", vocab_len_names)

Total number of characters in names: 817
Total vocab in names: 25


In [198]:
# set sequence length (one complete mapping of inputs characters as integers)

seq_length = 50
x_data = []
y_data = []

In [220]:
# convert all characters in input to numbers

# loop through inputs, start at the beginning and go until we hit the final character 
# we can create a sequence out of

for i in range(0, input_len_names - seq_length, 1):
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = processed_names[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = processed_names[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num_name[char] for char in in_seq])
    y_data.append(char_to_num_name[out_seq])

In [221]:
# check total number of sequences

n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 1534


In [222]:
# convert to numpy array, then make sure values are floats

X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len_names)

In [223]:
# one hot encode the labeled data

y = np_utils.to_categorical(y_data)

In [224]:
# create LSTM model and add layers, adding dropout to prevent overfitting

model = Sequential()
model.add(LSTM(400, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(400, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(400))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [225]:
# compile model, now ready for training

model.compile(loss='categorical_crossentropy', optimizer='adam')

In [226]:
# create checkpoints 

filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [227]:
# fit model and let it train

model.fit(X, y, epochs=50, batch_size=256, callbacks=desired_callbacks)

Epoch 1/50

Epoch 00001: loss improved from inf to 3.25520, saving model to model_weights_saved.hdf5
Epoch 2/50

Epoch 00002: loss improved from 3.25520 to 3.07743, saving model to model_weights_saved.hdf5
Epoch 3/50

Epoch 00003: loss improved from 3.07743 to 3.05503, saving model to model_weights_saved.hdf5
Epoch 4/50

Epoch 00004: loss improved from 3.05503 to 3.03498, saving model to model_weights_saved.hdf5
Epoch 5/50

Epoch 00005: loss improved from 3.03498 to 3.02531, saving model to model_weights_saved.hdf5
Epoch 6/50

Epoch 00006: loss improved from 3.02531 to 3.01464, saving model to model_weights_saved.hdf5
Epoch 7/50

Epoch 00007: loss improved from 3.01464 to 2.97514, saving model to model_weights_saved.hdf5
Epoch 8/50

Epoch 00008: loss improved from 2.97514 to 2.94979, saving model to model_weights_saved.hdf5
Epoch 9/50

Epoch 00009: loss improved from 2.94979 to 2.89701, saving model to model_weights_saved.hdf5
Epoch 10/50

Epoch 00010: loss improved from 2.89701 to 2.8

<keras.callbacks.History at 0x1a4e380198>

In [228]:
# load weights back in and recompile model

filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [229]:
# convert output back to characters

num_to_char = dict((i, c) for i, c in enumerate(name_chars))

In [307]:
# start with random seed to generate sequence of characters from
# create 150 room names from random seed

start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start]
print("Random Seed:")
space = ''
name = space.join([num_to_char[value] for value in pattern])
print(name[0:20])

Random Seed:
nding stairway room 


In [247]:
room_list = ["farm small empty", "obsidian room birch", "iple bed bedroom", "room large dining", "room single bed", "room x room spider", "redstone jail", "x room spider", "large dining room", "flower room rails", "room illager", "bed bedroom", "statue room nature", "tulip sanctuary", "ference room large", "small jail wood", "room obsidian", "stairway room", ]


 bedroom medium library straight stairs room maste


In [190]:
# save list of descriptions as text file to use in adventure game
import json  

with open("names.txt", "w") as output:
    json.dump(room_names, output)