In [3]:
import string
import re
import os

from glob import glob
from pycontractions import Contractions

In [4]:
# Root directory for on-disk data; the word2vec vectors file used for contraction
# expansion is looked up directly under it.
DATA_PATH = 'data'

In [12]:
# Build the contraction expander, pointing it at the word2vec binary under DATA_PATH.
# NOTE(review): pycontractions is third-party; presumably the vectors are used to pick
# between ambiguous expansions -- confirm against its documentation.
cont = Contractions(w2v_path=os.path.join(DATA_PATH, 'GoogleNews-vectors-negative300.bin.gz'))
# Load the models once up front so later expand_texts() calls can reuse them.
# NOTE(review): likely slow and memory-hungry for a file of this kind -- confirm.
cont.load_models()

In [20]:
# Shortcut expansions for player commands.
#
# Keys are regex patterns and values are re.sub replacement templates. They are applied
# one after another, in insertion order, to each lower-cased command. Raw strings are
# used so that `\.` and `\g<1>` reach the regex engine verbatim instead of relying on
# Python passing unknown string escapes through (which emits a warning and is deprecated).
commands = {
    r'^x (.+)$': r'examine \g<1>',
    # Only the last "." is turned into a space, because the first group is greedy.
    r'^(.+)\.(.+)$': r'\g<1> \g<2>',
    r'^m$': 'commands',
    r'^l$': 'look',
    # Two-letter diagonals.
    r'^nw$': 'northwest',
    r'^sw$': 'southwest',
    r'^ne$': 'northeast',
    r'^se$': 'southeast',
    # Single-letter directions.
    r'^w$': 'west',
    r'^e$': 'east',
    r'^n$': 'north',
    r'^s$': 'south',
    r'^u$': 'up',
    r'^d$': 'down',
    # Other single-letter verbs.
    r'^i$': 'inventory',
    r'^z$': 'wait',
    r'^g$': 'again',
}

def _read_transcript(infile):
    """Return the whole text of ``infile``.

    UTF-8 is tried first; on a decode error the reason is printed and the file is
    re-read as ISO-8859-2.
    """
    try:
        with open(infile, 'r', encoding='utf-8') as inf:
            return inf.read()
    except UnicodeDecodeError as e:
        print('Could not parse unicode: ' + e.reason)
        # NOTE: iso-8859-2 is Latin-2 (an earlier comment labelled it Latin-1). The codec
        # itself is kept unchanged so previously parsed output stays reproducible.
        with open(infile, 'r', encoding='iso-8859-2') as inf:
            return inf.read()


def _expand_shortcuts(command):
    """Apply every pattern of the module-level ``commands`` table to ``command``, in order."""
    for pattern, repl in commands.items():
        command = re.sub(pattern, repl, command)
    return command


def _join_scene(scene):
    """Join buffered scene lines with spaces, replacing each double space with a single one."""
    return ' '.join(scene).replace('  ', ' ')


def parse_transcript(infile, outfile, contractions=None, keep_caret=False):
    """Convert a raw "Floyd" transcript into alternating scene / command lines.

    Only lines prefixed with ``Floyd |`` are kept. Consecutive game-output lines are
    lower-cased and merged into one scene line; each player command is lower-cased, has
    its shortcuts expanded through the module-level ``commands`` table, and is written on
    the line after the scene that preceded it. A trailing scene with no command after it
    is written last, without a final newline.

    Args:
        infile: Path of the raw transcript to read.
        outfile: Path of the parsed file to write (UTF-8). It is written only after
            parsing has succeeded, so a failed parse does not leave a truncated file.
        contractions: Optional object exposing ``expand_texts(lines, precise=True)``;
            when given, every line is passed through it before parsing.
        keep_caret: When true, commands are written with a leading ``"> "``.

    Raises:
        OSError: If ``infile`` cannot be opened or ``outfile`` cannot be written
            (for example ``FileNotFoundError`` for a missing transcript).
    """
    content = _read_transcript(infile)

    # Set when the game was loaded with "load sleepmask"; in that case external
    # commands are already echoed inline and must not be rewritten below.
    sleepmask = False

    # Skip everything up to the last load attempt (if none, assume the game is already loaded).
    load = None
    for load in re.finditer(r'say(?:s)? \(to Floyd\), "(?:load( sleepmask)?) (?:.+)"',
                            content, flags=re.IGNORECASE):
        pass
    if load:
        if load.group(1):
            sleepmask = True
        content = content[load.end():]

    # Remove "version" commands together with their output, up to the next empty prompt.
    content = re.sub(r'say(?:s)? \(to Floyd\), "version".*?Floyd \| >\n', '', content,
                     flags=re.IGNORECASE | re.DOTALL)

    # Rewrite external `<user> says (to Floyd), "<cmd>"` lines as inline prompts.
    # In the template, \g<1> and \n are interpreted by re.sub itself.
    if not sleepmask:
        content = re.sub(r'(?:.+) say(?:s)? \(to Floyd\), "(.+)"', r'Floyd | > \g<1>\n', content,
                         flags=re.IGNORECASE)
    content = content.split('\n')

    # Expand contractions line by line.
    if contractions:
        content = contractions.expand_texts(content, precise=True)

    ##############################################################
    ### From here on, we consider everything part of the game. ###
    ##############################################################
    print('Parsing gameplay...')

    dest = []
    scene = []
    about = False
    for line in content:
        # Inside an about/help screen: drop everything until the explicit "q" command
        # or the first empty prompt.
        if about:
            if re.match(r'^Floyd \| >(?: q)?$', line, flags=re.IGNORECASE):
                print('Line [{}], about=False'.format(line))
                about = False
            continue
        if re.match(r'^Floyd \| > (about|help)$', line, flags=re.IGNORECASE):
            print('Line [{}], about=True'.format(line))
            about = True
            continue

        # Drop blank prompts and empty output lines.
        if re.match(r'^Floyd \| >$|^Floyd \|$', line):
            continue
        # Drop "press <key> to continue" output lines. The pattern has to account for the
        # "Floyd | " prefix: matching the bare phrase at the start of the line can never
        # succeed on game output. Player commands ("Floyd | > ...") are left alone.
        if re.match(r'^Floyd \| (?!>).*press (?:.+) to continue', line, flags=re.IGNORECASE):
            continue

        # Game output: buffer it as part of the current scene. The capture stops at the
        # first ">" or "#", so prompt lines and "#" lines never match here.
        match = re.match(r'^Floyd \| ([^>#\n]+)', line, flags=re.IGNORECASE)
        if match:
            scene.append(match.group(1).strip().lower())

        # Player command: emit the buffered scene followed by the command.
        match = re.match(r'^Floyd \| > (.+)', line, flags=re.IGNORECASE)
        if match:
            command = _expand_shortcuts(match.group(1).strip().lower())
            if keep_caret:
                command = '> ' + command

            if scene:
                dest.append('{}\n{}\n'.format(_join_scene(scene), command))
            else:
                dest.append('{}\n'.format(command))
            scene = []

    # Append the very last scene (no command follows it).
    if scene:
        dest.append(_join_scene(scene))

    with open(outfile, 'w', encoding='utf-8') as outf:
        outf.writelines(dest)
    print('Done.')

In [21]:
# Parse every raw transcript under DATA_PATH/transcripts into DATA_PATH/parsed.
transcripts_dir = os.path.join(DATA_PATH, 'transcripts')
parsed_dir = os.path.join(DATA_PATH, 'parsed')
# parse_transcript opens its output for writing, which fails if the directory is missing.
os.makedirs(parsed_dir, exist_ok=True)

transcripts = sorted(glob(os.path.join(transcripts_dir, '*.txt')))
for transcript in transcripts:
    print('Parsing file: [{}]'.format(transcript))
    # Derive the output name from the file name only ("raw-foo-bar.txt" -> "parsed-foo-bar.txt"),
    # so a dash in a directory name cannot leak into it; a name without a dash is kept whole
    # instead of collapsing to an empty suffix.
    filename = os.path.basename(transcript)
    suffix = filename.partition('-')[2] or filename
    parsed = os.path.join(parsed_dir, 'parsed-{}'.format(suffix))
    parse_transcript(transcript, parsed, contractions=cont)

Parsing file: [data/transcripts/raw-chineseroom.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-groovebillygoat.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-gdc09.txt]
Parsing gameplay...
Line [Floyd | > help], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts/raw-varkana.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-finalexam20160124.txt]
Parsing gameplay...
Line [Floyd | > HELP], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts/raw-newernewyear.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-recluse.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-sinsagainstmimesis.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-fdb-tin-folkar.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-galatea.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-mite.txt]
Parsing gameplay...
Line [Floyd |

Done.
Parsing file: [data/transcripts/raw-simplethefts.txt]
Parsing gameplay...
Line [Floyd | > help], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts/raw-primrose-edited.txt]
Parsing gameplay...
Line [Floyd | > about], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts/raw-allthingsdevours.txt]
Parsing gameplay...
Line [Floyd | > about], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts/raw-tacofiction.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-pathway.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-unscientific.txt]
Parsing gameplay...
Line [Floyd | > help], about=True
Line [Floyd | >], about=False
Line [Floyd | > help], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts/raw-samfortune.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-hollywoodvisionary-part2.txt]
Parsing gameplay...
Done.
Parsing file: [data/t

Line [Floyd | > ABOUT], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts/raw-hollywoodvisionary-part1.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-saugusnet-b.txt]
Parsing gameplay...
Line [Floyd | > help], about=True
Line [Floyd | > q], about=False
Done.
Parsing file: [data/transcripts/raw-introcomp11.txt]
Could not parse unicode: invalid continuation byte
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-rogue.txt]
Parsing gameplay...
Line [Floyd | > about], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts/raw-bellwater.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-stf.txt]
Parsing gameplay...
Line [Floyd | > about], about=True
Line [Floyd | >], about=False
Line [Floyd | > help], about=True
Line [Floyd | >], about=False
Line [Floyd | > about], about=True
Line [Floyd | >], about=False
Line [Floyd | > help], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [dat

Done.
Parsing file: [data/transcripts/raw-halothane.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-dialcforcupcakes-103014.txt]
Parsing gameplay...
Line [Floyd | > ABOUT], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts/raw-plan6-waker.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-everybodydies.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-bse.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-snacktime.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts/raw-moonlittower.txt]
Parsing gameplay...
Line [Floyd | > about], about=True
Line [Floyd | > q], about=False
Done.


In [10]:
# Smoke test: parse a single transcript without contraction expansion
# (the `contractions` argument is left at its default of None).
# NOTE(review): the recorded output of this cell contains "Scene found:" / "Command found:"
# lines whose print calls are commented out in parse_transcript, and its execution count is
# lower than that of the cell defining the function -- the output appears to come from an
# earlier definition; re-run after a kernel restart to refresh it.
transcript = 'data/transcripts/raw-allthingsdevours.txt'
parsed = 'data/parsed-test.txt'
parse_transcript(transcript, parsed)

Parsing gameplay...
Scene found: welcome to the cheap glk implementation, library version 0.9.0.
Scene found: you're in.
Scene found: the plan now is simple: go to your lab, plant the bomb, and run. the
Scene found: prototype will be destroyed. the military will have no way to continue
Scene found: the experiment. no-one will die.
Scene found: the guard is out securing the grounds. the building is empty. you have
Scene found: six minutes.
Scene found: all things devours
Scene found: written for ifcomp 2004, by half sick of shadows.
Scene found: a second, more difficult, version will be released after the
Scene found: competition closes.
Scene found: first-time players should type 'about'.
Scene found: release 1 / serial number 040930 / inform v6.30 library 6/11
Scene found: foyer
Scene found: a darkened foyer, presided over by a security desk near the north
Scene found: wall. floating above it in the capacious darkness is a small landing.
Scene found: a set of stairs both rises up to m

Scene found: time passes.
Scene found: *******
Scene found: nearby, your past self attempts to unlock the door to the deutsch lab
Scene found: but finds it already unlocked.
Scene found: time unravels...
Scene found: *******
Scene found: tuesday 12th october 2004
Scene found: at precisely 4:21 am this morning a catastrophe of unprecedented scale
Scene found: occurred. an immense explosion -- that could only have been atomic in
Scene found: nature -- tore through the city and suburbs of boston ma, destroying
Scene found: all in its path. casualties are expected to number in the millions.
Scene found: the president has declared a state of emergency across the nation and
Scene found: the military is mobilizing for a possible war with the as yet unknown
Scene found: enemy.
Scene found: this thing all things devours:
Scene found: birds, beasts, trees, flowers;
Scene found: gnaws iron, bites steel;
Scene found: grinds hard stones to meal;
Scene found: slays king, ruins town;
Scene found: and

Scene found: to the south-east is a large, tinted glass window.
Command found: z
Adding line: balcony a wide balcony, looking out over the sleeping city. below you, a lawn slopes gently away into the gloom, a few silhouetted trees marking the edge of the modest grounds. above, the clear sky is a deep black, dusted with glinting stars. to the south, the door to the upstairs corridor lies open. to the south-east is a large, tinted glass window.
wait

Scene found: time passes.
Command found: z
Adding line: time passes.
wait

Scene found: time passes.
Command found: z
Adding line: time passes.
wait

Scene found: time passes.
Command found: z
Adding line: time passes.
wait

Scene found: time passes.
Command found: z
Adding line: time passes.
wait

Scene found: time passes.
Command found: z
Adding line: time passes.
wait

Scene found: time passes.
Command found: z
Adding line: time passes.
wait

Scene found: time passes.
Command found: xyzzy
Adding line: time passes.
xyzzy

Scene found: that

Scene found: the dark.
Scene found: in the north wall is a closed automatic door.
Scene found: to the south is a set of automatic doors leading out into the inky
Scene found: night.
Scene found: the siren wails.
Command found: s
Adding line: foyer a darkened foyer, presided over by a security desk near the north wall. floating above it in the capacious darkness is a small landing. a set of stairs both rises up to meet it and stretches downwards into the dark. in the north wall is a closed automatic door. to the south is a set of automatic doors leading out into the inky night. the siren wails.
south

Scene found: (first pressing the green exit button)
Scene found: the main doors open onto the night.
Scene found: the siren wails.
Scene found: you slip out into the night, darting quickly through the cambridge
Scene found: shadows until you reach the quiet corner where you left your car.
Scene found: exhausted, you gently start the cold motor and roll off towards the
Scene found: intersta

## Corpus building
Here we build a single corpus of `<scene_before, command, scene_after>` triplets to be used by the model.
We save the model in HDF5 format (via `h5py`) for ease of use and performance reasons.