In [2]:
import string
import re
import os

from glob import glob
from pycontractions import Contractions



In [6]:
# Shortcut table: regex pattern -> replacement for abbreviated player commands.
# Every pattern is anchored with ^...$, so it only matches when the subject
# string is exactly the shortcut (plus its argument for "x <thing>").
# Raw strings are used so that the `\g<1>` group reference reaches the `re`
# module untouched and does not trigger Python's invalid-escape warning.
commands = {
    r'^x (.+)$': r'examine \g<1>',
    r'^m$': 'commands',
    r'^l$': 'look',
    r'^nw$': 'northwest',
    r'^sw$': 'southwest',
    r'^ne$': 'northeast',
    r'^se$': 'southeast',
    r'^w$': 'west',
    r'^e$': 'east',
    r'^n$': 'north',
    r'^s$': 'south',
    r'^i$': 'inventory',
    r'^z$': 'wait',
    r'^g$': 'again',
}

def _expand_shortcuts(command):
    """Apply every pattern of the module-level `commands` table to one command string."""
    for pattern, repl in commands.items():
        command = re.sub(pattern, repl, command, flags=re.IGNORECASE)
    return command


def parse_transcript(infile, outfile, contractions=None):
    """Convert a raw transcript into alternating "scene" / "command" lines.

    Lines of the form ``<someone> says (to Floyd), "<command>"`` are turned
    into inline prompts (``Floyd | > <command>``), shortcuts from the
    module-level `commands` table are expanded, and then every run of
    ``Floyd | <text>`` lines is joined into one lower-cased scene line followed
    by the lower-cased command that was typed next.

    Args:
        infile: path of the raw transcript (read as UTF-8).
        outfile: path of the parsed output (written as UTF-8, overwritten).
        contractions: optional object exposing
            ``expand_texts(iterable_of_lines, precise=True)``; when given, each
            line is passed through it before parsing. When omitted, lines are
            parsed as they are.

    Returns:
        None. The result is written to `outfile`; progress is printed.
    """
    with open(infile, 'r', encoding='utf-8') as inf, open(outfile, 'w', encoding='utf-8') as outf:
        content = inf.read()

        # Skip everything up to the last load attempt (if there is none, assume
        # the game is already loaded). The loop just walks to the final match.
        load = None
        for load in re.finditer(r'say(?:s)? \(to Floyd\), "(?:load|load sleepmask) (?:.+)"',
                                content, flags=re.IGNORECASE):
            pass
        if load:
            content = content[load.end():]

        # NOTE: stripping the game intro banner and everything after the final
        # "QUIT?" prompt was tried in earlier revisions and is disabled.

        # Remove "version" commands together with their output, up to the next
        # empty prompt. The match starts at the beginning of the line so the
        # speaker's name is removed too and cannot fuse with the following line.
        content = re.sub(r'^[^\n]*say(?:s)? \(to Floyd\), "version".*?Floyd \| >\n', '', content,
                         flags=re.IGNORECASE | re.DOTALL | re.MULTILINE)

        # Replace external commands with inline ones.
        content = re.sub(r'(?:.+) say(?:s)? \(to Floyd\), "(.+)"', r'Floyd | > \g<1>\n', content,
                         flags=re.IGNORECASE)

        # Expand shortcuts. The table's patterns are anchored on the bare
        # command, so they are applied to the text after the prompt on each
        # command line rather than to the whole transcript.
        content = re.sub(r'^(Floyd \| > )(.+)$',
                         lambda m: m.group(1) + _expand_shortcuts(m.group(2).strip()),
                         content, flags=re.IGNORECASE | re.MULTILINE)

        # Always work line by line; contractions expansion is optional.
        lines = content.split('\n')
        if contractions:
            lines = contractions.expand_texts(lines, precise=True)

        ##############################################################
        ### From here on, we consider everything part of the game. ###
        ##############################################################
        print('Parsing gameplay...')

        scene = []
        about = False
        for line in lines:
            # Strip away about/help/commands screens.
            if about:
                # Either match the explicit "Q" command, or look for the first empty command prompt.
                if re.match(r'^Floyd \| >(?:q)?$', line, flags=re.IGNORECASE):
                    about = False
                continue
            if re.match(r'^Floyd \| > (about|help|commands)', line, flags=re.IGNORECASE):
                about = True
                continue

            # Strip blank prompts and "press <key> to continue" lines. `search`
            # is required because the phrase sits after the "Floyd | " prefix.
            if re.match(r'^Floyd \| >$', line):
                continue
            if re.search(r'press (?:.+) to continue', line, flags=re.IGNORECASE):
                continue

            # Scene descriptions accumulate until the next command.
            match = re.match(r'^Floyd \| ([^>#\n]+)', line, flags=re.IGNORECASE)
            if match:
                text = match.group(1).strip().lower()
                if text:
                    scene.append(text)

            # The user's command closes the current scene.
            match = re.match(r'^Floyd \| > (.+)', line, flags=re.IGNORECASE)
            if match:
                command = match.group(1).strip().lower()
                scene_text = re.sub(r' {2,}', ' ', ' '.join(scene))
                outf.write('{}\n{}\n'.format(scene_text, command))
                scene = []

        # Append the very last scene (newline-terminated like every other line).
        if scene:
            outf.write(re.sub(r' {2,}', ' ', ' '.join(scene)) + '\n')

        print('Done.')

In [7]:
# Location of the embedding binary handed to pycontractions.
w2v_path = 'data/GoogleNews-vectors-negative300.bin'

# Build the contraction expander once and reuse it for every transcript.
# NOTE(review): load_models() presumably reads the embedding file from disk
# and may be slow -- confirm against the pycontractions documentation.
cont = Contractions(w2v_path)
cont.load_models()

In [None]:
def parsed_path_for(transcript, out_dir='data/parsed'):
    """Return the output path for a raw transcript path.

    Only the file name is considered, and only its leading ``raw-`` prefix is
    dropped, so names containing further dashes (e.g.
    ``raw-20160221-thesueno-utf8.txt``) keep their full stem and extension and
    do not collide with each other.
    """
    name = os.path.basename(transcript)
    if name.startswith('raw-'):
        name = name[len('raw-'):]
    return '{}/parsed-{}'.format(out_dir, name)


# Sorted so that the processing order is the same on every run.
transcripts = sorted(glob('data/transcripts/*.txt'))
for transcript in transcripts:
    print('Parsing file: [{}]'.format(transcript))
    parsed = parsed_path_for(transcript)
    # Make sure the output directory exists on a fresh checkout.
    os.makedirs(os.path.dirname(parsed), exist_ok=True)
    parse_transcript(transcript, parsed, cont)

Parsing file: [data/transcripts\raw-12heads.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-1893.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-20160221-thesueno-utf8.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-20160221-thesueno.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-20160327-unrealcity-lifeonmars.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-3card-deadmanshill-2016Ap24.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-69krakatoa.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-905-shrapnel.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-abno.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-acg-crossbow.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-acitw.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-actofmurder.txt]
Parsing gameplay...
Done.
Parsing file: [data/tra