In [1]:
import os
import re
import string
from glob import glob

import numpy as np
from pycontractions import Contractions



In [2]:
# Shortcut expansion table: regex pattern -> replacement template.
# parse_transcript applies every entry, in insertion order, to each player
# command via re.sub. All patterns are anchored with ^...$ so only a command
# that consists solely of the shortcut is expanded.
# Raw strings are used throughout so that the `\g<1>` group reference is not
# treated as an (invalid) Python string escape.
commands = {
    r'^x (.+)$': r'examine \g<1>',
    r'^m$': r'commands',
    r'^l$': r'look',
    r'^nw$': r'northwest',
    r'^sw$': r'southwest',
    r'^ne$': r'northeast',
    r'^se$': r'southeast',
    r'^w$': r'west',
    r'^e$': r'east',
    r'^n$': r'north',
    r'^s$': r'south',
    r'^i$': r'inventory',
    r'^z$': r'wait',
    r'^g$': r'again',
}

def parse_transcript(infile, outfile, contractions=None):
    """Convert a raw transcript into alternating scene / command lines.

    Game output is recognised by the literal line prefix "Floyd | " and player
    commands by "Floyd | > ". Processing steps:

    1. Everything up to and including the last `say(s) (to Floyd), "load ..."`
       line is dropped. If that command was "load sleepmask ...", commands are
       assumed to be echoed inline already and step 3 is skipped.
    2. "version" exchanges (up to the next empty prompt) are removed.
    3. `<speaker> say(s) (to Floyd), "<cmd>"` lines are rewritten as
       "Floyd | > <cmd>".
    4. If `contractions` is given, the lines are passed through
       `contractions.expand_texts(lines, precise=True)`.
    5. Output of "about"/"help" is skipped until an empty prompt or "q";
       empty prompts and "press <key> to continue" lines are dropped.
    6. Scene text is lower-cased and accumulated; when a command is seen the
       accumulated scene and the command (shortcuts expanded through the
       module-level `commands` table) are written as two lines.
    7. A trailing scene without a following command is written last, without
       a terminating newline.

    Args:
        infile: path of the raw transcript (read as UTF-8).
        outfile: path of the file to write (UTF-8, overwritten).
        contractions: optional object exposing
            `expand_texts(texts, precise=True)`; skipped when falsy.

    Returns:
        None. The result is written to `outfile`; progress is printed.
    """
    # convenience var to parse different interpreters
    sleepmask = False

    with open(infile, 'r', encoding='utf-8') as inf, open(outfile, 'w', encoding='utf-8') as outf:
        content = inf.read()

        # skip everything up to the last load attempt (if none, just assume the game's already loaded)
        loads = re.finditer(string=content, pattern=r'say(?:s)? \(to Floyd\), "(?:load( sleepmask)?) (?:.+)"', flags=re.IGNORECASE)
        load = None
        for load in loads:
            # exhaust the iterator; `load` ends up bound to the last match
            pass
        if load:
            if load.group(1):
                sleepmask = True
            content = content[load.end():]

        # remove version commands
        content = re.sub(repl='', string=content, pattern=r'say(?:s)? \(to Floyd\), "version".*?Floyd \| >\n', flags=re.IGNORECASE | re.DOTALL)

        # replace external commands with inline ones (unless we're using sleepmask, which does it for us)
        if not sleepmask:
            # raw template: `\g<1>` is the captured command, `\n` is expanded by re.sub
            content = re.sub(repl=r'Floyd | > \g<1>\n', string=content, pattern=r'(?:.+) say(?:s)? \(to Floyd\), "(.+)"', flags=re.IGNORECASE)
        content = content.split('\n')

        # expand contractions
        if contractions:
            content = contractions.expand_texts(content, precise=True)

        ##############################################################
        ### From here on, we consider everything part of the game. ###
        ##############################################################
        print('Parsing gameplay...')

        scene = []
        about = False
        for line in content:
            # Strip away about commands
            if about:
                # Either match the explicit "Q" command, or look for the first empty command prompt
                if re.match(string=line, pattern=r'^Floyd \| >(?: q)?$', flags=re.IGNORECASE):
                    print('Line [{}], about=False'.format(line))
                    about = False
                continue
            else:
                if re.match(string=line, pattern=r'^Floyd \| > (about|help)$', flags=re.IGNORECASE):
                    print('Line [{}], about=True'.format(line))
                    about = True
                    continue

            # Just strip blank prompts and "press <key> to continue" lines
            if re.match(string=line, pattern=r'^Floyd \| >$'):
                continue
            # The prompt arrives as game output, i.e. behind the "Floyd | " prefix and
            # possibly wrapped in brackets; allow both so the filter actually fires.
            if re.match(string=line, pattern=r'^(?:Floyd \| )?\W*press (?:.+) to continue', flags=re.IGNORECASE):
                continue

            # Scene descriptions
            match = re.match(string=line, pattern=r'^Floyd \| ([^>#\n]+)', flags=re.IGNORECASE)
            if match:
                scene.append(match.group(1).strip().lower())

            # user's commands are appended to the current scene
            match = re.match(string=line, pattern=r'^Floyd \| > (.+)', flags=re.IGNORECASE)
            if match:
                command = match.group(1).strip().lower()
                # replace shortcuts
                for pattern, repl in commands.items():
                    command = re.sub(string=command, pattern=pattern, repl=repl)
                newline = '{}\n{}\n'.format(' '.join(scene).replace('  ', ' '), command)
                outf.write(newline)

                scene = []

        # Append the very last scene
        if scene:
            outf.write(' '.join(scene).replace('  ', ' '))

        print('Done.')

In [3]:
# infile = 'raw_weishaupt.txt'
# outfile = 'parsed-weishaupt.txt'
# Build the contraction expander that is later passed to parse_transcript.
# NOTE(review): `load_embeddings` and `DATA_PATH` are not defined or imported in
# any cell shown before this one; on a fresh kernel this cell raises NameError
# unless they come from elsewhere. Presumably `load_embeddings` loads the
# GoogleNews word2vec binary -- confirm and add the import/definition.
w2v = load_embeddings(os.path.join(DATA_PATH, 'GoogleNews-vectors-negative300.bin'), binary=True)
cont = Contractions(w2v_model=w2v)
cont.load_models()

In [4]:
def parsed_path(transcript, out_dir='data/parsed'):
    """Return the output path for a raw transcript path.

    Only the file name is inspected (never the directories), so hyphens in the
    directory part cannot corrupt the result. Everything up to and including
    the first hyphen of the file name (e.g. "raw-") is replaced by "parsed-";
    a file name without a hyphen is kept whole so distinct inputs never map to
    the same output.
    """
    name = os.path.basename(transcript)
    _, hyphen, remainder = name.partition('-')
    suffix = remainder if hyphen else name
    return '{}/parsed-{}'.format(out_dir, suffix)


# sorted() makes the processing order independent of the file system
transcripts = sorted(glob('data/transcripts/*.txt'))
for transcript in transcripts:
    print('Parsing file: [{}]'.format(transcript))
    parsed = parsed_path(transcript)
    # open(..., 'w') does not create missing directories
    os.makedirs(os.path.dirname(parsed), exist_ok=True)
    parse_transcript(transcript, parsed, cont)

#     # move parsed files
#     os.rename(transcript, os.path.join('data/done', os.path.basename(transcript)))

Parsing file: [data/transcripts\raw-12heads.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-1893.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-20160221-thesueno-utf8.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-20160221-thesueno.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-20160327-unrealcity-lifeonmars.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-3card-deadmanshill-2016Ap24.txt]
Parsing gameplay...
Line [Floyd | > about], about=True
Line [Floyd | >], about=False
Line [Floyd | > ABOUT], about=True
Line [Floyd | >], about=False
Line [Floyd | > help], about=True
Line [Floyd | > q], about=False
Done.
Parsing file: [data/transcripts\raw-69krakatoa.txt]
Parsing gameplay...
Line [Floyd | > about], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts\raw-905-shrapnel.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-abno.txt]
Parsing gameplay...


Done.
Parsing file: [data/transcripts\raw-fragileshells.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-ft-n-awe.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-galatea.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-gdc09.txt]
Parsing gameplay...
Line [Floyd | > help], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts\raw-glowgrass.txt]
Parsing gameplay...
Done.
Parsing file: [data/transcripts\raw-goldilocks.txt]
Parsing gameplay...
Line [Floyd | > help], about=True
Line [Floyd | >], about=False
Line [Floyd | > help], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts\raw-groovebillygoat.txt]
Parsing gameplay...
Line [Floyd | > about], about=True
Line [Floyd | >], about=False
Done.
Parsing file: [data/transcripts\raw-ground.txt]
Parsing gameplay...
Line [Floyd | > about], about=True
Line [Floyd | >], about=False
Line [Floyd | > about], about=True
Done.
Parsing file: [data/

KeyboardInterrupt: 

## Corpus building
Here we build a single corpus of `<scene_before, command, scene_after>` triplets to be used by the model.
We save the model in HDF5 format (via h5py) for ease of use and performance reasons.

In [None]:
# Now build a single corpus of triplets
def preprocess(text):
    """Return `text` with ASCII punctuation, newlines and tabs removed.

    The punctuation set is `string.punctuation`. It is passed through
    `re.escape` before being placed in the character class so that characters
    with a special meaning there (backslash, closing bracket, caret, hyphen)
    are matched literally.
    """
    strip_pattern = '[{}\n\t]'.format(re.escape(string.punctuation))
    return re.sub(repl='', string=text, pattern=strip_pattern)

def build_vocabulary(infile, w2v):
    """Build the word index and embedding lookup for one text file.

    Every line of `infile` is cleaned with `preprocess` and split on single
    spaces; empty tokens are ignored. The unique words are sorted so that the
    index assigned to each word is the same on every run (iterating a set of
    strings directly is not order-stable across interpreter sessions).

    Args:
        infile: path of a UTF-8 text file.
        w2v: embedding model supporting `word in w2v`, `w2v[word]` and a
            `vector_size` attribute.

    Returns:
        A 4-tuple `(idx2word, word2idx, word2embeddings, (vocab_size, vocab_dim))`:
        `idx2word` is a list starting with '<PAD>' and '<UNK>', `word2idx` is
        its inverse mapping, and `word2embeddings` maps each word to `w2v[word]`
        or to a zero vector of length `vocab_dim` when the word is not in `w2v`.
    """
    vocabulary = set()  # only count unique words

    with open(infile, 'r', encoding='utf-8') as f:
        for line in f:
            vocabulary.update(word for word in preprocess(line).split(' ') if word)

    # reserved tokens first, then the words in a reproducible order
    idx2word = ['<PAD>', '<UNK>'] + sorted(vocabulary)

    vocab_size, vocab_dim = len(idx2word), w2v.vector_size
    word2idx = {w: i for i, w in enumerate(idx2word)}
    word2embeddings = {w: w2v[w] if w in w2v else np.zeros(vocab_dim) for w in idx2word}

    return idx2word, word2idx, word2embeddings, (vocab_size, vocab_dim)

def sentence_to_embeddings(sentence, embeddings):
    """Look up `embeddings[token]` for each non-empty space-separated token.

    The sentence is split on single spaces only, so it is expected to contain
    no newline characters. Returns the looked-up values as a list, in order.
    """
    vectors = []
    for token in sentence.split(' '):
        if token:
            vectors.append(embeddings[token])
    return vectors

def prepare_data(w2v):
    """Convert every parsed transcript line into an array of word embeddings.

    All files matching 'data/parsed/*.txt' are read in sorted order. Each line
    is cleaned with `preprocess`, turned into a list of vectors with
    `sentence_to_embeddings` and wrapped in `np.array`.

    Args:
        w2v: mapping passed to `sentence_to_embeddings`; it is indexed with
            each word, so a lookup failure for a missing word propagates.

    Returns:
        A list (not a generator) with one numpy array per input line, in file
        order and then line order.
    """
    files = sorted(glob('data/parsed/*.txt'))
    data = []
    for infile in files:
        with open(infile, 'r', encoding='utf-8') as f:
            for line in f:
                data.append(np.array(sentence_to_embeddings(preprocess(line), w2v)))
    return data