In [None]:
from __future__ import print_function
import numpy as np
import pickle
import codecs
import lxml.etree as ET
import regex
from nltk.tokenize import sent_tokenize

In [2]:
def clean_text(text):
    """Strip wiki/HTML markup from `text` and normalize it for the corpus.

    The substitutions run strictly in the order listed below; later patterns
    rely on earlier ones having already removed their markup. After the
    markup is gone, every character that is not a space, CR, LF, Latin
    letter, digit or one of ``- ' . ? !`` is replaced by a space (this
    includes commas), the text is lower-cased, and runs of spaces are
    squeezed to a single space.

    Args:
      text: A unicode string holding raw page markup.

    Returns:
      The cleaned, lower-cased string.
    """
    # All patterns are raw (or explicitly escaped) literals so that sequences
    # such as `\[` or `\p` reach the regex engine without triggering Python's
    # invalid-escape-sequence warning. The runtime pattern strings are the
    # same as before.
    substitutions = (
        (r"\[http[^]]+? ([^]]+)]", r"\1"),  # [http://url label] -> label
        (r"\[http[^]]+]", ""),              # external links without a label
        (r"(?s)<ref>.+?</ref>", ""),        # remove reference links
        (r"(?s)<[^>]+>", ""),               # remove html tags
        (r"&[a-z]+;", ""),                  # remove html entities
        (r"(?s){{.+?}}", ""),               # remove markup tags
        (r"(?s){.+?}", ""),                 # remove markup tags
        (r"(?s)\[\[([^]]+\|)", ""),         # remove link target strings
        (r"(?s)\[\[([^]]+\:.+?]])", ""),    # remove media links
        (r"[']{5}", ""),                    # remove italic+bold symbols
        (r"[']{3}", ""),                    # remove bold symbols
        (r"[']{2}", ""),                    # remove italic symbols
        # Anything outside the allowed character set becomes a space.
        (u"[^ \r\n\\p{Latin}\\d\\-'.?!]", " "),
    )
    for pattern, replacement in substitutions:
        text = regex.sub(pattern, replacement, text)

    text = text.lower()

    return regex.sub(r"[ ]{2,}", " ", text)  # Squeeze spaces.

def build_corpus():
    """Extract cleaned paragraphs from the raw XML dumps into one text file.

    Every ``data/raw/*.xml`` file is streamed with ``ET.iterparse`` and its
    ``text`` elements (in the MediaWiki export-0.10 namespace) are visited in
    order. For elements past the first 5000 of each file, the text before the
    first ``"==="`` is cleaned with `clean_text` and split into lines; each
    line longer than 500 characters is sentence-tokenized, a space is inserted
    before each sentence's trailing ``.``/``!``/``?`` run, and the sentences
    are written as one line of ``data/en_wikinews.txt``.

    Elements that raise while being processed are skipped (best effort) and,
    as before, are not counted by the progress counter.
    """
    import glob

    ns = "{http://www.mediawiki.org/xml/export-0.10/}" # namespace
    with codecs.open('data/en_wikinews.txt', 'w', 'utf-8') as fout:
        for f in glob.glob('data/raw/*.xml'):
            i = 1
            for _, elem in ET.iterparse(f, tag=ns+"text"):
                try:
                    # NOTE(review): the first 5000 elements of each file are
                    # skipped; the reason is not visible here -- confirm intent.
                    if i > 5000:
                        running_text = elem.text
                        running_text = running_text.split("===")[0]
                        running_text = clean_text(running_text)
                        for para in running_text.split("\n"):
                            if len(para) > 500:
                                sents = [regex.sub("([.!?]+$)", r" \1", sent)
                                         for sent in sent_tokenize(para.strip())]
                                fout.write(" ".join(sents) + "\n")
                except Exception:
                    # Deliberate best-effort skip (e.g. elements without text).
                    # `Exception` rather than a bare except so that Ctrl-C and
                    # SystemExit still stop the run.
                    continue
                finally:
                    # Always release the element, including on failure;
                    # previously the `continue` above bypassed this call.
                    elem.clear() # We need to save memory!

                i += 1
                if i % 1000 == 0: print(i)

# Entry point: build the corpus file, then report completion.
if __name__ == "__main__":
    build_corpus()
    print("Done")

1000
2000
3000
4000
5000
Done


In [3]:
from __future__ import print_function
import numpy as np
import pickle
import codecs

In [4]:
class Hyperparams:
    """Hyper parameters, read as plain class attributes."""
    # Number of samples per training batch.
    batch_size = 64
    # Dimensionality of the character embedding.
    embed_dim = 300
    # Length of the character window: the next/current word is predicted
    # from the preceding 50 characters.
    seqlen = 50

def load_char_vocab():
    """Return the fixed character vocabulary as a pair of lookup dicts.

    Index 0 is 'E' (empty), index 1 is 'U' (unknown) and index 2 is the
    space; the remaining indices follow the order of the vocabulary string.

    Returns:
      (char2idx, idx2char): character -> index and index -> character dicts.
    """
    vocab = "EU abcdefghijklmnopqrstuvwxyz0123456789-.,?!'" # E: Empty, U:Unknown
    idx2char = dict(enumerate(vocab))
    char2idx = dict(zip(vocab, range(len(vocab))))
    return char2idx, idx2char

def create_word_vocab():
    """Build the word vocabulary from the corpus and pickle it to disk.

    Reads ``data/en_wikinews.txt``, splits it on whitespace and keeps every
    word that occurs more than 50 times. Index 0 is reserved for ``<EMP>``
    and index 1 for ``<UNK>``; the kept words follow. The pair
    ``(word2idx, idx2word)`` is written to ``data/word_vocab.pkl``.
    """
    from collections import Counter

    # Context managers ensure both files are closed (and the pickle flushed)
    # even if reading or dumping raises.
    with codecs.open('data/en_wikinews.txt', 'r', 'utf-8') as fin:
        words = fin.read().split()

    word2cnt = Counter(words)
    vocab = ["<EMP>", "<UNK>"] + [word for word, cnt in word2cnt.items() if cnt > 50]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = dict(enumerate(vocab))

    with open("data/word_vocab.pkl", "wb") as fout:
        pickle.dump((word2idx, idx2word), fout)

def load_word_vocab():
    """Load the pickled word vocabulary from ``data/word_vocab.pkl``.

    Returns:
      (word2idx, idx2word): the two dicts stored by `create_word_vocab`.
    """
    # NOTE: unpickling can execute arbitrary code; only load a vocabulary
    # file that was produced locally.
    with open("data/word_vocab.pkl", "rb") as fin:
        word2idx, idx2word = pickle.load(fin)
    return word2idx, idx2word
    
def create_data(maxlen=1000):
    """Vectorize the corpus into aligned character/word index arrays.

    Each line of ``data/en_wikinews.txt`` becomes two equally long sequences:
    ``x`` holds, for every word, the space index followed by the index of
    each of its characters; ``y`` holds that word's index repeated at every
    one of those positions. Unknown characters and words map to index 1.
    Lines whose sequence is longer than `maxlen` are dropped; the others are
    left-padded with zeros to exactly `maxlen`. The stacked arrays are saved
    as ``X`` and ``Y`` in ``data/train.npz`` and their shapes are printed.

    Args:
      maxlen: Maximum (and padded) sequence length. Defaults to 1000.
    """
    PAD, UNK, SPACE = 0, 1, 2  # empty / unknown / space indices

    char2idx, _ = load_char_vocab()
    word2idx, _ = load_word_vocab()
    # Context manager so the corpus file handle is closed after reading.
    with codecs.open('data/en_wikinews.txt', 'r', 'utf-8') as fin:
        lines = fin.read().splitlines()

    xs, ys = [], [] # vectorized sentences
    for line in lines:
        x, y = [], []
        for word in line.split():
            word_id = word2idx.get(word, UNK)
            x.append(SPACE)
            x.extend(char2idx.get(char, UNK) for char in word)
            # One target per input position: the leading space plus each char.
            y.extend([word_id] * (len(word) + 1))
        if len(x) <= maxlen: # zero pre-padding
            padding = [PAD] * (maxlen - len(x))
            xs.append(padding + x)
            ys.append(padding + y)

    # Convert to 2d-arrays
    X = np.array(xs)
    Y = np.array(ys)

    print("X.shape =", X.shape, "\nY.shape =", Y.shape)
    np.savez('data/train.npz', X=X, Y=Y)

def load_train_data():
    """Load the training split: every row of ``data/train.npz`` but the last 64.

    Returns:
      (X, Y): the ``X`` and ``Y`` arrays with their final 64 rows removed
      (both are empty when the file holds 64 rows or fewer).
    """
    # Open the archive once and close it deterministically; indexing reads
    # each array fully into memory, so the slices stay valid after closing.
    with np.load('data/train.npz') as data:
        X = data['X'][:-64]
        Y = data['Y'][:-64]
    return X, Y

def load_test_data():
    """Load the held-out split: the last 64 rows of ``data/train.npz``.

    Returns:
      (X, Y): the final 64 rows of the ``X`` and ``Y`` arrays (all rows when
      the file holds fewer than 64).
    """
    # Open the archive once and close it deterministically; indexing reads
    # each array fully into memory, so the slices stay valid after closing.
    with np.load('data/train.npz') as data:
        X = data['X'][-64:]
        Y = data['Y'][-64:]
    return X, Y

# Entry point: build the word vocabulary first, because create_data() loads
# it back from disk, then vectorize the corpus.
if __name__ == "__main__":
    create_word_vocab()
    create_data()
    print("Done")

X.shape = (0,) 
Y.shape = (0,)
Done


In [None]:
from __future__ import print_function
from prepro import *
import sugartensor as tf
import random

In [None]:
def q_process(t1, t2):
    '''
    Turns one (input, target) sequence pair into a single queue sample.

    Args:
      t1: 1-D int tensor of input indices.
      t2: 1-D int tensor of target indices, aligned with `t1`.

    Returns:
      A window of `Hyperparams.seqlen` entries cropped from `t1`, and the
      last entry of the matching window cropped from `t2`.
    '''
    # Count the zeros in t1 and drop that many leading entries from both
    # sequences (assumes zeros only occur as left padding -- TODO confirm).
    n_zeros = tf.equal(t1, tf.zeros_like(t1)).sg_int().sg_sum()
    t1, t2 = t1[n_zeros:], t2[n_zeros:]

    # Prepend seqlen-1 zeros to both sequences.
    pad_len = Hyperparams.seqlen-1
    t1 = tf.concat([tf.zeros([pad_len], tf.int32), t1], 0)
    t2 = tf.concat([tf.zeros([pad_len], tf.int32), t2], 0)

    # Random crop: stacking first means both sequences are cut at the same
    # offset.
    window = tf.random_crop(tf.stack((t1, t2)), [2, Hyperparams.seqlen])

    # Only the target at the final position of the window is kept.
    return window[0], window[1][-1]

def get_batch_data():
    '''Builds shuffled batch queues from the training data.

    Returns:
      x: a batch of input windows, `batch_size` rows of `seqlen` indices.
      y: the matching batch of targets; `q_process` keeps one target per
         sample.
      num_batch: number of whole batches in the training data (Python int).
    '''
    # Load data
    X, Y = load_train_data()

    # Queue that yields one (input row, target row) pair at a time.
    x_rows = tf.convert_to_tensor(X, tf.int32)
    y_rows = tf.convert_to_tensor(Y, tf.int32)
    x_one, y_one = tf.train.slice_input_producer([x_rows, y_rows])

    # Crop each pair down to a single training sample.
    x_one, y_one = q_process(x_one, y_one)

    # Batch queue
    bs = Hyperparams.batch_size
    x, y = tf.train.shuffle_batch([x_one, y_one],
                                  num_threads=32,
                                  batch_size=bs,
                                  capacity=bs*64,
                                  min_after_dequeue=bs*32,
                                  allow_smaller_final_batch=False)

    return x, y, len(X) // bs

class ModelGraph():
    '''Builds the model graph: a character window in, word logits out.

    The input indices are embedded, passed through 20 residual 1-D
    convolution layers, projected to one score per vocabulary word at every
    position, and reduced over the position axis with linearly increasing
    weights. In "train" mode a masked cross-entropy loss is added as well.
    '''
    def __init__(self, mode="train"):
        '''
        Args:
          mode: A string. Either "train" or "test". With "train" the inputs
            come from `get_batch_data()` and the loss is built; with any
            other value `self.x` is a placeholder and no loss is built.
        '''
        # Lookup tables for characters and words.
        self.char2idx, self.idx2char = load_char_vocab()
        self.word2idx, self.idx2word = load_word_vocab()
        
        if mode == "train":
            self.x, self.y, self.num_batch = get_batch_data() 
        else:
            # Fed by the caller: rows of seqlen character indices.
            self.x = tf.placeholder(tf.int32, [None, Hyperparams.seqlen])
        
        # Character embedding table and lookup.
        self.emb_x = tf.sg_emb(name='emb_x', voca_size=len(self.char2idx), dim=Hyperparams.embed_dim)
        self.enc = self.x.sg_lookup(emb=self.emb_x)
        
        # 20 residual blocks: each adds a size-5 conv (relu + batch norm) of
        # the current encoding back onto itself, keeping the channel count.
        with tf.sg_context(size=5, act='relu', bn=True):
            for _ in range(20):
                dim = self.enc.get_shape().as_list()[-1]
                self.enc += self.enc.sg_conv1d(dim=dim) # (64, 50, 300) float32
        
        # Size-1 conv projects every position to one score per vocabulary
        # word (21293 words in the author's run, per the original comment).
        self.enc = self.enc.sg_conv1d(size=1, dim=len(self.word2idx), act='linear', bn=False) # (64, 50, 21293) float32
        # Earlier variant, superseded by the weighted sum below:
#         self.logits = self.enc.sg_mean(dims=[1], keep_dims=False) # (64, 21293) float32
        
        # Weighted Sum. Updated on Feb. 15, 2017.
        def make_weights(size):
            '''Returns weights 1..size divided by their total size*(size+1)/2,
            so they sum to 1 and later positions weigh more; shape (1, size, 1).'''
            weights = tf.range(1, size+1, dtype=tf.float32)
            weights *= 1. / ((1 + size) * size // 2)
            weights = tf.expand_dims(weights, 0)
            weights = tf.expand_dims(weights, -1)
            return weights
        
        self.weights = make_weights(Hyperparams.seqlen) # (1, 50, 1)
        self.enc *= self.weights # Broadcasting
        # Collapse the position axis into one logit vector per sample.
        self.logits = self.enc.sg_sum(axis=[1], keep_dims=False) # (64, 21293)

        if mode == "train":
            self.ce = self.logits.sg_ce(target=self.y, mask=False, one_hot=False)
            # Mask: 0.0 where the target is index 1 (unknown), 1.0 elsewhere.
            self.istarget = tf.not_equal(self.y, tf.ones_like(self.y)).sg_float() # 1: Unknown
            # Mean cross-entropy over the unmasked targets; the 1e-5 guards
            # against division by zero when every target is masked.
            self.reduced_loss = ((self.ce * self.istarget).sg_sum()) / (self.istarget.sg_sum() + 1e-5)
            tf.sg_summary_loss(self.reduced_loss, "reduced_loss")
            
def train():
    '''Builds the training graph and hands its loss to `tf.sg_train`.'''
    graph = ModelGraph()
    print("Graph loaded!")

    tf.sg_train(
        optim="Adam",
        lr=0.00001,
        lr_reset=True,
        loss=graph.reduced_loss,
        eval_metric=[],
        max_ep=20000,
        save_dir='asset/train',
        early_stop=False,
        ep_size=graph.num_batch,
    )
     
# Entry point: run training, then report completion.
if __name__ == "__main__":
    train()
    print("Done")
