In [None]:
%matplotlib inline
import numpy as np
try:
    import cPickle as pickle
except:
    import pickle
import pandas as pd
import mxnet as mx
import wget
import time
import os.path
import math
import matplotlib.pyplot as plt
import logging
from tqdm import tqdm
import sys
import queue as Queue
import functools
import threading
import os.path
from mxnet.io import DataBatch

In [None]:
ALPHABET = list("abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}")
FEATURE_LEN = 1014
BATCH_SIZE = 128
NUM_FILTERS = 256
DATA_SHAPE = (BATCH_SIZE, 1, FEATURE_LEN, len(ALPHABET))

ctx = mx.gpu(2)
EPOCHS = 10
SD = 0.05  # std for gaussian distribution
INITY = mx.init.Normal(sigma=SD)
LR = 0.01
MOMENTUM = 0.9
WDECAY = 0.00001

In [None]:
# logging
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='crepe_dbp.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

In [None]:
def load_file(infile):
    print("processing data frame: %s" % infile)
    # load data into dataframe
    df = pd.read_csv(infile,
                     header=None,
                     names=['sentiment', 'summary', 'text'])
    # concat summary, review; trim to 1014 char; reverse; lower
    df['rev'] = df.apply(lambda x: "%s %s" % (x['summary'], x['text']), axis=1)
    df.rev = df.rev.str[:FEATURE_LEN].str[::-1].str.lower()
    # store class as nparray
    y_split = np.asarray(df.sentiment, dtype='int')
    print("finished processing data frame: %s" % infile)
    print("data contains %d obs, each epoch will contain %d batches" % (df.shape[0], df.shape[0]//BATCH_SIZE))
    return df.rev, y_split

In [72]:
def load_data_frame(X_data, y_data, batch_size=128, shuffle=False):
    """
    For low RAM this methods allows us to keep only the original data
    in RAM and calculate the features (which are orders of magnitude bigger
    on the fly). This keeps only 10 batches worth of features in RAM using
    asynchronous programing and yields one DataBatch() at a time.
    """

    if shuffle:
        idx = X_data.index
        assert len(idx) == len(y_data)
        rnd = np.random.permutation(idx)
        X_data = X_data.reindex(rnd)
        y_data = y_data[rnd]

    # Dictionary to create character vectors
    hashes = {} 
    for index, letter in enumerate(ALPHABET):
        hashes[letter] = np.zeros(len(ALPHABET), dtype='bool')
        hashes[letter][index] = True

    # Yield processed batches asynchronously
    # Buffy 'batches' at a time
    def async_prefetch_wrp(iterable, buffy=1):#buffy=30
        poison_pill = object()

        def worker(q, it):
            for item in it:
                q.put(item)
            q.put(poison_pill)

        queue = Queue.Queue(buffy)
        it = iter(iterable)
        thread = threading.Thread(target=worker, args=(queue, it))
        thread.daemon = True
        thread.start()
        while True:
            item = queue.get()
            if item == poison_pill:
                return
            else:
                yield item

    # Async wrapper around
    def async_prefetch(func):
        @functools.wraps(func)
        def wrapper(*args, **kwds):
            return async_prefetch_wrp(func(*args, **kwds))
        return wrapper

    @async_prefetch
    def feature_extractor(dta, val):
        # Yield mini-batch amount of character vectors
        X_split = np.zeros([batch_size, 1, FEATURE_LEN, len(ALPHABET)], dtype='bool')
        for ti, tx in enumerate(dta):
            chars = list(tx)
            print(tx)
            for ci, ch in enumerate(chars):
                if ch in hashes:
                    X_split[ti % batch_size][0][ci] = hashes[ch].copy()
            # No padding -> only complete batches processed
            if (ti + 1) % batch_size == 0:
                #yield mx.nd.array(X_split), mx.nd.array(val[ti + 1 - batch_size:ti + 1])
                yield X_split, val[ti + 1 - batch_size:ti + 1]
                X_split = np.zeros([batch_size, 1, FEATURE_LEN, len(ALPHABET)], dtype='bool')

    # Yield one mini-batch at a time and asynchronously process to keep 4 in queue
    for Xsplit, ysplit in feature_extractor(X_data, y_data):
        #yield DataBatch(data=[Xsplit], label=[ysplit])
        yield Xsplit, ysplit

In [None]:
def create_crepe():
    """
    Replicating: https://github.com/zhangxiangxiao/Crepe/blob/master/train/config.lua
    """
    input_x = mx.sym.Variable('data')  # placeholder for input
    input_y = mx.sym.Variable('softmax_label')  # placeholder for output
    # 1. alphabet x 1014
    conv1 = mx.symbol.Convolution(data=input_x, kernel=(7, 69), num_filter=NUM_FILTERS)
    relu1 = mx.symbol.Activation(data=conv1, act_type="relu")
    pool1 = mx.symbol.Pooling(data=relu1, pool_type="max", kernel=(3, 1), stride=(3, 1))
    # 2. 336 x 256
    conv2 = mx.symbol.Convolution(data=pool1, kernel=(7, 1), num_filter=NUM_FILTERS)
    relu2 = mx.symbol.Activation(data=conv2, act_type="relu")
    pool2 = mx.symbol.Pooling(data=relu2, pool_type="max", kernel=(3, 1), stride=(3, 1))
    # 3. 110 x 256
    conv3 = mx.symbol.Convolution(data=pool2, kernel=(3, 1), num_filter=NUM_FILTERS)
    relu3 = mx.symbol.Activation(data=conv3, act_type="relu")
    # 4. 108 x 256
    conv4 = mx.symbol.Convolution(data=relu3, kernel=(3, 1), num_filter=NUM_FILTERS)
    relu4 = mx.symbol.Activation(data=conv4, act_type="relu")
    # 5. 106 x 256
    conv5 = mx.symbol.Convolution(data=relu4, kernel=(3, 1), num_filter=NUM_FILTERS)
    relu5 = mx.symbol.Activation(data=conv5, act_type="relu")
    # 6. 104 x 256
    conv6 = mx.symbol.Convolution(data=relu5, kernel=(3, 1), num_filter=NUM_FILTERS)
    relu6 = mx.symbol.Activation(data=conv6, act_type="relu")
    pool6 = mx.symbol.Pooling(data=relu6, pool_type="max", kernel=(3, 1), stride=(3, 1))
    # 34 x 256
    flatten = mx.symbol.Flatten(data=pool6)
    # 7.  8704
    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=1024)
    act_fc1 = mx.symbol.Activation(data=fc1, act_type="relu")
    drop1 = mx.sym.Dropout(act_fc1, p=0.5)
    # 8. 1024
    fc2 = mx.symbol.FullyConnected(data=drop1, num_hidden=1024)
    act_fc2 = mx.symbol.Activation(data=fc2, act_type="relu")
    drop2 = mx.sym.Dropout(act_fc2, p=0.5)
    # 9. 1024
    fc3 = mx.symbol.FullyConnected(data=drop2, num_hidden=NOUTPUT)
    crepe = mx.symbol.SoftmaxOutput(data=fc3, label=input_y, name="softmax")
    return crepe

In [None]:
# Visualise symbol (for crepe)
crepe = create_crepe()

#a = mx.viz.plot_network(crepe)
#a.render('Crepe Model')
#a

In [None]:
def save_check_point(mod_arg, mod_aux, pre, epoch):
    """
    Save model each epoch, load as:

    sym, arg_params, aux_params = \
        mx.model.load_checkpoint(model_prefix, n_epoch_load)

    # assign parameters
    mod.set_params(arg_params, aux_params)

    OR

    mod.fit(..., arg_params=arg_params, aux_params=aux_params,
            begin_epoch=n_epoch_load)
    """

    save_dict = {('arg:%s' % k): v for k, v in mod_arg.items()}
    save_dict.update({('aux:%s' % k): v for k, v in mod_aux.items()})
    param_name = '%s-%04d.pk' % (pre, epoch)
    pickle.dump(save_dict, open(param_name, "wb"))
    print('Saved checkpoint to \"%s\"' % param_name)
    print('Saving model with mxnet notation')
    mx.callback.do_checkpoint(pre)

In [None]:
def train_model(prefix, filename):
    print("Initializing model")
    # Create mx.mod.Module()
    cnn = create_crepe()
    mod = mx.mod.Module(cnn, context=ctx)
    
    # Bind shape
    mod.bind(data_shapes=[('data', DATA_SHAPE)],
             label_shapes=[('softmax_label', (BATCH_SIZE,))])

    # Initialise parameters and optimiser
    mod.init_params(mx.init.Normal(sigma=SD))
    mod.init_optimizer(optimizer='sgd',
                       optimizer_params={
                           "learning_rate": LR,
                           "momentum": MOMENTUM,
                           "wd": WDECAY,
                           "rescale_grad": 1.0/BATCH_SIZE
                       })

    print("Loading file")
    # Load Data
    X_train, y_train = load_file(filename)

    # Train
    print("Alphabet %d characters: " % len(ALPHABET), ALPHABET)
    print("started training")
    tic = time.time()

    # Evaluation metric:
    metric = mx.metric.Accuracy()

    # Train EPOCHS
    for epoch in range(EPOCHS):
        t = 0
        metric.reset()
        tic_in = time.time()
        for batch in load_data_frame(X_data=X_train,
                                     y_data=y_train,
                                     batch_size=BATCH_SIZE,
                                     shuffle=True):
            # Push data forwards and update metric
            mod.forward_backward(batch)
            mod.update()
            mod.update_metric(metric, batch.label)

            # For training + testing
            #mod.forward(batch, is_train=True)
            #mod.update_metric(metric, batch.label)
            # Get weights and update
            # For training only
            #mod.backward()
            #mod.update()
            # Log every 50 batches = 128*50 = 6400
            t += 1
            if t % 50 == 0:
                train_t = time.time() - tic_in
                metric_m, metric_v = metric.get()
                print("epoch: %d iter: %d metric(%s): %.4f dur: %.0f" % (epoch, t, metric_m, metric_v, train_t))

        # Checkpoint
        print("Saving checkpoint")
        arg_params, aux_params = mod.get_params()
        save_check_point(mod_arg=arg_params,
                         mod_aux=aux_params,
                         pre=prefix,
                         epoch=epoch)
        print("Finished epoch %d" % epoch)

    print("Done. Finished in %.0f seconds" % (time.time() - tic))

In [26]:
NOUTPUT = 14  # Classes
model_prefix = 'crepe_dbpedia_prefetch'
train_file = '/datadrive/nlp/dbpedia_train.csv'


In [28]:
#test data
X_train, y_train = load_file(train_file)

processing data frame: /datadrive/nlp/categories_train_big.csv
finished processing data frame: /datadrive/nlp/categories_train_big.csv
data contains 2379999 obs, each epoch will contain 18593 batches


In [74]:

for batch in load_data_frame(X_data=X_train, y_data=y_train, batch_size=1,shuffle=False):
    #print("label = ", np.asarray(batch.label))
    #print("data = ", np.asarray(batch.data))
    d,l = batch
    dint = np.asarray(d,dtype='int32')
    break
print("label = ", l)
print("data shape = ", d.shape)
print("data =", dint)
dint = np.reshape(dint, [1014, 69])
print("dint shape = ", dint.shape)
print(type(dint))
#df = pd.DataFrame(dint)
#df.to_csv("file_path.csv")
np.savetxt('first_batch.csv', dint,  delimiter=',', fmt='%d',)

.maj siht fo tuo em gnitteg ni lufpleh oot t'nsaw koob eht tub ,3 lla ro ,em ,koob eht ,7 swodniw htiw seil tluaf eht fi wonk t'nod i .sdrawretfa yllaudividni eno hcae lebal ot evah i dna meht sdaolnwod yllacitamotua 7 swodniw ym wohemos .spuorg ni meht gnilebal dna serutcip gnidaolnwod fo erudecorp elpmis eht gnidrager depmuts neeb ev'i ?hcum oot tcepxe i did ?hcum oot tcepxe i did
deppots nehw od ot tahw wonk lliw yeht yllufepoh dna sdik ruoy ot siht wohs .deppots nehw evaheb otwoh wonk dluohs sevird ohw enoyreve.esnes nommoc s'ti yllaer tub .doog srevird gnuoy dna srevird wen rof doog
label =  [0]
data shape =  (1, 1, 1014, 69)
data = [[[[0 0 0 ..., 0 0 0]
   [0 0 0 ..., 0 0 0]
   [1 0 0 ..., 0 0 0]
   ..., 
   [0 0 0 ..., 0 0 0]
   [0 0 0 ..., 0 0 0]
   [0 0 0 ..., 0 0 0]]]]
dint shape =  (1014, 69)
<class 'numpy.ndarray'>
.detide llew dna daer htrae ot nwod yrev !reverof gnitirw speek ehs epoh yllaer i !!skoob reh lla evol  !retirw gnidnatstuo !!!!koob emosewa rehtona


In [None]:
def load_check_point(file_name):

    # Load file
    print(file_name)
    save_dict = pickle.load(open(file_name, "rb"))
    # Extract data from save
    arg_params = {}
    aux_params = {}
    for k, v in save_dict.items():
        tp, name = k.split(':', 1)
        if tp == 'arg':
            arg_params[name] = v
        if tp == 'aux':
            aux_params[name] = v

    # Recreate model
    cnn = create_crepe()
    mod = mx.mod.Module(cnn, context=ctx)

    # Bind shape
    mod.bind(data_shapes=[('data', DATA_SHAPE)],
             label_shapes=[('softmax_label', (BATCH_SIZE,))])

    # assign parameters from save
    mod.set_params(arg_params, aux_params)
    print('Model loaded from disk')

    return mod

In [None]:
def test_model(pickled_model, filename):
    """ This doesn't take too long but still seems it takes longer than
    it should be taking ... """

    # Load saved model:
    mod = load_check_point(pickled_model)
    #assert mod.binded and mod.params_initialized

    # Load data
    X_test, y_test = load_file(filename)

    # Score accuracy
    metric = mx.metric.Accuracy()

    # Test batches
    for batch in load_data_frame(X_data=X_test,
                                 y_data=y_test,
                                 batch_size=BATCH_SIZE):

        mod.forward(batch, is_train=False)
        mod.update_metric(metric, batch.label)
        metric_m, metric_v = metric.get()
        print("TEST(%s): %.4f" % (metric_m, metric_v))

In [None]:
model_prefix = 'crepe_dbpedia_prefetch'
model_epoch = 9
model_pk = model_prefix + '_000' + str(model_epoch) + '.pk'
test_file = '/datadrive/nlp/dbpedia_test.csv'
test_model(model_pk, test_file)