In [1]:
import pandas as pd
import nltk
import tensorflow as tf
import random

In [2]:
data = pd.read_csv("JobID-53_Concordance_H3_to_S3.CSV",encoding="latin-1",dtype="object")

In [3]:
data.head()

Unnamed: 0,HS 2007 Product Code,HS 2007 Product Description,SITC Revision 3 Product Code,SITC Revision 3 Product Description
0,10110,Live horses/asses/mules/hinnies: pure-bred bre...,151,Horses
1,10190,Live horses/asses/mules/hinnies other than pur...,152,"Asses/mules/hinnies,live"
2,10210,Live bovine animals: pure-bred breeding animals,111,"Bovine animals, breeding"
3,10290,Live bovine animals other than pure-bred breed...,119,"Bovine animals, other"
4,10310,Live swine: pure-bred breeding animals,131,"Swine, live for breeding"


In [4]:
instance = data[data.columns[0]][0]

In [5]:
[i for i in instance]

[u'0', u'1', u'0', u'1', u'1', u'0']

In [6]:
# Take the first column of the frame and split every code in it into a list
# of its individual characters (one list per row).
product_codes = data[data.columns[0]]
characters = [list(code) for code in product_codes]


In [7]:
len(characters)

5050

Prepare strings
-----

In [8]:
descriptions = data['HS 2007 Product Description'].str.replace("/"," ")

In [9]:
tokens = [nltk.word_tokenize(desc) for desc in descriptions]

In [10]:
def build_vocabulary(token_list):
    """Build a frequency-ordered vocabulary from rows of tokens.

    token_list is an iterable of rows, each row an iterable of hashable
    tokens. The result is a list that starts with the four reserved symbols
    "_PAD", "_GO", "_EOS", "_UNK" (in that order) followed by every distinct
    token, ordered from the most frequent to the least frequent.
    """
    reserved = ["_PAD","_GO","_EOS","_UNK"]
    counts = {}
    for row in token_list:
        for word in row:
            counts[word] = counts.get(word, 0) + 1
    # Highest count first. sorted() is stable, so tokens with equal counts
    # stay in the order the dict yields them.
    by_frequency = sorted(counts, key=lambda word: counts[word], reverse=True)
    return reserved + by_frequency

In [11]:
vocab = build_vocabulary(tokens)

In [12]:
dec_vocab = build_vocabulary(characters)

In [13]:
vocab[0:20]

['_PAD',
 '_GO',
 '_EOS',
 '_UNK',
 u',',
 u'of',
 u'.',
 u'&',
 u'(',
 u')',
 u'other',
 u'not',
 u'excl',
 u'for',
 u'in',
 u'than',
 u'a',
 u'the',
 u'with',
 u'whether']

In [14]:
dec_vocab

['_PAD',
 '_GO',
 '_EOS',
 '_UNK',
 u'0',
 u'1',
 u'2',
 u'9',
 u'3',
 u'4',
 u'8',
 u'5',
 u'6',
 u'7']

In [15]:
# `targets` is a second name for the very same list object as `characters`
# (not a copy), so the in-place edits below are visible through both names.
targets = characters

# Wrap every code sequence as _GO <characters> _EOS, in place.
# The checks make the cell safe to re-run: without them each extra execution
# would stack another _GO / _EOS onto every sequence.
for ch in characters:
    if not ch or ch[0] != "_GO":
        ch.insert(0,"_GO")
    if ch[-1] != "_EOS":
        ch.append("_EOS")

In [16]:
# Terminate every tokenised description with _EOS, in place.
# Guarded so that re-running the cell does not append a second marker.
for token in tokens:
    if not token or token[-1] != "_EOS":
        token.append("_EOS")

In [17]:
len(characters)

5050

In [18]:
# Add padding to the description sequences
# Only padding input because HS codes are already fixed length
# Each bucket is an (encoder_length, decoder_length) pair. make_buckets only
# reads element 0 (the padded description length); get_batch unpacks both.
# NOTE(review): the decoder length 8 is presumably the 6 code characters plus
# the _GO and _EOS markers -- confirm that every code has 6 characters.
buckets = [(5,8),(10,8),(20,8),(30,8),(40,8),(50,8),(60,8),(70,8)]

In [19]:
import numpy as np
lengths = [len(token) for token in tokens]
np.mean(lengths)

17.227326732673266

In [20]:
def make_buckets(tokens, buckets):
    """Left-pad each token sequence, in place, up to its bucket length.

    For every sequence, the first bucket (in list order) whose length
    ``bucket[0]`` is at least the sequence length is chosen and the sequence
    is padded to that length with "_PAD".

    Parameters
    ----------
    tokens : list of list
        Token sequences; each one is mutated in place.
    buckets : sequence of tuple
        Bucket sizes. Only element 0 of each bucket is used here. List them
        in ascending order so that the smallest fitting bucket is found first.

    Returns
    -------
    list of int
        ``padding[i]`` is the number of "_PAD" symbols added to ``tokens[i]``.
        A sequence longer than every bucket is left untouched and reported as
        0, so the result always lines up index-for-index with ``tokens``
        (previously such sequences were skipped, shifting every later entry).
    """
    padding = []
    for sentence in tokens:
        length = len(sentence)
        pad_count = 0  # stays 0 when no bucket is large enough
        for bucket in buckets:
            if length <= bucket[0]:
                pad_count = bucket[0] - length
                break
        pad(sentence, pad_count)
        padding.append(pad_count)
    return padding

def pad(sentence,padding_size):
    """Prepend ``padding_size`` "_PAD" symbols to ``sentence``, in place.

    A zero or negative ``padding_size`` leaves ``sentence`` unchanged.
    Returns None.
    """
    if padding_size > 0:
        # One slice assignment instead of repeated insert(0, ...), which
        # shifts the whole list on every call.
        sentence[:0] = ["_PAD"] * padding_size

padding = make_buckets(tokens,buckets)

In [31]:
tokens[0:20]

[[u'Live',
  u'horses',
  u'asses',
  u'mules',
  u'hinnies',
  u':',
  u'pure-bred',
  u'breeding',
  u'animals',
  '_EOS'],
 ['_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  u'Live',
  u'horses',
  u'asses',
  u'mules',
  u'hinnies',
  u'other',
  u'than',
  u'pure-bred',
  u'breeding',
  u'animals',
  '_EOS'],
 ['_PAD',
  '_PAD',
  u'Live',
  u'bovine',
  u'animals',
  u':',
  u'pure-bred',
  u'breeding',
  u'animals',
  '_EOS'],
 ['_PAD',
  u'Live',
  u'bovine',
  u'animals',
  u'other',
  u'than',
  u'pure-bred',
  u'breeding',
  u'animals',
  '_EOS'],
 ['_PAD',
  '_PAD',
  '_PAD',
  u'Live',
  u'swine',
  u':',
  u'pure-bred',
  u'breeding',
  u'animals',
  '_EOS'],
 ['_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  u'Live',
  u'swine',
  u'other',
  u'than',
  u'pure-bred',
  u'breeding',
  u'animals',
  u',',
  u'weighing',
  u'<',
  u'50kg',
  '_EOS'],
 ['_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  '_PAD',
  '

In [22]:
tf.nn.seq2seq.basic_rnn_seq2seq

<function tensorflow.python.ops.seq2seq.basic_rnn_seq2seq>

In [23]:
tf.nn.seq2seq.model_with_buckets

<function tensorflow.python.ops.seq2seq.model_with_buckets>

In [24]:
# Map every vocabulary entry to its position in the list; make_inputs later
# uses these mappings to turn tokens into integer ids.
word_id = {word: index for index, word in enumerate(vocab)}
code_id = {symbol: index for index, symbol in enumerate(dec_vocab)}

Prepare Input Vectors for Model
---------

In [25]:
def make_inputs(instances,vocabulary):
    """Convert rows of tokens into rows of integer ids.

    Parameters
    ----------
    instances : iterable of iterable
        Rows of tokens.
    vocabulary : dict
        Mapping from token to integer id.

    Returns
    -------
    list of list
        One list of ids per input row, in the same order.

    Raises
    ------
    KeyError
        If a token is missing from ``vocabulary`` and ``vocabulary`` has no
        "_UNK" entry to fall back on.

    Tokens that are not in ``vocabulary`` are mapped to the id of "_UNK" when
    the vocabulary defines it, instead of aborting the whole conversion.
    """
    unk_id = vocabulary.get("_UNK")
    inputs = []
    for row in instances:
        ids = []
        for token in row:
            if token in vocabulary:
                ids.append(vocabulary[token])
            elif unk_id is not None:
                ids.append(unk_id)
            else:
                # No fallback available: keep the original failure mode.
                raise KeyError(token)
        inputs.append(ids)
    return inputs

encoder_inputs = make_inputs(tokens,word_id)
decoder_inputs = make_inputs(characters,code_id)

In [26]:
cell = tf.nn.rnn_cell.BasicRNNCell(15)
#tf.nn.seq2seq.basic_rnn_seq2seq(encoder_inputs,decoder_inputs,cell)
#tf.nn.seq2seq.model_with_buckets(encoder_inputs,decoder_inputs, targets)

It appears that the targets advance by one position on every step: the target at step t is the decoder input at step t+1.
# Each target row is the matching decoder row with its first element dropped:
# target[t] == decoder[t + 1], so every row is one element shorter.
targets = [decoder[1:] for decoder in decoder_inputs]

Make batches by bucket size. We need to group the data by its bucket size.

In [27]:
# Pair every encoder row with its decoder row.
# NOTE(review): this rebinds `data`, which until now held the DataFrame
# returned by pd.read_csv; cells that use `data` as a DataFrame cannot be
# re-run after this one. Under Python 3, zip() also returns a one-shot
# iterator rather than a list.
data = zip(encoder_inputs,decoder_inputs)

In [28]:
len(buckets)

8

In [29]:
from six.moves import xrange


def batch(data,bucket_id,batch_size):
    """Draw ``batch_size`` items at random from ``data[bucket_id]``.

    Sampling is done with random.sample, i.e. without replacement, so a
    ValueError is raised when the bucket holds fewer than ``batch_size``
    items. Returns the sampled items as a new list.
    """
    return random.sample(data[bucket_id],batch_size)
    
    
def get_batch(encoder_inputs,decoder_inputs,buckets,bucket_id,batch_size,pad_id=0):
    """Re-index the first ``batch_size`` examples into per-step arrays.

    Parameters
    ----------
    encoder_inputs, decoder_inputs : sequence of sequence of int
        Id sequences, one per example. Only the first ``batch_size`` examples
        are read (no sampling happens here), and each must be at least as long
        as the corresponding bucket size.
    buckets : sequence of (int, int)
        (encoder_size, decoder_size) pairs.
    bucket_id : int
        Index into ``buckets`` selecting the sizes to use.
    batch_size : int
        Number of examples in the batch.
    pad_id : int, optional
        Id treated as padding when computing target weights (default 0).

    Returns
    -------
    (batch_encoder_inputs, batch_decoder_inputs, batch_weights)
        Three lists of 1-D numpy arrays of length ``batch_size``:
        ``encoder_size`` int32 arrays, ``decoder_size`` int32 arrays and
        ``decoder_size`` float32 weight arrays. Entry ``[step][row]`` belongs
        to example ``row`` at position ``step``.

    The weights are now built once per decoder step. Previously the weight
    code ran a single time after the loop, using the loop's final index, and
    therefore always produced one all-zero weight vector.
    """
    encoder_size, decoder_size = buckets[bucket_id]
    rows = range(batch_size)

    # One vector per encoder position, holding that position's id for every
    # example in the batch.
    batch_encoder_inputs = [
        np.array([encoder_inputs[row][step] for row in rows], dtype=np.int32)
        for step in range(encoder_size)]

    batch_decoder_inputs, batch_weights = [], []
    for step in range(decoder_size):
        batch_decoder_inputs.append(
            np.array([decoder_inputs[row][step] for row in rows], dtype=np.int32))

        # The target of a step is the decoder input one position ahead.
        # Weight 0 where that target is padding, and for the whole last step,
        # which has no target at all.
        batch_weight = np.ones(batch_size, dtype=np.float32)
        if step == decoder_size - 1:
            batch_weight[:] = 0.0
        else:
            for row in rows:
                if decoder_inputs[row][step + 1] == pad_id:
                    batch_weight[row] = 0.0
        batch_weights.append(batch_weight)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights


In [36]:
from itertools import groupby

def bucket_data(encoder_inputs,decoder_inputs):
    """Group (encoder, decoder) pairs by encoder length for batch sampling.

    Returns a list of groups ordered by ascending encoder length. Each group
    is a list of (encoder_input, decoder_input) tuples that share one encoder
    length, kept in their original relative order. Position i of the result
    therefore corresponds to the i-th distinct length present in the data.
    """
    by_length = {}
    for pair in zip(encoder_inputs,decoder_inputs):
        by_length.setdefault(len(pair[0]), []).append(pair)
    return [by_length[length] for length in sorted(by_length)]

In [None]:
decoder_inputs[0:11]

In [37]:
bucketed_data = bucket_data(encoder_inputs,decoder_inputs)
batch(bucketed_data,1,500)

[([4371, 1148, 7, 1115, 1148, 4, 19, 11, 350, 2], [1, 6, 11, 4, 11, 5, 4, 2]),
 ([0, 0, 6007, 8, 12, 6, 5, 6485, 9, 2], [1, 6, 7, 4, 6, 7, 4, 2]),
 ([0, 802, 233, 846, 4, 30, 6, 14, 3354, 2], [1, 10, 11, 8, 6, 6, 7, 2]),
 ([0, 0, 0, 2339, 1063, 807, 24, 141, 687, 2], [1, 10, 12, 4, 5, 6, 4, 2]),
 ([0, 0, 0, 0, 8204, 470, 5, 41, 29, 2], [1, 13, 8, 5, 11, 6, 4, 2]),
 ([0, 0, 0, 3743, 1608, 5, 121, 4, 1808, 2], [1, 7, 4, 4, 5, 9, 4, 2]),
 ([0, 0, 1046, 853, 5, 712, 4, 56, 94, 2], [1, 4, 6, 4, 9, 5, 4, 2]),
 ([0, 7523, 619, 8, 12, 6, 5, 2580, 9, 2], [1, 10, 9, 5, 8, 13, 4, 2]),
 ([0, 0, 0, 0, 1881, 8, 1426, 9, 213, 2], [1, 10, 5, 4, 5, 7, 12, 2]),
 ([0, 313, 95, 2480, 5, 415, 532, 4, 132, 2], [1, 9, 10, 6, 5, 5, 4, 2]),
 ([0, 6812, 5, 247, 54, 10, 15, 175, 54, 2], [1, 9, 4, 5, 12, 7, 6, 2]),
 ([0, 8004, 7, 2783, 4, 7, 3591, 2931, 47, 2], [1, 9, 7, 4, 5, 7, 5, 2]),
 ([0, 0, 0, 1049, 272, 7, 211, 4, 548, 2], [1, 6, 12, 5, 8, 5, 4, 2]),
 ([0, 0, 0, 5170, 7, 7974, 8, 4053, 9, 2], [1, 7, 4, 5, 