# Parts Distributor SKU classifier, part 3: Another take

In parts [1]() and [2]() we built and evaluated a recurrent neural network that can classify electronics part numbers. We will now train another model, with some modifications. Here's the plan:
1. Load the same training/validation data set
2. Relabel the spurious MPN samples that look like Digi-Key SKUs. The regex for that is "`.*-ND$`".
3. Introduce a "start of sequence" token to hopefully help the model learn the Mouser SKU prefixes, in the same way that we have the "end of sequence" token helps with the Digi-Key suffix.
4. Don't pad the sequences. This requires a bit of work because if we use the high-level `fit`, `evaluate` and `predict` Keras functions, they require all arrays for each variable (X and Y) to be of the same length. To feed variable length sequences, we'll need to use slightly more involved `*_generator` or `*_on_batch` functions.

In [1]:
import pandas as pd
import numpy as np
import json
from IPython.display import Markdown, display
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.utils import to_categorical
import utils
from IPython.display import SVG
import re
import math

Using TensorFlow backend.


In [2]:
# Load the same cleaned data set
df = pd.read_json("data/cleaned_training_data.json")
class_names = ['MPN', 'Mouser SKU', 'Digi-Key SKU']

looks_like_digikey = lambda pn: bool(re.match(r'.*\-ND$', pn))
df.loc[(df['partnum'].apply(looks_like_digikey)), 'class'] = 2

# rebuild the 'y' column to reflect the changes.
df['y'] = list(list(l) for l in to_categorical(df['class']))

# make sure we changed them all
df[(df['partnum'].apply(looks_like_digikey)) & (df['class'] != 2)]

Unnamed: 0,class,dataset,partnum,x,y


In [3]:
# Build the dictionary with 2 special tokens: "end of sequence" (int value 0) and "start of sequence" (int value 1).
unique_chars = set()
for s in df['partnum'].values:
    unique_chars |= set(c for c in s)
partnum_dict = {c: i+2 for i, c in enumerate(unique_chars)}

# Redefine the 'x' column.
df['x'] = list(df['partnum'].map(lambda s: [1] + list(partnum_dict[c] for c in s) + [0]))
df.head()[['partnum', 'x']]

Unnamed: 0,partnum,x
0,296-8311-6-ND,"[1, 13, 24, 23, 41, 14, 10, 48, 48, 41, 23, 41..."
1,MCP1702T-3302E/CBCT-ND,"[1, 8, 47, 9, 48, 20, 33, 13, 26, 41, 10, 10, ..."
10,595-TPS65986ABZQZR,"[1, 50, 24, 50, 41, 26, 9, 43, 23, 50, 24, 14,..."
100,A109694TR-ND,"[1, 11, 48, 33, 24, 23, 24, 4, 26, 17, 41, 21,..."
1000,LP5900SD-3.0/NOPBCT-ND,"[1, 7, 9, 50, 24, 33, 33, 43, 35, 41, 10, 31, ..."


To feed sequences of different lengths into the model, we have to group them by length, then split each group into batches of fixed size.

In [4]:
def same_seqlen_batches_x_y(df, dataset, batch_size):
    lens = sorted(df['seqlen'].unique())
    for ln in lens:
        rows = df[(df['seqlen'] == ln) & (df['dataset'] == dataset)]
        xs = np.array(rows['x'])
        ys = np.array(rows['y'])
        batches, is_last_incomplete = divmod(len(rows), batch_size)
        batches = batches + 1 if is_last_incomplete else batches
        for b in range(batches):
            is_last_batch = (b == batches-1)
            subd = lambda a: a[b*batch_size:(b+1)*batch_size] if not is_last_batch else a[b*batch_size:]
            yield (subd(xs), subd(ys))

def count_same_seqlen_batches(df, dataset, batch_size):
    lens = sorted(df['seqlen'].unique())
    c = 0
    for ln in lens:
        rows = df[(df['seqlen'] == ln) & (df['dataset'] == dataset)]
        c += math.ceil(len(rows) / batch_size)
    return c

df['seqlen'] = df['partnum'].apply(len)

In [12]:
# config
batch_size = 32

# build model
model = Sequential()
model.add(Embedding(len(partnum_dict)+2, 32))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

def gen(n):
    while True:
        print('GEN', n)
        yield([[1,2,3,4,5,0]],[[0,0,1]])

model.fit_generator(gen('train'),
          steps_per_epoch=1,
          epochs=1,
          validation_data=gen('val'),
          validation_steps=1)

GENEpoch 1/1
 train
GEN train
GEN train
GEN train
GEN train
GEN train
GEN train
GEN train
GEN train
GEN train


AttributeError: 'list' object has no attribute 'shape'

In [6]:
# config
batch_size = 32

# build model
model = Sequential()
model.add(Embedding(len(partnum_dict)+2, 32))
model.add(LSTM(32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit_generator(same_seqlen_batches_x_y(df, 'train', batch_size),
          steps_per_epoch=count_same_seqlen_batches(df, 'train', batch_size),
          epochs=7,
          validation_data=same_seqlen_batches_x_y(df, 'val', batch_size),
          validation_steps=count_same_seqlen_batches(df, 'val', batch_size))

Epoch 1/7


ValueError: Error when checking target: expected dense_1 to have shape (None, 3) but got array with shape (2, 1)

In [None]:
#score, acc = model.evaluate(d('x', 'val'), d('y', 'val'), batch_size=batch_size)
#display(Markdown('### Accuracy of the model: {:.2f}%'.format(acc * 100.0)))

Let's see what's going on with Mouser SKU/MPN confusion.

In [None]:
res = []
for c in sorted(df['class'].unique()):
    score, acc = model.evaluate(d('x', 'val', class_filter=c), d('y', 'val', class_filter=c), batch_size=batch_size)
    res.append([class_names[c], '{:.2f}%'.format(acc*100.0)])
pd.DataFrame(res, columns=['class', 'accuracy'])

In [None]:
r = utils.cross_category_bleeding(model, d('x', 'val'), d('y', 'val'), batch_size)
g = utils.graphviz_cross_category_diagram(r, class_names)
SVG(g.pipe(format='svg'))

In [None]:
def char_by_char_classification_plot(model, partnum, x, pfx_len=1):
    partnum = ' '*pfx_len + partnum
    x_steps = [np.array(x[:i]) for i in range(1, len(x))]
    step_preds = []
    for i, x_step in enumerate(x_steps):
        c = partnum[i] if i < len(partnum) else ''
        pred = model.predict(np.array(x_step, ndmin=2))
        step_preds.append([c] + [v for v in pred.flat])
    df_res = pd.DataFrame(step_preds, columns=['char'] + class_names)
    return df_res.plot.line(x='char', grid=True, fontsize=15, xticks=range(len(x)-1), yticks=[0.1*v for v in range(11)], figsize=(15, 5))

def by_partnum(pn):
    return df[df.partnum == pn].iloc[0]

char_by_char_classification_plot(model, 'DF14-15P-1.25H(55)', by_partnum('DF14-15P-1.25H(55)')['x'])

In [None]:
char_by_char_classification_plot(model, '8Y-16.000MAAV-T', by_partnum('8Y-16.000MAAV-T')['x'])

In [None]:
char_by_char_classification_plot(model, 'NPA-700B-015A', by_partnum('NPA-700B-015A')['x'])