In [1]:
# we create a function to oobfuscate the store names and introduce impurities for the sake of this excercise
# Scrapped or OCR results could be much more random and unpredictable.
import string
import random 
import numpy as np 
import pandas as pd

seen = {}
symbols = "!@#$%^&*()-+,<.>?/'' "
numbers = [0,1,2,3,4,5,6,7,8,9]


def gen_impurities(n,word):
    for i in range(n):
        index = random.randint(0,len(word)-1)
        #print(f"impute index - {index}")

        if index%2 == 0:
            symbol = random.choice(symbols)
            #print(f"Symbol = {symbol}")
            word = word[:index] + symbol + word[index:]

        if index%7 == 0:
            num = random.choice(numbers)
            #print(f"Number = {num}")
            word = word[:index] + str(num) + word[index:]

        if index%3 == 0:
            word.replace(word[index],"")
    return word

def obfuscate(word):
    n = int(len(word)*0.5)
#     print(n)
    iteration = random.randint(1,n)
#     print(iteration)
    label = word
    for i in range(5):
        imputed_word = gen_impurities(iteration,label)
#         print(imputed_word)
        if imputed_word not in seen:
            seen[imputed_word] = 1
            return imputed_word
        seen[imputed_word] = seen[imputed_word]+1
    return word
            

def gen_gibberish(min_l,max_l):
    # initializing size of string
    l = random.randint(min_l,max_l)
    # using random.choices()
    # generating random strings
    res = ''.join(random.choices(string.ascii_uppercase +
                                 string.punctuation +
                                 string.digits, k=l))
    while res not in seen:    
        seen[res] = 1
        return res
    seen[res] = seen[res]+1
    

In [3]:
# Lets take top 100 US retailers arbitarily chosen based on their annual reported Sales

def read_data():
    df = pd.read_csv("stores.csv")
    del df['empty']
    df = df.set_index('no')
    df.head()
    return df


In [5]:
# this function will muddle up the stores to simulate real world data capture where system might introduce impurites
# the function will generate n number of instances for jumbled data

# n indicates number of obfuscated records
# instance_count is the number of observation per class

def get_obfuscated_stores(stores,n):
    obfuscate_stores = stores * n
    obfuscate_stores.sort()
    df_obfuscated = pd.DataFrame({"stores": obfuscate_stores})
    df_obfuscated['bad_names'] = df_obfuscated["stores"].apply(lambda x: obfuscate(x) )
    return df_obfuscated
    
def get_catch_gibberish(instance_count,min_l,max_l):
    others = ['Other'] * instance_count
    df_other = pd.DataFrame({'stores':others})
    df_other['bad_names'] = df_other['stores'].apply(lambda x : gen_gibberish(min_l,max_l)) 
    return df_other

# the training set will also have good captures where store name was interpreted correctly
# for this purpose the we are creagin 10% bad captures

def get_good_names(stores,n):
    good_captures = stores*  n
    df_good_captures = pd.DataFrame({'stores':good_captures})
    df_good_captures['bad_names'] = df_good_captures["stores"]
    return df_good_captures

def get_data(impute_n,instance,min_l,max_l): 
    n = impute_n
    instance_count = instance
    min_l = min_l
    max_l = max_l

    df_obfuscated = get_obfuscated_stores(stores,n)
    df_other = get_catch_gibberish(instance_count,min_l,max_l)
    df_good_captures = repeat_good_names(stores,n,instance_count)
    df = pd.concat([df_obfuscated,df_good_captures,df_other])
    return df


In [6]:
def get_train_test_data(split_ratio,impute_ratio,instance,min_l,max_l):
    test_size = (instance*split_ratio)
    train_size = (instance*(1-split_ratio))
    print(f"test_size = {test_size} & train_size = {train_size} & split = {split_ratio}")
    test_impute_n = int(test_size*impute_ratio)
    test_good_n = int(test_size - test_impute_n)
    print(f"test impute = {test_impute_n} & instances = {test_good_n}")
    df = read_data()
    stores = list(df["store"])[0:10]
    # get imputed
    df_obfuscated = get_obfuscated_stores(stores,test_impute_n)
    # get good names
    df_good_names = get_good_names(stores, test_good_n)
    # get others 
    df_other = get_catch_gibberish(test_impute_n+test_good_n,min_l,max_l)
    df_test = pd.concat([df_obfuscated,df_good_names,df_other])
    
    del df_obfuscated,df_good_names,df_other
    train_impute_n = int(train_size*impute_ratio)
    train_good_n = int(train_size - train_impute_n)
    # get imputed
    df_obfuscated = get_obfuscated_stores(stores,train_impute_n)
    # get good names
    df_good_names = get_good_names(stores, train_good_n)
    # get others 
    df_other = get_catch_gibberish(train_impute_n+train_good_n,min_l,max_l)
    
    df_train = pd.concat([df_obfuscated,df_good_names,df_other])
    
    del df_obfuscated,df_good_names,df_other
    return df_train, df_test
    
    

In [None]:
# seen

In [8]:
df_train, df_test = get_train_test_data(0.2,0.5,10000,5,20)

test_size = 2000.0 & train_size = 8000.0 & split = 0.2
test impute = 1000 & instances = 1000


In [9]:
print(f"len test stores = {len(set(df_test['stores']))}")
print(f"len train stores = {len(set(df_train['stores']))}")


len test stores = 11
len train stores = 11


In [10]:
df_train.shape

(88000, 2)

In [11]:
df_test.shape

(22000, 2)

In [12]:
df_train[df_train['stores'] == df_train['bad_names']].groupby("stores").count()


Unnamed: 0_level_0,bad_names
stores,Unnamed: 1_level_1
Albertsons Companies,4401
Amazon.com,4966
CVS Health Corporation,4378
Costco Wholesale,4561
Lowe's Companies,4556
Target,5882
The Home Depot,4651
The Kroger Co.,4641
Walgreens Boots Alliance,4349
Walmart,5703


In [13]:
df_test[df_test['stores'] == df_test['bad_names']].groupby("stores").count()


Unnamed: 0_level_0,bad_names
stores,Unnamed: 1_level_1
Albertsons Companies,1083
Amazon.com,1207
CVS Health Corporation,1082
Costco Wholesale,1146
Lowe's Companies,1144
Target,1425
The Home Depot,1189
The Kroger Co.,1166
Walgreens Boots Alliance,1069
Walmart,1393


In [14]:
df_test[df_test["stores"] == 'Other'].count()

stores       2000
bad_names    2000
dtype: int64

In [15]:
df_train[df_train["stores"] == 'Other'].count()

stores       8000
bad_names    8000
dtype: int64

In [16]:
df_train.head(10)

Unnamed: 0,stores,bad_names
0,Albertsons Companies,"Albe%rts1ons C7,2'om1 pan8-ie!s"
1,Albertsons Companies,2 Albertsons C4'ompanie3s
2,Albertsons Companies,"Albertsons Com4?pani,es"
3,Albertsons Companies,5?*Albe-rts(ons Companies
4,Albertsons Companies,Albertsons$ Com3)$pa<nies
5,Albertsons Companies,6$Albertsons Com4>panies
6,Albertsons Companies,Albertsons#!. Companies
7,Albertsons Companies,Al&bert.&sons Companies
8,Albertsons Companies,Al%bert(sons Compani-es
9,Albertsons Companies,Alberts9ons C&ompa%?nies


In [17]:
df_test.head(10)

Unnamed: 0,stores,bad_names
0,Albertsons Companies,7!Albertsons Compani+e's
1,Albertsons Companies,3(Albertsons Companies
2,Albertsons Companies,Al<bertsons Compa nies
3,Albertsons Companies,Albe<rtson<s Compan'i$es
4,Albertsons Companies,Al%bert-sons Comp?anies
5,Albertsons Companies,Al^bertsons Companie/s
6,Albertsons Companies,"Albertso ns Comp!ani+)@,es"
7,Albertsons Companies,Al&be<rtsons Compani!es
8,Albertsons Companies,Albe)rtsons Comp#a>%n ies
9,Albertsons Companies,Albe)rt1>sons Compan%ies


In [18]:
#from sklearn.model_selection import train_test_split
def get_train_test_split(df_train,df_test,one_hot_encode_labels=False):
    X_train = df_train["bad_names"].values
    y_train = df_train["stores"].values
    X_test  = df_test["bad_names"].values
    y_test  = df_test["stores"].values
    
    if one_hot_encode_labels:
        df_labels = pd.concat([df_train['stores'],df_test['stores']])
        print(f"Labels = {df_labels.shape})")
        lables = pd.get_dummies(df_labels)
        lookup = list(lables.columns)
        print(len(lookup))
        del df_labels
        y_test_labels = pd.get_dummies(y_test)
        y_test_encoded = y_test_labels.astype('float32').values
        y_train_labels = pd.get_dummies(y_train)
        y_train_encoded = y_train_labels.astype('float32').values
        return X_train,y_train_encoded,X_test,y_test_encoded, lookup
    else:
        return X_train, y_train, X_test, y_test, None
    

In [19]:

X_train, y_train, X_test, y_test, lable_lookup = get_train_test_split(df_train,
                                                                      df_test,
                                                                      one_hot_encode_labels=False)


In [20]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(88000,)
(88000,)
(22000,)
(22000,)


In [21]:
print(len(set(y_train)))
print(len(set(X_train)))

11
39922


In [25]:
labels = list(set(y_train))
labels

['Walmart',
 'CVS Health Corporation',
 'The Home Depot',
 'Walgreens Boots Alliance',
 "Lowe's Companies",
 'Amazon.com',
 'Costco Wholesale',
 'Target',
 'Other',
 'The Kroger Co.',
 'Albertsons Companies']

In [24]:
def labels_to_index(label,labels=labels):
    return labels.index(label)

In [27]:
def index_to_labels(idx,labels=labels):
    return labels[idx]

In [None]:
def get_lables(pred,lookup):
    return lookup[np.argmax(pred)]    

In [None]:
# one_hot_encode_labels = False
# if one_hot_encode_labels:
#     for i in range(5):
#         print(f" TRAIN : {X_train[i]} >  {get_lables(np.array(y_train[i]),lable_lookup)}")
#     for i in range(5):
#         print(f" TEST : {X_test[i]} >  {get_lables(np.array(y_test[i]),lable_lookup)}")
# else:
#     for i in range(5):
#         print(f" TRAIN : {X_train[i]} >  {y_train[i]}")
#     for i in range(5):
#         print(f" TEST : {X_test[i]} >  {y_test[i]}")


In [28]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses

In [None]:
# Vocab = ['UNK']
# Vocab = Vocab+ list(set("".join(X_train)))
# print(len(Vocab))
# #Vocab.append("UNK")
# char_to_ind = {u:i for i, u in enumerate(Vocab)}
# vocab_len = len(Vocab)

In [31]:
y_train_tx = [labels_to_index(y) for y in y_train]
y_test_tx = [labels_to_index(y) for y in y_test]

In [32]:
y_train_tx

[10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,
 10,


In [None]:
def custom_standardization(input_data):

In [None]:
ind_to_char = np.array(Vocab)

In [None]:
X_train_tx = []
for x in X_train:
    X_train_tx.append([char_to_ind[c] for c in x])

In [None]:
X_test_tx = []
for x in X_test:
    X_test_tx.append([char_to_ind[c] for c in x])

In [None]:
y_train_tx = []
for y in y_train:
    y_train_tx.append([char_to_ind[c] for c in y])

In [None]:
y_test_tx = []
for y in y_test:
    y_test_tx.append([char_to_ind[c] for c in y])

In [None]:
 def get_word_from_ind(arr):
    return "".join([ind_to_char[i] for i in arr]).replace("UNK","")

In [None]:
print(get_word_from_ind(X_train_tx[0]))
print(get_word_from_ind(X_test_tx[0]))
print(get_word_from_ind(y_train_tx[0]))
print(get_word_from_ind(y_train_tx[0]))

In [None]:
x_max_len= 0
for i in X_train:
    if len(i)>x_max_len:
        x_max_len= len(i)
print(x_max_len)
x_max_input_length = x_max_len

x_min_len = np.inf
for i in X_train:
    if len(i)<x_min_len:
        x_min_len = len(i)
print(x_min_len)
min_input_length = x_min_len

y_min_len = np.inf
for i in y_train:
    if len(i)<y_min_len:
        y_min_len = len(i)
print(y_min_len)

y_max_len = 0
for i in y_train:
    if len(i)>y_max_len:
        y_max_len = len(i)
print(y_max_len)


In [None]:
input_length =24
def gen_encoded_strings(word,input_len):
    word_len = len(word)
    if word_len > input_len:
        word = word[0:input_len]
        return [char_to_ind[c] for c in word]
    else:
        encoded = [char_to_ind[c] for c in word]
        pad_width = input_len-word_len
        arr = np.array(encoded)
        return np.pad(arr,(pad_width,0),"constant")
        
    

In [None]:
# testing the encoded and decoded.

s = 'Target'

s_encoded = gen_encoded_strings(s,input_length)

print(f" encoded - {s_encoded} - {len(s_encoded)}")

print(f" decoded - {get_word_from_ind(s_encoded)}")


In [None]:
# labels = list(set(y_train))

In [None]:
# def categorical_coding(labels):
#     idx = labels.index(label)
#     return [idx]

# def categorical_label(idx):
#     return labels[idx[0]]
    

In [None]:
print(y_train[0])
#print(categorical_coding(y_train[0]))
#print(categorical_label(categorical_coding(y_train[0])))

In [None]:
# max_features = 100
# vectorize_layer = layers.TextVectorization(
#     standardize="lower",
#     split='character',
#     output_sequence_length=max_features,
#     max_tokens=max_features,
#     pad_to_max_tokens=True,
#     output_mode='int')


In [None]:
# vectorize_layer.adapt(X_train)
# len(vectorize_layer.get_vocabulary())

In [None]:
# encode both label and test
# def vectorize_text_code_lable(text, label):
#     vector = gen_encoded_strings(text,input_length)
#     idx = categorical_coding(label)
#     return vector, idx

In [None]:
# vectorize_layer.adapt(
#          X_train
#         )


In [None]:
# vector_categorical_coding = np.vectorize(categorical_coding)
# vector_gen_encoded_strings = np.vectorize(gen_encoded_strings)

In [None]:
X_train_encoded = [gen_encoded_strings(x,input_length) for x in X_train]
print(f" type - {type(X_train_encoded)}")
X_test_encoded = [gen_encoded_strings(x,input_length) for x in X_test]
print(f" type - {type(X_test_encoded)}")


In [None]:
y_train_encoded = [gen_encoded_strings(y,input_length) for y in y_train]
print(f" type - {type(y_train_encoded)}")
y_test_encoded = [gen_encoded_strings(y,input_length) for y in y_test]
print(f" type - {type(y_test_encoded)}")


In [None]:
# y_train_encoded = [categorical_coding(y) for y in y_train]
# type(f"type = {type(y_train_encoded)}")
# y_test_encoded = [categorical_coding(y) for y in y_test]
# type(f"type = {type(y_test_encoded)}")

In [None]:
X_train_encoded[0]
X_test_encoded[0]

In [None]:
for x in X_train_encoded:
    if len(x) != 24:
        print(len(x))
        
for y in y_train_encoded:
    if len(y) != 24:
        print(len(y))

In [None]:
print(type(y_train_encoded[0]))
print(type(y_test_encoded[0]))


In [None]:
train_data = tf.data.Dataset.from_tensor_slices((X_train_encoded, y_train_encoded))
val_data = tf.data.Dataset.from_tensor_slices((X_test_encoded, y_test_encoded))


In [None]:
train_data

In [None]:
# #lable_lookup
# def map_labels(x,y):
#     print(f"y - {type(y)} - {y.shape}")
#     print(y[0])
#     l = get_lables(y[0],lable_lookup)
#     encoded = gen_encoded_strings(l,input_length)
#     (encoded,y)
#     return x,(encoded,y)
    

In [None]:

# for example, label in train_data.take(1):
#     print('text: ', example.numpy())
#     print('label: ', label.numpy())
#     x,y = map_labels(example.numpy(),label)
#     #print('encoded label: ', y.shape)
    

In [None]:
# vectorize_text_code_lable('Target','Target')

In [None]:
BUFFER_SIZE = 14000
BATCH_SIZE = 512

train_dataset = train_data.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.AUTOTUNE)
val_dataset = val_data.batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.AUTOTUNE)



In [None]:
train_dataset

In [None]:
for example, label in train_dataset.take(1):
    print(f"{example.shape} - {label.shape}")
    #print('text: ', example.numpy()[:3])
    #print('label_one_hot: ', label.numpy()[:3])
    #for v in label.numpy()[:3]:
        #print('label ', decode_pred(v,lookup))

In [None]:
# vocab = np.array(vectorize_layer.get_vocabulary())
# vocab_len = len(vocab)
# print(vocab_len)

In [None]:
for x,y in train_dataset.take(1):
    print(x.shape)
    print(y.shape)


In [None]:
# example="Target"
# encoded_example = vectorize_layer(example).numpy()
# encoded_example.shape
len(Vocab)

In [None]:
embedding_dim = 20
batch_size = 512
time_steps = 512

def get_model(embedding_dim,batch_size,time_steps):
    model = tf.keras.Sequential()
    embed_layer = layers.Embedding(
                       input_dim=len(Vocab), # e.g, 10 if you have 10 words in your vocabulary
                       output_dim=embedding_dim, # size of the embedded vectors
                       batch_input_shape = [batch_size,None]
                        )
    bidirect_lstm = layers.Bidirectional(layers.LSTM(
                                        units=time_steps,
                                        return_sequences=True,
                                        stateful=True,
                                        activation='tanh',
                                        recurrent_activation="tanh", 
                                        recurrent_initializer='glorot_uniform'))
    pooling1 = layers.GlobalAveragePooling1D()
    dense1 = layers.Dense(96, activation = 'relu', kernel_regularizer='l2')
    drop1 = layers.Dropout(0.2)
    dense2 = layers.Dense(128, activation = 'tanh', kernel_regularizer='l2')
    drop2 = layers.Dropout(0.2)
    dense3 = layers.Dense(256, activation = 'tanh')
    drop3 = layers.Dropout(0.2)
    dense4 = layers.Dense(128, activation = 'sigmoid', kernel_regularizer='l2')
    drop4 = layers.Dropout(0.2)
    predictor = layers.Dense(24)
    for l in [embed_layer,
              bidirect_lstm,
              dense1,
              drop1,
              dense2,
              drop2,
              pooling1,
              predictor]:
        model.add(l)
    model.summary()
    return model

In [None]:
model = get_model(embedding_dim,batch_size,time_steps)

In [None]:
print([layer.supports_masking for layer in model.layers])

In [None]:
# print(len(vocab))
# sample_text = vectorize_layer('WALMART 1')
# #print(sample_text.shape)
# #print(type(sample_text))
# #print(np.array(sample_text))
# sample = np.array(sample_text).reshape(1,len(vocab))
# print(sample)

In [None]:
from keras.losses import categorical_crossentropy

#import tf.keras.optimizers.experimental.Adadelta
# opt = SGD(learning_rate=0.01)
# opt = Optimizer()
def cat_loss(y_true,y_pred):
    pred = tf.reshape(y_pred[0][0],(1,101))
    return categorical_crossentropy(y_true, y_pred, from_logits=True)

loss_fn = 'categorical_crossentropy'
#loss_fn = cat_loss

model.compile(loss=loss_fn,
              optimizer= 'adam',
             )

In [None]:
#from tensorflow.keras.callbacks import EarlyStopping
#early_stopping = EarlyStopping(monitor='val_loss',patience=5)
# from tensorflow.python.compiler.mlcompute import mlcompute
# tf.compat.v1.disable_eager_execution()
# mlcompute.set_mlc_device(device_name='gpu')
# print("is_apple_mlc_enabled %s" % mlcompute.is_apple_mlc_enabled())
# print("is_tf_compiled_with_apple_mlc %s" % mlcompute.is_tf_compiled_with_apple_mlc())
# print(f"eagerly? {tf.executing_eagerly()}")
print(tf.config.list_logical_devices())

In [None]:

history = model.fit(train_dataset, 
                epochs=5)

In [None]:
pred_model = get_model(embedding_dim,1,time_steps)

In [None]:
pred_model.set_weights(model.get_weights())

In [None]:
pred_model.summary()

In [None]:
predicted = pred_model.predict(np.array(x[0]).reshape(1,24))

In [None]:
 predictions 

In [None]:
predictions = predictions/1.0

In [None]:
predicted_id = tf.random.categorical(predictions, num_samples =24) 


In [None]:
labels = [decode_pred(i,lookup) for i in y_train]

df_train = pd.DataFrame({'stores':labels, 'bad_names':X_train})


In [None]:
df_train.groupby('stores').count()

In [None]:
store_name = "WALMART"

v_store_name = vectorize_text(store_name,'unknown')
#print(v_store_name[0].shape)
print(np.array(v_store_name[0]).reshape(1,3000))
#data = tf.data.Dataset.from_tensor_slices(v_store_name)


In [None]:
decode_pred(np.argmax(model.predict(np.array(v_store_name[0]).reshape(1,3000))))