In [1]:
# we create a function to oobfuscate the store names and introduce impurities for the sake of this excercise
# Scrapped or OCR results could be much more random and unpredictable.
import string
import random 
import numpy as np 
import pandas as pd

# Using seen dictionary to generate all unique values
# Main goal here is to ensure there is no cross category contamination
# 1 input has single label, that model does not get confused
seen = {}

def gen_impurities(n,word):
    for i in range(n):
        index = random.randint(0,len(word)-1)
        #print(f"impute index - {index}")

        if index%2 == 0:
            symbol = random.choice(string.punctuation)
            #print(f"Symbol = {symbol}")
            word = word[:index] + symbol + word[index:]

        if index%7 == 0:
            num = random.choice(string.digits)
            #print(f"Number = {num}")
            word = word[:index] + str(num) + word[index:]

        if index%3 == 0:
            word.replace(word[index],"")
    return word

def obfuscate(word):
    n = int(len(word)*0.5)
#     print(n)
    iteration = random.randint(1,n)
#     print(iteration)
    label = word
    for i in range(5):
        imputed_word = gen_impurities(iteration,label)
#         print(imputed_word)
        if imputed_word not in seen:
            seen[imputed_word] = 1
            return imputed_word
        seen[imputed_word] = seen[imputed_word]+1
    return word
            

def gen_gibberish(min_l,max_l):
    # initializing size of string
    l = random.randint(min_l,max_l)
    # using random.choices()
    # generating random strings
    res = ''.join(random.choices(string.ascii_uppercase +
                                 string.punctuation +
                                 string.digits, k=l))
    while res not in seen:    
        seen[res] = 1
        return res
    seen[res] = seen[res]+1
    

In [2]:
# Lets take top 100 US retailers arbitarily chosen based on their annual reported Sales

def read_data():
    df = pd.read_csv("stores.csv")
    del df['empty']
    df = df.set_index('no')
    df.head()
    return df


In [3]:
# this function will muddle up the stores to simulate real world data capture where system might introduce impurites
# the function will generate n number of instances for muddled data

# n indicates number of obfuscated records
# instance_count is the number of observation per class

def get_obfuscated_stores(stores,n):
    obfuscate_stores = stores * n
    obfuscate_stores.sort()
    df_obfuscated = pd.DataFrame({"stores": obfuscate_stores})
    df_obfuscated['bad_names'] = df_obfuscated["stores"].apply(lambda x: obfuscate(x) )
    return df_obfuscated
    
def get_catch_gibberish(instance_count,min_l,max_l):
    others = ['Other'] * instance_count
    df_other = pd.DataFrame({'stores':others})
    df_other['bad_names'] = df_other['stores'].apply(lambda x : gen_gibberish(min_l,max_l)) 
    return df_other

# the training set will also have good captures where store name was interpreted correctly
# for this purpose the we are creagin 10% bad captures

def get_good_names(stores,n):
    good_captures = stores*  n
    df_good_captures = pd.DataFrame({'stores':good_captures})
    df_good_captures['bad_names'] = df_good_captures["stores"]
    return df_good_captures

def get_data(impute_n,instance,min_l,max_l): 
    n = impute_n
    instance_count = instance
    min_l = min_l
    max_l = max_l

    df_obfuscated = get_obfuscated_stores(stores,n)
    df_other = get_catch_gibberish(instance_count,min_l,max_l)
    df_good_captures = repeat_good_names(stores,n,instance_count)
    df = pd.concat([df_obfuscated,df_good_captures,df_other])
    return df


In [4]:
# this function will muddle up the stores to simulate real world data capture where system might introduce impurites
# the function will generate n number of instances for jumbled data

# n indicates number of obfuscated records
# instance_count is the number of observation per class

def get_obfuscated_stores(stores,n):
    """
        This functions takes the stores list
        Repeats it n times
        calls teh obfuscate function to generate dirty names
    """
    obfuscate_stores = stores * n
    obfuscate_stores.sort()
    df_obfuscated = pd.DataFrame({"stores": obfuscate_stores})
    df_obfuscated['bad_names'] = df_obfuscated["stores"].apply(lambda x: obfuscate(x) )
    return df_obfuscated
    
def get_catch_gibberish(instance_count,min_l,max_l):
    others = ['Other'] * instance_count
    df_other = pd.DataFrame({'stores':others})
    df_other['bad_names'] = df_other['stores'].apply(lambda x : gen_gibberish(min_l,max_l)) 
    return df_other

# the training set will also have good captures where store name was interpreted correctly
# for this purpose the we are creagin 10% bad captures

def get_good_names(stores,n):
    good_captures = stores*  n
    df_good_captures = pd.DataFrame({'stores':good_captures})
    df_good_captures['bad_names'] = df_good_captures["stores"]
    return df_good_captures

def get_data(impute_n,instance,min_l,max_l): 
    n = impute_n
    instance_count = instance
    min_l = min_l
    max_l = max_l

    df_obfuscated = get_obfuscated_stores(stores,n)
    df_other = get_catch_gibberish(instance_count,min_l,max_l)
    df_good_captures = repeat_good_names(stores,n,instance_count)
    df = pd.concat([df_obfuscated,df_good_captures,df_other])
    return df


In [5]:
def get_train_test_data(split_ratio,impute_ratio,instance,min_l,max_l):
    test_size = (instance*split_ratio)
    train_size = (instance*(1-split_ratio))
    print(f"test_size = {test_size} & train_size = {train_size} & split = {split_ratio}")
    test_impute_n = int(test_size*impute_ratio)
    test_good_n = int(test_size - test_impute_n)
    print(f"test impute = {test_impute_n} & instances = {test_good_n}")
    df = read_data()
    stores = list(df["store"])
    # get imputed
    df_obfuscated = get_obfuscated_stores(stores,test_impute_n)
    # get good names
    df_good_names = get_good_names(stores, test_good_n)
    # get others 
    df_other = get_catch_gibberish(test_impute_n+test_good_n,min_l,max_l)
    df_test = pd.concat([df_obfuscated,df_good_names,df_other])
    
    del df_obfuscated,df_good_names,df_other
    train_impute_n = int(train_size*impute_ratio)
    train_good_n = int(train_size - train_impute_n)
    # get imputed
    df_obfuscated = get_obfuscated_stores(stores,train_impute_n)
    # get good names
    df_good_names = get_good_names(stores, train_good_n)
    # get others 
    df_other = get_catch_gibberish(train_impute_n+train_good_n,min_l,max_l)
    
    df_train = pd.concat([df_obfuscated,df_good_names,df_other])
    
    del df_obfuscated,df_good_names,df_other
    return df_train, df_test
    
    

In [6]:
df_train, df_test = get_train_test_data(0.2,0.5,10000,5,20)

test_size = 2000.0 & train_size = 8000.0 & split = 0.2
test impute = 1000 & instances = 1000


In [147]:
t = df_train.groupby('bad_names').count()

In [152]:
t.reset_index(inplace=True)
t = t.rename(columns = {'index':'bad_names'})


In [155]:
t[t['stores'] >1]

Unnamed: 0,bad_names,stores
86087,7-Eleven,4943
117765,AT&T Wireless,4643
120951,AVB Brandsource,4424
123879,Academy Sports,4505
125529,Ace Hardware,4638
...,...,...
323716,Wayfair,5346
326851,Wegmans Food Market,4311
328183,Weis Markets,4600
331787,Williams-Sonoma,4389


In [7]:
print(f"len test stores = {len(set(df_test['stores']))}")
print(f"len train stores = {len(set(df_train['stores']))}")

len test stores = 101
len train stores = 101


In [8]:
#from sklearn.model_selection import train_test_split
def get_train_test_split(df_train,df_test,one_hot_encode_labels=False):
    X_train = df_train["bad_names"].values
    y_train = df_train["stores"].values
    X_test  = df_test["bad_names"].values
    y_test  = df_test["stores"].values
    
    if one_hot_encode_labels:
        df_labels = pd.concat([df_train['stores'],df_test['stores']])
        print(f"Labels = {df_labels.shape})")
        lables = pd.get_dummies(df_labels)
        lookup = list(lables.columns)
        print(len(lookup))
        del df_labels
        y_test_labels = pd.get_dummies(y_test)
        y_test_encoded = y_test_labels.astype('float32').values
        y_train_labels = pd.get_dummies(y_train)
        y_train_encoded = y_train_labels.astype('float32').values
        return X_train,y_train_encoded,X_test,y_test_encoded, lookup
    else:
        return X_train, y_train, X_test, y_test, None
    

In [9]:

X_train, y_train, X_test, y_test, lable_lookup = get_train_test_split(df_train,
                                                                      df_test,
                                                                      one_hot_encode_labels=False)


In [10]:
print(f"Shape of X train - {X_train.shape}")
print(f"Shape of y train - {y_train.shape}")
print(f"shape of X test - {X_test.shape}")
print(f"shape of y test {y_test.shape}")
print(f"Test labels y train - {len(set(y_train))}")
print(f"Test labels X train - {len(set(X_train))}")
print(f"Test labels y test - {len(set(y_test))}")
print(f"Test labels X test - {len(set(X_test))}")

Shape of X train - (808000,)
Shape of y train - (808000,)
shape of X test - (202000,)
shape of y test (202000,)
Test labels y train - 101
Test labels X train - 335118
Test labels y test - 101
Test labels X test - 94379


In [11]:
LABELS = list(set(y_train))
print(LABELS)

def labels_to_index(label,labels=LABELS):
    return labels.index(label)

def index_to_labels(idx,labels=LABELS):
    return labels[idx]

def index_to_labels_arr(arr,labels=LABELS):
    return labels[np.argmax(arr)]

['PetSmart', 'Academy Sports', 'Sprouts Farmers Market', 'Bed Bath & Beyond', 'Walmart', 'Michaels Stores', 'J.C. Penney Company', 'Signet Jewelers', 'The Kroger Co.', 'CVS Health Corporation', 'Urban Outfitters', 'Harbor Freight Tools', 'My Demoulas', 'Advance Auto', 'Hy Vee', 'Qurate Retail', 'TJX Companies', 'Hobby Lobby Stores', 'Weis Markets', 'Bath & Body Works', 'Defense Commissary Agency', 'RH', "BJ's Wholesale Club", 'Bass Pro', 'Giant Eagle', 'Albertsons Companies', 'Golub', 'Williams-Sonoma', "Dillard's", 'Aldi', 'Wakefern / ShopRite', "Dick's Sporting Goods", 'AutoZone', "Hudson's Bay", 'Office Depot', 'Ikea North America Services', 'Barnes & Noble', 'Wayfair', 'Target', 'Good Neighbor Pharmacy', 'WinCo Foods', 'Publix Super Markets', 'Tapestry', 'Exxon Mobil Corporation', 'Meijer', 'Menards', 'AT&T Wireless', 'Wegmans Food Market', 'Ingles', 'Alimentation Couche-Tard', 'Petco', 'Ross Stores', 'Burlington', 'Other', "Lowe's Companies", 'American Eagle Outfitters', 'Total Wi

In [12]:
## Create indexed classes
y_train_tx = [labels_to_index(y) for y in y_train]
y_test_tx = [labels_to_index(y) for y in y_test]

In [13]:
print(f" y train indexed class - {y_train_tx[0]}")
print(f" y train label {y_train[0]}")
print(f" y test indexed class - {y_test[0]}")
print(f" y test indexed class - {y_test_tx[0]}")

 y train indexed class - 64
 y train label 7-Eleven
 y test indexed class - 7-Eleven
 y test indexed class - 64


In [14]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses

In [16]:
import re
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [17]:
max_features = 100
sequence_length = 50

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    split = 'character',
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [18]:
train_text = raw_train.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

2023-05-25 08:36:43.167322: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [19]:
vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 'e',
 ' ',
 'a',
 'o',
 's',
 'r',
 't',
 'l',
 'i',
 'n',
 'c',
 'm',
 'h',
 'd',
 'b',
 'p',
 'g',
 'u',
 'y',
 'w',
 'f',
 'v',
 'k',
 'j',
 'x',
 '7',
 'z',
 '3',
 '8',
 '2',
 '0',
 '1',
 '6',
 '9',
 '4',
 '5',
 'q',
 '’']

In [117]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text)[0], label

In [21]:
print("3 ---> ",vectorize_layer.get_vocabulary()[3])
print(" 30 ---> ",vectorize_layer.get_vocabulary()[30])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))


3 --->   
 30 --->  8
Vocabulary size: 40


In [34]:
size = len(y_train_tx)

In [89]:
train_idx = int(size*0.8)
val_idx = int(size*0.2)
print(f"train size - {train_idx} val size - {val_idx}")

X_traning_ds = X_train[0:train_idx]
y_train_ds = y_train_tx[0:train_idx]
X_val_ds = X_train[train_idx:train_idx+val_idx]
y_val_ds = y_train_tx[train_idx:train_idx+val_idx]

train size - 646400 val size - 161600


In [92]:
print(f"Training - {X_traning_ds.shape} Val - {X_val_ds.shape}")
print(f"Training - {len(y_train_ds)} Val - {len(y_val_ds)}")

Training - (646400,) Val - (161600,)
Training - 646400 Val - 161600


In [93]:

raw_train = tf.data.Dataset.from_tensor_slices((X_traning_ds, y_train_ds))
raw_val = tf.data.Dataset.from_tensor_slices((X_val_ds, y_val_ds))
raw_test = tf.data.Dataset.from_tensor_slices((X_test, y_test_tx))


In [125]:
train_ds = raw_train.map(vectorize_text)
val_ds = raw_val.map(vectorize_text)
test_ds = raw_test.map(vectorize_text)

In [109]:
text_batch, label_batch = next(iter(train_ds))
print(text_batch)
print(label_batch)

tf.Tensor(
[[27  2  9  2 23  2 30 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0]], shape=(1, 50), dtype=int64)
tf.Tensor(64, shape=(), dtype=int32)


In [132]:
BUFFER_SIZE = 1000
BATCH_SIZE = 128


fin_train_ds = train_ds.shuffle(buffer_size=BUFFER_SIZE).batch(batch_size=BATCH_SIZE,drop_remainder=True)
fin_val_ds = val_ds.batch(batch_size=BATCH_SIZE,drop_remainder=True)
fin_test_ds = test_ds.batch(batch_size=BATCH_SIZE,drop_remainder=True)

In [133]:
text_batch, label_batch = next(iter(fin_train_ds))
print(text_batch)
print(label_batch)
    

tf.Tensor(
[[32 27  2 ...  0  0  0]
 [27  2  9 ...  0  0  0]
 [32 27  2 ...  0  0  0]
 ...
 [30 27  2 ...  0  0  0]
 [27  2  9 ...  0  0  0]
 [27  2  9 ...  0  0  0]], shape=(128, 50), dtype=int64)
tf.Tensor(
[64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64
 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64
 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64
 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64
 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64 64
 64 64 64 64 64 64 64 64], shape=(128,), dtype=int32)


In [138]:
embedding_dim = 16


def get_model(embedding_dim,max_features):
    model = tf.keras.Sequential([
          layers.Embedding(max_features + 1, embedding_dim),
          layers.Bidirectional(layers.LSTM(128)),
          layers.Dropout(0.2),
          layers.Dense(128,activation='tanh', kernel_regularizer='l2'),
          layers.Dropout(0.2),
          layers.Dense(101, activation='softmax')])
    model.summary()
    return model

In [139]:
model = get_model(embedding_dim,max_features)

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 16)          1616      
                                                                 
 bidirectional_6 (Bidirectio  (None, 256)              148480    
 nal)                                                            
                                                                 
 dropout_6 (Dropout)         (None, 256)               0         
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dropout_7 (Dropout)         (None, 128)               0         
                                                                 
 dense_7 (Dense)             (None, 101)               13029     
                                                      

In [140]:
## The loss function and the metrics need to be compliant to get good results

model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              optimizer= 'adam',
              metrics=tf.keras.metrics.SparseCategoricalAccuracy()
             )

In [141]:
epochs = 10
history = model.fit(
    fin_train_ds,
    validation_data=fin_val_ds,
    epochs=epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
