In [1]:
# we create a function to oobfuscate the store names and introduce impurities for the sake of this excercise
# Scrapped or OCR results could be much more random and unpredictable.
import string
import random 
import numpy as np 
import pandas as pd

# Using seen dictionary to generate all unique values
# Main goal here is to ensure there is no cross category contamination
# 1 input has single label, that model does not get confused
seen = {}

def gen_impurities(n,word):
    for i in range(n):
        index = random.randint(0,len(word)-1)
        #print(f"impute index - {index}")

        if index%2 == 0:
            symbol = random.choice(string.punctuation)
            #print(f"Symbol = {symbol}")
            word = word[:index] + symbol + word[index:]

        if index%7 == 0:
            num = random.choice(string.digits)
            #print(f"Number = {num}")
            word = word[:index] + str(num) + word[index:]

        if index%3 == 0:
            word.replace(word[index],"")
    return word

def obfuscate(word):
    n = int(len(word)*0.5)
#     print(n)
    iteration = random.randint(1,n)
#     print(iteration)
    label = word
    for i in range(5):
        imputed_word = gen_impurities(iteration,label)
#         print(imputed_word)
        if imputed_word not in seen:
            seen[imputed_word] = 1
            return imputed_word
        seen[imputed_word] = seen[imputed_word]+1
    return word
            

def gen_gibberish(min_l,max_l):
    # initializing size of string
    l = random.randint(min_l,max_l)
    # using random.choices()
    # generating random strings
    res = ''.join(random.choices(string.ascii_uppercase +
                                 string.punctuation +
                                 string.digits, k=l))
    while res not in seen:    
        seen[res] = 1
        return res
    seen[res] = seen[res]+1
    

In [2]:
# Lets take top 100 US retailers arbitarily chosen based on their annual reported Sales

def read_data():
    df = pd.read_csv("stores.csv")
    del df['empty']
    df = df.set_index('no')
    df.head()
    return df


In [3]:
# Test gibberish generator
# we will set the min and max length of 5 to 20
min_l = 5
max_l = 20
gen_gibberish(min_l,max_l)

'J9YU-Y/F~E%T(O'

In [4]:
# Lets take top 100 US retailers arbitarily chosen based on their annual reported Sales
import pandas as pd

df = pd.read_csv("stores.csv")
del df['empty']
df = df.set_index('no')
df.head()

Unnamed: 0_level_0,store,revenue
no,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Walmart,$459.51
2,Amazon.com,$217.79
3,Costco Wholesale,$140.41
4,The Home Depot,$140.06
5,The Kroger Co.,$136.49


In [5]:
stores = list(df["store"])

In [6]:
# this function will muddle up the stores to simulate real world data capture where system might introduce impurites
# the function will generate n number of instances for jumbled data

# n indicates number of obfuscated records
# instance_count is the number of observation per class

def get_obfuscated_stores(stores,n):
    """
        This functions takes the stores list
        Repeats it n times
        calls teh obfuscate function to generate dirty names
    """
    obfuscate_stores = stores * n
    obfuscate_stores.sort()
    df_obfuscated = pd.DataFrame({"stores": obfuscate_stores})
    df_obfuscated['bad_names'] = df_obfuscated["stores"].apply(lambda x: obfuscate(x) )
    return df_obfuscated
    
def get_catch_gibberish(instance_count,min_l,max_l):
    others = ['Other'] * instance_count
    df_other = pd.DataFrame({'stores':others})
    df_other['bad_names'] = df_other['stores'].apply(lambda x : gen_gibberish(min_l,max_l)) 
    return df_other

# the training set will also have good captures where store name was interpreted correctly
# for this purpose the we are creagin 10% bad captures

def get_good_names(stores,n):
    good_captures = stores*  n
    df_good_captures = pd.DataFrame({'stores':good_captures})
    df_good_captures['bad_names'] = df_good_captures["stores"]
    return df_good_captures

def get_data(impute_n,instance,min_l,max_l): 
    n = impute_n
    instance_count = instance
    min_l = min_l
    max_l = max_l

    df_obfuscated = get_obfuscated_stores(stores,n)
    df_other = get_catch_gibberish(instance_count,min_l,max_l)
    df_good_captures = repeat_good_names(stores,n,instance_count)
    df = pd.concat([df_obfuscated,df_good_captures,df_other])
    return df


In [7]:
def get_train_test_data(split_ratio,impute_ratio,instance,min_l,max_l):
    test_size = (instance*split_ratio)
    train_size = (instance*(1-split_ratio))
    print(f"test_size = {test_size} & train_size = {train_size} & split = {split_ratio}")
    test_impute_n = int(test_size*impute_ratio)
    test_good_n = int(test_size - test_impute_n)
    print(f"test impute = {test_impute_n} & instances = {test_good_n}")
    df = read_data()
    stores = list(df["store"])
    # get imputed
    df_obfuscated = get_obfuscated_stores(stores,test_impute_n)
    # get good names
    df_good_names = get_good_names(stores, test_good_n)
    # get others 
    df_other = get_catch_gibberish(test_impute_n+test_good_n,min_l,max_l)
    df_test = pd.concat([df_obfuscated,df_good_names,df_other])
    
    del df_obfuscated,df_good_names,df_other
    train_impute_n = int(train_size*impute_ratio)
    train_good_n = int(train_size - train_impute_n)
    # get imputed
    df_obfuscated = get_obfuscated_stores(stores,train_impute_n)
    # get good names
    df_good_names = get_good_names(stores, train_good_n)
    # get others 
    df_other = get_catch_gibberish(train_impute_n+train_good_n,min_l,max_l)
    
    df_train = pd.concat([df_obfuscated,df_good_names,df_other])
    
    del df_obfuscated,df_good_names,df_other
    return df_train, df_test
    

In [8]:
split_ratio = 0.2
impute_ratio = 0.5
instances = 50
min_l = 5
max_l = 20

df_train, df_test = get_train_test_data(split_ratio,
                                        impute_ratio,
                                        instances,
                                        min_l,
                                        max_l)

test_size = 10.0 & train_size = 40.0 & split = 0.2
test impute = 5 & instances = 5


In [9]:
#from sklearn.model_selection import train_test_split
def get_train_test_split(df_train,df_test,one_hot_encode_labels=False):
    X_train = df_train["bad_names"].values
    y_train = df_train["stores"].values
    X_test  = df_test["bad_names"].values
    y_test  = df_test["stores"].values
    
    if one_hot_encode_labels:
        df_labels = pd.concat([df_train['stores'],df_test['stores']])
        print(f"Labels = {df_labels.shape})")
        lables = pd.get_dummies(df_labels)
        lookup = list(lables.columns)
        print(len(lookup))
        del df_labels
        y_test_labels = pd.get_dummies(y_test)
        y_test_encoded = y_test_labels.astype('float32').values
        y_train_labels = pd.get_dummies(y_train)
        y_train_encoded = y_train_labels.astype('float32').values
        return X_train,y_train_encoded,X_test,y_test_encoded, lookup
    else:
        return X_train, y_train, X_test, y_test, None
    

In [10]:

X_train, y_train, X_test, y_test, lable_lookup = get_train_test_split(df_train,
                                                                      df_test,
                                                                      one_hot_encode_labels=False)


In [None]:
#type(list(X_train))
print(corpus)

In [11]:
## tf - idf
## some more robust embeddings are needed.
## Creting character level TF-EDF encoder/vectorizer to convert strings to consistent length vectors

from sklearn.feature_extraction.text import TfidfVectorizer
corpus = X_train
vectorizer = TfidfVectorizer(analyzer='char',strip_accents ="ascii" , ngram_range=(2,2)).fit(corpus)
print(vectorizer.get_feature_names_out())
print(len(vectorizer.get_feature_names_out()))


[' !' ' "' ' #' ... '~|' '~}' '~~']
3088


In [12]:
new_text = ["7-&E+l#'/-ev&en as"]
X_train_vector= vectorizer.transform(X_train).toarray()
X_test_vector = vectorizer.transform(X_test).toarray()
print(f" shape of train - {X_train_vector.shape} and len = {len(X_train_vector)}")
print(f" shage of test - {X_test_vector.shape} and len = {len(X_test_vector)}")

 shape of train - (4040, 3088) and len = 4040
 shage of test - (1010, 3088) and len = 1010


In [13]:
from sklearn.neighbors import KNeighborsClassifier
from scipy.spatial import distance      

classifier = KNeighborsClassifier(n_neighbors=20,n_jobs=10, metric = distance.correlation,)
classifier.fit(X_train_vector, y_train)

In [42]:
def my_classifier(w:list,vectorizer,classifier):
    w_vector = vectorizer.transform(w).toarray()
    pred = classifier.predict(w_vector)
    conf = classifier.predict_proba(w_vector)    
    return pred,conf
    

In [43]:
test_stores = ["Targe1", 
                "Larget", 
                '!a7g3T',
                "'7argay'",
                "7- 11",
                "Ac3 Hw"]
my_classifier(test_stores
              ,vectorizer,classifier)


(array(['Target', 'Target', 'Other', 'Gap', '7-Eleven', 'Ace Hardware'],
       dtype=object),
 array([[0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 1.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  ],
        [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
         0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0

In [None]:
#list(X_test)
## takes a while
import warnings
warnings.filterwarnings('ignore')

print(len(X_test))
y_predicted = []
for i in range(len(X_test)):
    pred = my_classifier([X_test[i]],vectorizer,classifier)
    print(f" {i}] Input - {X_test[i]} predicted - {pred[0]} ")
    y_predicted.append(pred)
    

In [26]:
conf_mat = pd.DataFrame({'test_names':X_test,'actual':y_test})
conf_mat["predicted"] = conf_mat["test_names"].apply(lambda x : my_classifier([x],vectorizer,classifier)[0])

In [27]:
conf_mat.head(10)

Unnamed: 0,test_names,actual,predicted
0,1/*7-Ele=ven,7-Eleven,7-Eleven
1,7-El..even,7-Eleven,7-Eleven
2,7-Eleven,7-Eleven,7-Eleven
3,7-Elev[e1n,7-Eleven,7-Eleven
4,7-El|even,7-Eleven,7-Eleven
5,AT&T@ Wireles]s,AT&T Wireless,AT&T Wireless
6,AT&T W-irele_ss,AT&T Wireless,AT&T Wireless
7,"AT&T Wirel,e~ss",AT&T Wireless,AT&T Wireless
8,AT$&T ?W#ireless,AT&T Wireless,AT&T Wireless
9,"AT&T` !(Wirele3,ss",AT&T Wireless,AT&T Wireless


In [25]:
len(conf_mat["predicted"].values[0])

1

In [29]:

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# result = confusion_matrix(y_test, y_predicted)
# print("Confusion Matrix:")
# print(result)
# result1 = classification_report(y_test, y_predicted)
# print("Classification Report:",)
# print (result1)
# result2 = accuracy_score(y_test,y_predicted)
# print("Accuracy:",result2)

In [31]:
result = confusion_matrix(conf_mat['actual'].values,conf_mat['predicted'].values)
print("Confusion Matrix:")
print(result)

Confusion Matrix:
[[10  0  0 ...  0  0  0]
 [ 0 10  0 ...  0  0  0]
 [ 0  0 10 ...  0  0  0]
 ...
 [ 0  0  0 ... 10  0  0]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  0 10]]


In [32]:
result1 = classification_report(conf_mat['actual'].values,conf_mat['predicted'].values)
print("Classification Report:",)
print (result1)

Classification Report:
                                     precision    recall  f1-score   support

                           7-Eleven       0.91      1.00      0.95        10
                      AT&T Wireless       1.00      1.00      1.00        10
                    AVB Brandsource       1.00      1.00      1.00        10
                     Academy Sports       1.00      1.00      1.00        10
                       Ace Hardware       1.00      1.00      1.00        10
                       Advance Auto       1.00      1.00      1.00        10
               Albertsons Companies       1.00      1.00      1.00        10
                               Aldi       0.91      1.00      0.95        10
           Alimentation Couche-Tard       1.00      1.00      1.00        10
                         Amazon.com       1.00      1.00      1.00        10
          American Eagle Outfitters       1.00      1.00      1.00        10
              Apple Stores / iTunes       1.00      

In [33]:
result2 = accuracy_score(conf_mat['actual'].values,conf_mat['predicted'].values)
print("Accuracy:",result2)

Accuracy: 0.9910891089108911


In [35]:
my_classifier(['7a G3 7'],vectorizer,classifier)

array(['Ikea North America Services'], dtype=object)

In [36]:
my_classifier(['12321ueiajsidams'],vectorizer,classifier)

array(['True Value Co.'], dtype=object)

In [40]:
my_classifier(['Target'],vectorizer,classifier)

array(['Target'], dtype=object)

In [41]:
my_classifier(['T1argea214'],vectorizer,classifier)

array(['Target'], dtype=object)