In [1]:
# we create a function to oobfuscate the store names and introduce impurities for the sake of this excercise
# Scrapped or OCR results could be much more random and unpredictable.
import string
import random 
import numpy as np 
import pandas as pd

# Using seen dictionary to generate all unique values
# Main goal here is to ensure there is no cross category contamination
# 1 input has single label, that model does not get confused
seen = {}

def gen_impurities(n,word):
    for i in range(n):
        index = random.randint(0,len(word)-1)
        #print(f"impute index - {index}")

        if index%2 == 0:
            symbol = random.choice(string.punctuation)
            #print(f"Symbol = {symbol}")
            word = word[:index] + symbol + word[index:]

        if index%7 == 0:
            num = random.choice(string.digits)
            #print(f"Number = {num}")
            word = word[:index] + str(num) + word[index:]

        if index%3 == 0:
            word.replace(word[index],"")
    return word

def obfuscate(word):
    n = int(len(word)*0.5)
#     print(n)
    iteration = random.randint(1,n)
#     print(iteration)
    label = word
    for i in range(5):
        imputed_word = gen_impurities(iteration,label)
#         print(imputed_word)
        if imputed_word not in seen:
            seen[imputed_word] = 1
            return imputed_word
        seen[imputed_word] = seen[imputed_word]+1
    return word
            

def gen_gibberish(min_l,max_l):
    # initializing size of string
    l = random.randint(min_l,max_l)
    # using random.choices()
    # generating random strings
    res = ''.join(random.choices(string.ascii_uppercase +
                                 string.punctuation +
                                 string.digits, k=l))
    while res not in seen:    
        seen[res] = 1
        return res
    seen[res] = seen[res]+1
    

In [2]:
# Lets take top 100 US retailers arbitarily chosen based on their annual reported Sales

def read_data():
    df = pd.read_csv("stores.csv")
    del df['empty']
    df = df.set_index('no')
    df.head()
    return df


In [3]:
# Test gibberish generator
# we will set the min and max length of 5 to 20
min_l = 5
max_l = 20
gen_gibberish(min_l,max_l)

'`%M=:X'

In [4]:
# Lets take top 100 US retailers arbitarily chosen based on their annual reported Sales
import pandas as pd

df = pd.read_csv("stores.csv")
del df['empty']
df = df.set_index('no')
df.head()

Unnamed: 0_level_0,store,revenue
no,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Walmart,$459.51
2,Amazon.com,$217.79
3,Costco Wholesale,$140.41
4,The Home Depot,$140.06
5,The Kroger Co.,$136.49


In [5]:
stores = list(df["store"])

In [6]:
# this function will muddle up the stores to simulate real world data capture where system might introduce impurites
# the function will generate n number of instances for jumbled data

# n indicates number of obfuscated records
# instance_count is the number of observation per class

def get_obfuscated_stores(stores,n):
    """
        This functions takes the stores list
        Repeats it n times
        calls teh obfuscate function to generate dirty names
    """
    obfuscate_stores = stores * n
    obfuscate_stores.sort()
    df_obfuscated = pd.DataFrame({"stores": obfuscate_stores})
    df_obfuscated['bad_names'] = df_obfuscated["stores"].apply(lambda x: obfuscate(x) )
    return df_obfuscated
    
def get_catch_gibberish(instance_count,min_l,max_l):
    others = ['Other'] * instance_count
    df_other = pd.DataFrame({'stores':others})
    df_other['bad_names'] = df_other['stores'].apply(lambda x : gen_gibberish(min_l,max_l)) 
    return df_other

# the training set will also have good captures where store name was interpreted correctly
# for this purpose the we are creagin 10% bad captures

def get_good_names(stores,n):
    good_captures = stores*  n
    df_good_captures = pd.DataFrame({'stores':good_captures})
    df_good_captures['bad_names'] = df_good_captures["stores"]
    return df_good_captures

def get_data(impute_n,instance,min_l,max_l): 
    n = impute_n
    instance_count = instance
    min_l = min_l
    max_l = max_l

    df_obfuscated = get_obfuscated_stores(stores,n)
    df_other = get_catch_gibberish(instance_count,min_l,max_l)
    df_good_captures = repeat_good_names(stores,n,instance_count)
    df = pd.concat([df_obfuscated,df_good_captures,df_other])
    return df


In [7]:
def get_train_test_data(split_ratio,impute_ratio,instance,min_l,max_l):
    test_size = (instance*split_ratio)
    train_size = (instance*(1-split_ratio))
    print(f"test_size = {test_size} & train_size = {train_size} & split = {split_ratio}")
    test_impute_n = int(test_size*impute_ratio)
    test_good_n = int(test_size - test_impute_n)
    print(f"test impute = {test_impute_n} & instances = {test_good_n}")
    df = read_data()
    stores = list(df["store"])
    # get imputed
    df_obfuscated = get_obfuscated_stores(stores,test_impute_n)
    # get good names
    df_good_names = get_good_names(stores, test_good_n)
    # get others 
    df_other = get_catch_gibberish(test_impute_n+test_good_n,min_l,max_l)
    df_test = pd.concat([df_obfuscated,df_good_names,df_other])
    
    del df_obfuscated,df_good_names,df_other
    train_impute_n = int(train_size*impute_ratio)
    train_good_n = int(train_size - train_impute_n)
    # get imputed
    df_obfuscated = get_obfuscated_stores(stores,train_impute_n)
    # get good names
    df_good_names = get_good_names(stores, train_good_n)
    # get others 
    df_other = get_catch_gibberish(train_impute_n+train_good_n,min_l,max_l)
    
    df_train = pd.concat([df_obfuscated,df_good_names,df_other])
    
    del df_obfuscated,df_good_names,df_other
    return df_train, df_test
    

In [8]:
split_ratio = 0.2
impute_ratio = 0.5
instances = 50
min_l = 5
max_l = 20

df_train, df_test = get_train_test_data(split_ratio,
                                        impute_ratio,
                                        instances,
                                        min_l,
                                        max_l)

test_size = 10.0 & train_size = 40.0 & split = 0.2
test impute = 5 & instances = 5


In [9]:
#from sklearn.model_selection import train_test_split
def get_train_test_split(df_train,df_test,one_hot_encode_labels=False):
    X_train = df_train["bad_names"].values
    y_train = df_train["stores"].values
    X_test  = df_test["bad_names"].values
    y_test  = df_test["stores"].values
    
    if one_hot_encode_labels:
        df_labels = pd.concat([df_train['stores'],df_test['stores']])
        print(f"Labels = {df_labels.shape})")
        lables = pd.get_dummies(df_labels)
        lookup = list(lables.columns)
        print(len(lookup))
        del df_labels
        y_test_labels = pd.get_dummies(y_test)
        y_test_encoded = y_test_labels.astype('float32').values
        y_train_labels = pd.get_dummies(y_train)
        y_train_encoded = y_train_labels.astype('float32').values
        return X_train,y_train_encoded,X_test,y_test_encoded, lookup
    else:
        return X_train, y_train, X_test, y_test, None
    

In [10]:

X_train, y_train, X_test, y_test, lable_lookup = get_train_test_split(df_train,
                                                                      df_test,
                                                                      one_hot_encode_labels=False)


In [11]:
## tf - idf
## some more robust embeddings are needed.
## Creting character level TF-IDF encoder/vectorizer to convert strings to consistent length vectors

from sklearn.feature_extraction.text import TfidfVectorizer
corpus = X_train
vectorizer = TfidfVectorizer(analyzer='char',strip_accents ="ascii" , ngram_range=(2,2)).fit(corpus)
print(vectorizer.get_feature_names_out())
print(len(vectorizer.get_feature_names_out()))


[' !' ' "' ' #' ... '~w' '~y' '~|']
3127


In [12]:
new_text = ["7-&E+l#'/-ev&en as"]
X_train_vector= vectorizer.transform(X_train).toarray()
X_test_vector = vectorizer.transform(X_test).toarray()
print(f" shape of train - {X_train_vector.shape} and len = {len(X_train_vector)}")
print(f" shage of test - {X_test_vector.shape} and len = {len(X_test_vector)}")

 shape of train - (4040, 3127) and len = 4040
 shage of test - (1010, 3127) and len = 1010


In [13]:
from sklearn.neighbors import KNeighborsClassifier
from scipy.spatial import distance      

def create_my_knn_classifier(neighbors: int = 20, n_jobs: int = 10, distance_alg=distance.correlation):
    classifier = KNeighborsClassifier(n_neighbors=neighbors,n_jobs=n_jobs, metric = distance_alg,)
    classifier.fit(X_train_vector, y_train)
    return classifier

In [14]:
def my_classifier(vectorizer,classifier,w:str):
    w_vector = vectorizer.transform([w]).toarray()
    pred = classifier.predict(w_vector)[0]
    conf = classifier.predict_proba(w_vector)    
    return (pred,np.max(conf))

pred_func = lambda x : my_classifier(vectorizer,my_knn_cosine_classifier, x)

In [15]:
my_knn_cosine_classifier = create_my_knn_classifier(20,10,distance_alg= distance.cosine)


In [16]:
from collections import deque

test_stores = ["Targe1", 
                "Larget", 
                '!a7g3T',
                "'7argay'",
                "7- 11",
                "Ac3 Hw"
              ]

deque(map(pred_func, test_stores))


deque([('Target', 1.0),
       ('Target', 1.0),
       ('Camping World', 0.1),
       ('Gap', 1.0),
       ('7-Eleven', 0.8),
       ('Ace Hardware', 0.95)])

In [17]:
#list(X_test)
## takes a while
import warnings
warnings.filterwarnings('ignore')

y_predicted = []
print(f" sr. \t Input \t\t  predicted \t\t conf")
for i in range(len(X_test)):
    pred = my_classifier(vectorizer,my_knn_cosine_classifier,X_test[i])
    print(f" {i} \t {X_test[i]} \t {pred[0]} \t {np.max(pred[1])}")
    y_predicted.append(pred)
    

 sr. 	 Input 		  predicted 		 conf
 0 	 7(7-Eleven 	 7-Eleven 	 1.0
 1 	 9$7-}Eleven 	 7-Eleven 	 1.0
 2 	 8;7--E>l'even 	 7-Eleven 	 1.0
 3 	 0)7-Eleve9n 	 7-Eleven 	 1.0
 4 	 6\7-|{@Eleven 	 7-Eleven 	 1.0
 5 	 AT/&T W1ir%eless 	 AT&T Wireless 	 1.0
 6 	 AT&T W&1i,re*less 	 AT&T Wireless 	 1.0
 7 	 AT&T Wirel%ess 	 AT&T Wireless 	 1.0
 8 	 AT&T W+4i*reless 	 AT&T Wireless 	 1.0
 9 	 AT&T Wireless 	 AT&T Wireless 	 1.0
 10 	 AVB Br:an%dsource 	 AVB Brandsource 	 1.0
 11 	 AVB Brandsource 	 AVB Brandsource 	 1.0
 12 	 AVB ]Brandso/urce 	 AVB Brandsource 	 1.0
 13 	 AVB Br.+andsource 	 AVB Brandsource 	 1.0
 14 	 AV[B B$r\an)@dsource 	 AVB Brandsource 	 1.0
 15 	 2`Academ|y{ Sports 	 Academy Sports 	 1.0
 16 	 Academy Sports 	 Academy Sports 	 1.0
 17 	 Academy Spor/ts 	 Academy Sports 	 1.0
 18 	 Academ`y Spo>rts 	 Academy Sports 	 1.0
 19 	 Academ!y Sports 	 Academy Sports 	 1.0
 20 	 Ace Hard<ware 	 Ace Hardware 	 1.0
 21 	 Ace Hardware 	 Ace Hardware 	 1.0
 22 	 Ac;e Ha0rdware 	 Ace

 164 	 Dollar G`eneral 	 Dollar General 	 1.0
 165 	 Dollar 56Tre>e 	 Dollar Tree 	 1.0
 166 	 Dollar Tree 	 Dollar Tree 	 1.0
 167 	 Do_llar Tree 	 Dollar Tree 	 1.0
 168 	 Doll{a(r T_ree 	 Dollar Tree 	 1.0
 169 	 4.Dollar Tree 	 Dollar Tree 	 1.0
 170 	 Exxon M0o]bil Corporation 	 Exxon Mobil Corporation 	 1.0
 171 	 Exxo?n 67Mobil Corpora!tion 	 Exxon Mobil Corporation 	 1.0
 172 	 Ex~xon M=obil 4!Corpor]ation 	 Exxon Mobil Corporation 	 1.0
 173 	 Ex$x"o!n ~Mo-bi|l> Co=rporation 	 Exxon Mobil Corporation 	 1.0
 174 	 2$5<!Exxon #'Mobil Corporati{on 	 Exxon Mobil Corporation 	 1.0
 175 	 Foot L_0ocker 	 Foot Locker 	 1.0
 176 	 Foot Locke;r 	 Foot Locker 	 1.0
 177 	 Fo[ot L*ocker 	 Foot Locker 	 1.0
 178 	 Fo^ot #1.Locker 	 Foot Locker 	 1.0
 179 	 8"Foot# Locker 	 Foot Locker 	 1.0
 180 	 8=Gap 	 Gap 	 1.0
 181 	 Ga=p 	 Gap 	 1.0
 182 	 Ga!p 	 Gap 	 1.0
 183 	 Gap 	 Gap 	 1.0
 184 	 Ga>p 	 Gap 	 1.0
 185 	 Giant Eagle 	 Giant Eagle 	 1.0
 186 	 Gi}ant E3agle 	 Giant Eagle 	 1.0
 

 349 	 Save! ?Mart 	 Save Mart 	 1.0
 350 	 7`Save-A-Lot 	 Save-A-Lot 	 1.0
 351 	 6+Save-A-Lot 	 Save-A-Lot 	 1.0
 352 	 Sa(ve+-A-Lot 	 Save-A-Lot 	 1.0
 353 	 1&2`Save;!-A-Lot 	 Save-A-Lot 	 1.0
 354 	 Save/-A-Lot 	 Save-A-Lot 	 1.0
 355 	 3!Sephor\a@ (LVMH) 	 Sephora (LVMH) 	 1.0
 356 	 0$Sephora (LVMH) 	 Sephora (LVMH) 	 1.0
 357 	 Sephora (LVMH) 	 Sephora (LVMH) 	 1.0
 358 	 Se@<phor%a\ (L0)|VMH) 	 Sephora (LVMH) 	 1.0
 359 	 Sephora2 (LVMH) 	 Sephora (LVMH) 	 1.0
 360 	 Shell Oi&l Company 	 Shell Oil Company 	 1.0
 361 	 9\Shell8 Oil Company 	 Shell Oil Company 	 1.0
 362 	 Shell Oil ^Company 	 Shell Oil Company 	 1.0
 363 	 Shell Oi/l Company 	 Shell Oil Company 	 1.0
 364 	 Shell Oil Company 	 Shell Oil Company 	 1.0
 365 	 Sherwin-Williams 	 Sherwin-Williams 	 1.0
 366 	 0|Sher/win-Williams 	 Sherwin-Williams 	 1.0
 367 	 0)Sherw4$in-W"illiams 	 Sherwin-Williams 	 1.0
 368 	 Sherwin3-"?William='s 	 Sherwin-Williams 	 1.0
 369 	 Sh_erw^in-Wi=lliams 	 Sherwin-Williams 	 1.0
 370

 520 	 Macy's 	 Macy's 	 1.0
 521 	 7-Eleven 	 7-Eleven 	 1.0
 522 	 AT&T Wireless 	 AT&T Wireless 	 1.0
 523 	 Meijer 	 Meijer 	 1.0
 524 	 Verizon Wireless 	 Verizon Wireless 	 1.0
 525 	 Ross Stores 	 Ross Stores 	 1.0
 526 	 Kohl's 	 Kohl's 	 1.0
 527 	 Wakefern / ShopRite 	 Wakefern / ShopRite 	 1.0
 528 	 Rite Aid 	 Rite Aid 	 1.0
 529 	 BJ's Wholesale Club 	 BJ's Wholesale Club 	 1.0
 530 	 Dell Technologies 	 Dell Technologies 	 1.0
 531 	 Gap 	 Gap 	 1.0
 532 	 Nordstrom 	 Nordstrom 	 1.0
 533 	 Menards 	 Menards 	 1.0
 534 	 O’Reilly Auto Parts 	 O’Reilly Auto Parts 	 1.0
 535 	 Tractor Supply Co. 	 Tractor Supply Co. 	 1.0
 536 	 AutoZone 	 AutoZone 	 1.0
 537 	 Dick's Sporting Goods 	 Dick's Sporting Goods 	 1.0
 538 	 Hy Vee 	 Hy Vee 	 1.0
 539 	 Wayfair 	 Wayfair 	 1.0
 540 	 Health Mart Systems 	 Health Mart Systems 	 1.0
 541 	 Wegmans Food Market 	 Wegmans Food Market 	 1.0
 542 	 Qurate Retail 	 Qurate Retail 	 1.0
 543 	 Giant Eagle 	 Giant Eagle 	 1.0
 544 	 Aliment

 704 	 The Kroger Co. 	 The Kroger Co. 	 1.0
 705 	 Walgreens Boots Alliance 	 Walgreens Boots Alliance 	 1.0
 706 	 Target 	 Target 	 1.0
 707 	 CVS Health Corporation 	 CVS Health Corporation 	 1.0
 708 	 Lowe's Companies 	 Lowe's Companies 	 1.0
 709 	 Albertsons Companies 	 Albertsons Companies 	 1.0
 710 	 Apple Stores / iTunes 	 Apple Stores / iTunes 	 1.0
 711 	 Royal Ahold Delhaize USA 	 Royal Ahold Delhaize USA 	 1.0
 712 	 Publix Super Markets 	 Publix Super Markets 	 1.0
 713 	 Best Buy 	 Best Buy 	 1.0
 714 	 TJX Companies 	 TJX Companies 	 1.0
 715 	 Aldi 	 Aldi 	 1.0
 716 	 Dollar General 	 Dollar General 	 1.0
 717 	 H.E. Butt Grocery 	 H.E. Butt Grocery 	 1.0
 718 	 Dollar Tree 	 Dollar Tree 	 1.0
 719 	 Ace Hardware 	 Ace Hardware 	 1.0
 720 	 Macy's 	 Macy's 	 1.0
 721 	 7-Eleven 	 7-Eleven 	 1.0
 722 	 AT&T Wireless 	 AT&T Wireless 	 1.0
 723 	 Meijer 	 Meijer 	 1.0
 724 	 Verizon Wireless 	 Verizon Wireless 	 1.0
 725 	 Ross Stores 	 Ross Stores 	 1.0
 726 	 Kohl's 

 886 	 Total Wine & More 	 Total Wine & More 	 1.0
 887 	 Defense Commissary Agency 	 Defense Commissary Agency 	 1.0
 888 	 Ingles 	 Ingles 	 1.0
 889 	 Weis Markets 	 Weis Markets 	 1.0
 890 	 Casey's General Store 	 Casey's General Store 	 1.0
 891 	 Tapestry 	 Tapestry 	 1.0
 892 	 Smart & Final 	 Smart & Final 	 1.0
 893 	 Lululemon 	 Lululemon 	 1.0
 894 	 Shell Oil Company 	 Shell Oil Company 	 1.0
 895 	 Golub 	 Golub 	 1.0
 896 	 Save Mart 	 Save Mart 	 1.0
 897 	 RH 	 RH 	 1.0
 898 	 Urban Outfitters 	 Urban Outfitters 	 1.0
 899 	 Barnes & Noble 	 Barnes & Noble 	 1.0
 900 	 Walmart 	 Walmart 	 1.0
 901 	 Amazon.com 	 Amazon.com 	 1.0
 902 	 Costco Wholesale 	 Costco Wholesale 	 1.0
 903 	 The Home Depot 	 The Home Depot 	 1.0
 904 	 The Kroger Co. 	 The Kroger Co. 	 1.0
 905 	 Walgreens Boots Alliance 	 Walgreens Boots Alliance 	 1.0
 906 	 Target 	 Target 	 1.0
 907 	 CVS Health Corporation 	 CVS Health Corporation 	 1.0
 908 	 Lowe's Companies 	 Lowe's Companies 	 1.0
 90

In [18]:
conf_mat = pd.DataFrame({'test_names':X_test,'actual':y_test})

conf_mat['predicted'],conf_mat['conf']= zip(*map(pred_func,conf_mat['test_names']))


In [19]:
conf_mat.groupby('actual').describe().transpose()

Unnamed: 0,actual,7-Eleven,AT&T Wireless,AVB Brandsource,Academy Sports,Ace Hardware,Advance Auto,Albertsons Companies,Aldi,Alimentation Couche-Tard,Amazon.com,...,Verizon Wireless,Victoria's Secret,Wakefern / ShopRite,Walgreens Boots Alliance,Walmart,Wayfair,Wegmans Food Market,Weis Markets,Williams-Sonoma,WinCo Foods
conf,count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
conf,mean,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.995,1.0,1.0,...,1.0,1.0,1.0,1.0,0.995,1.0,1.0,1.0,1.0,1.0
conf,std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015811,0.0,0.0,...,0.0,0.0,0.0,0.0,0.015811,0.0,0.0,0.0,0.0,0.0
conf,min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.95,1.0,1.0,...,1.0,1.0,1.0,1.0,0.95,1.0,1.0,1.0,1.0,1.0
conf,25%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
conf,50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
conf,75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
conf,max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [20]:
conf_mat[conf_mat["actual"]=='Other'].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
conf,10.0,0.22,0.227547,0.1,0.1,0.1,0.15,0.65


In [21]:
conf_mat[conf_mat["actual"]=='Other']

Unnamed: 0,test_names,actual,predicted,conf
1000,7O3;[1~H_\&'2M,Other,Hudson's Bay,0.15
1001,V`\5|[/@,Other,Amazon.com,0.1
1002,P9~@VOR,Other,CVS Health Corporation,0.65
1003,K_H0E:._>WGG|S'Y;Y,Other,Piggly Wiggly,0.65
1004,F@]$$|,Other,Other,0.1
1005,9*&WF3Q`\_,Other,Weis Markets,0.15
1006,+<^{2PM|388S,Other,Shell Oil Company,0.1
1007,>:0'E_,Other,Hy Vee,0.1
1008,'$D73V25,Other,Discount Tire,0.1
1009,"""2!+F+7>NHVH;-?/2(O",Other,Alimentation Couche-Tard,0.1


In [22]:
conf_mat.head()

Unnamed: 0,test_names,actual,predicted,conf
0,7(7-Eleven,7-Eleven,7-Eleven,1.0
1,9$7-}Eleven,7-Eleven,7-Eleven,1.0
2,8;7--E>l'even,7-Eleven,7-Eleven,1.0
3,0)7-Eleve9n,7-Eleven,7-Eleven,1.0
4,6\7-|{@Eleven,7-Eleven,7-Eleven,1.0


In [23]:
conf_mat[conf_mat['conf']>=.90].count()

test_names    1000
actual        1000
predicted     1000
conf          1000
dtype: int64

In [24]:
conf_mat[conf_mat['conf']<=.90]

Unnamed: 0,test_names,actual,predicted,conf
406,"Ta$pest4r""y",Tapestry,Tapestry,0.9
1000,7O3;[1~H_\&'2M,Other,Hudson's Bay,0.15
1001,V`\5|[/@,Other,Amazon.com,0.1
1002,P9~@VOR,Other,CVS Health Corporation,0.65
1003,K_H0E:._>WGG|S'Y;Y,Other,Piggly Wiggly,0.65
1004,F@]$$|,Other,Other,0.1
1005,9*&WF3Q`\_,Other,Weis Markets,0.15
1006,+<^{2PM|388S,Other,Shell Oil Company,0.1
1007,>:0'E_,Other,Hy Vee,0.1
1008,'$D73V25,Other,Discount Tire,0.1


In [25]:
len(conf_mat["predicted"].values[0])

8

In [26]:
conf_mat["correct"] = conf_mat["actual"] == conf_mat["predicted"]
conf_mat.groupby('correct').describe()

Unnamed: 0_level_0,conf,conf,conf,conf,conf,conf,conf,conf
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
correct,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
False,9.0,0.233333,0.237171,0.1,0.1,0.1,0.15,0.65
True,1001.0,0.998701,0.028867,0.1,1.0,1.0,1.0,1.0


In [27]:

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_predicted_eval = [x[0] for x in y_predicted]

result = confusion_matrix(y_test, y_predicted_eval)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, y_predicted_eval)
print("Classification Report:",)
print (result1)
result2 = accuracy_score(y_test,y_predicted_eval)
print("Accuracy:",result2)

Confusion Matrix:
[[10  0  0 ...  0  0  0]
 [ 0 10  0 ...  0  0  0]
 [ 0  0 10 ...  0  0  0]
 ...
 [ 0  0  0 ... 10  0  0]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  0 10]]
Classification Report:
                                     precision    recall  f1-score   support

                           7-Eleven       1.00      1.00      1.00        10
                      AT&T Wireless       1.00      1.00      1.00        10
                    AVB Brandsource       1.00      1.00      1.00        10
                     Academy Sports       1.00      1.00      1.00        10
                       Ace Hardware       1.00      1.00      1.00        10
                       Advance Auto       1.00      1.00      1.00        10
               Albertsons Companies       1.00      1.00      1.00        10
                               Aldi       1.00      1.00      1.00        10
           Alimentation Couche-Tard       0.91      1.00      0.95        10
                         Amazon.

In [30]:
my_classifier(vectorizer,my_knn_cosine_classifier,'7a G3 7')

('Ikea North America Services', 0.2)

In [31]:
my_classifier(vectorizer,my_knn_cosine_classifier,'12321ueiajsidams')

('True Value Co.', 0.95)

In [32]:
my_classifier(vectorizer,my_knn_cosine_classifier,'Target')

('Target', 1.0)

In [33]:
my_classifier(vectorizer,my_knn_cosine_classifier,'Target')

('Target', 1.0)

In [34]:
# updating predictor function to only assign predicted value when it is 95% sure
def threshold_classifier(threshold,vectorizer,classifier,w:str):
    w_vector = vectorizer.transform([w]).toarray()
    pred = classifier.predict(w_vector)[0]
    conf = classifier.predict_proba(w_vector)   
    guess_conf = np.max(conf)
    if guess_conf < threshold:
        pred = "Other"
    return (pred,np.max(conf))
    
pred_func = lambda x : threshold_classifier(0.95,vectorizer,my_knn_cosine_classifier, x)

In [35]:
conf_mat = pd.DataFrame({'test_names':X_test,'actual':y_test})

conf_mat['predicted'],conf_mat['conf']= zip(*map(pred_func,conf_mat['test_names']))


In [36]:
conf_mat["correct"] = conf_mat["actual"] == conf_mat["predicted"]
conf_mat.groupby('correct').describe()

Unnamed: 0_level_0,conf,conf,conf,conf,conf,conf,conf,conf
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
correct,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
False,1.0,0.9,,0.9,0.9,0.9,0.9,0.9
True,1009.0,0.991972,0.080302,0.1,1.0,1.0,1.0,1.0


In [37]:
conf_mat[conf_mat["actual"]=='Other']

Unnamed: 0,test_names,actual,predicted,conf,correct
1000,7O3;[1~H_\&'2M,Other,Other,0.15,True
1001,V`\5|[/@,Other,Other,0.1,True
1002,P9~@VOR,Other,Other,0.65,True
1003,K_H0E:._>WGG|S'Y;Y,Other,Other,0.65,True
1004,F@]$$|,Other,Other,0.1,True
1005,9*&WF3Q`\_,Other,Other,0.15,True
1006,+<^{2PM|388S,Other,Other,0.1,True
1007,>:0'E_,Other,Other,0.1,True
1008,'$D73V25,Other,Other,0.1,True
1009,"""2!+F+7>NHVH;-?/2(O",Other,Other,0.1,True


In [38]:
conf_mat[conf_mat["correct"]==False]

Unnamed: 0,test_names,actual,predicted,conf,correct
406,"Ta$pest4r""y",Tapestry,Other,0.9,False


In [39]:
result = confusion_matrix(conf_mat['actual'], conf_mat['predicted'])
print("Confusion Matrix:")
print(result)
result1 = classification_report(conf_mat['actual'], conf_mat['predicted'])
print("Classification Report:",)
print (result1)
result2 = accuracy_score(conf_mat['actual'], conf_mat['predicted'])
print("Accuracy:",result2)

Confusion Matrix:
[[10  0  0 ...  0  0  0]
 [ 0 10  0 ...  0  0  0]
 [ 0  0 10 ...  0  0  0]
 ...
 [ 0  0  0 ... 10  0  0]
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  0 10]]
Classification Report:
                                     precision    recall  f1-score   support

                           7-Eleven       1.00      1.00      1.00        10
                      AT&T Wireless       1.00      1.00      1.00        10
                    AVB Brandsource       1.00      1.00      1.00        10
                     Academy Sports       1.00      1.00      1.00        10
                       Ace Hardware       1.00      1.00      1.00        10
                       Advance Auto       1.00      1.00      1.00        10
               Albertsons Companies       1.00      1.00      1.00        10
                               Aldi       1.00      1.00      1.00        10
           Alimentation Couche-Tard       1.00      1.00      1.00        10
                         Amazon.