In [108]:
class Record_Linkage(object):
    
    
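    '''
    Record-linkage pipeline that matches the records of a sample/test CSV (test_data)
    against the records of a master-data CSV (golden_data).

    Besides the two CSV inputs, the constructor accepts:
      - match_threshold_fast_string: similarity threshold used by the fast string search
      - first_iter_lim: limit on the number of choices considered in the first pass
        (1 for a one-shot best match, or more)
      - ngram_no: the n-gram count (size) used when tokenising strings
      - match_score_fast_string: score a fast-string match must reach to be accepted
      - ensemble_string_split_lim: the ensemble pass splits each string into growing
        groups of words, starting from this word count
      - top_suggestions: number of top suggestions requested per lookup
    '''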
    
    def __init__(self, test_data, golden_data, match_threshold_fast_string=0.45, first_iter_lim=1, ngram_no=2,
                 match_score_fast_string=0.85, ensemble_string_split_lim=3, top_suggestions=10,
                 small_test_size=100, small_golden_size=1000):
        '''
        Load both CSV files, clean them and build the lookup structures used by the matching passes.

        Parameters
        ----------
        test_data : path (or buffer) handed to ``pandas.read_csv`` for the sample/test records.
            The columns eu_name, eu_address1..4, eu_province, eu_city, eu_zip_code, eu_state,
            eu_country and eu_mcn are referenced below.
        golden_data : path (or buffer) handed to ``pandas.read_csv`` for the master records.
            The file must have exactly 16 columns because they are renamed positionally.
        match_threshold_fast_string, first_iter_lim, ngram_no, match_score_fast_string,
        ensemble_string_split_lim, top_suggestions : stored unchanged on the instance.
        small_test_size, small_golden_size : maximum number of entries copied into
            ``test_dict_small`` / ``golden_dict_small``. Shorter inputs simply yield smaller
            dictionaries (the previous hard-coded 100 / 1000 raised IndexError on small files).

        Side effects
        ------------
        Writes ``test_data_small.txt`` and ``golden_data_small.txt`` to the working directory.
        After construction ``self.test_data`` and ``self.golden_data`` no longer hold the
        inputs: they are replaced by the concatenated record strings.
        '''
        # Imports are kept local (as in the original design); the modules other methods
        # need are exposed through instance attributes below.
        import copy
        import gc
        import re
        import time

        import numpy as np
        import pandas as pd
        from fuzzywuzzy import process
        from sklearn.feature_extraction.text import TfidfVectorizer

        self.test_data = test_data
        self.golden_data = golden_data

        self.top_suggestions = top_suggestions
        self.ensemble_string_split_lim = ensemble_string_split_lim
        self.match_score_fast_string = match_score_fast_string
        self.match_threshold_fast_string = match_threshold_fast_string
        self.ngram_no = ngram_no
        self.first_iter_lim = first_iter_lim
        self.re = re
        self.TfidfVectorizer = TfidfVectorizer
        self.copy = copy
        self.time = time
        self.process = process
        self.gc = gc
        self.gc.collect()
        self.sample_csv = pd.read_csv(self.test_data)
        self.golden_csv = pd.read_csv(self.golden_data)

        # NOTE: the column names are deliberately wrapped in a nested list. The code below
        # relies on column selections coming back two-dimensional (see the ``row[0]`` lookups
        # and the flattening of ``golden_data``), so do not "simplify" this to a flat list
        # without adapting those places as well.
        self.golden_csv.columns = [[
            'name', 'mcn-id', 'existing-or-new', 'address1', 'address2', 'city', 'zip', 'country',
            'temp1', 'temp2', 'temp3', 'temp4', 'temp5', 'website', 'temp6', 'temp7',
        ]]

        # Drop the placeholder columns one at a time, exactly as before.
        golden_cleaned = self.golden_csv
        for placeholder in ('temp1', 'temp2', 'temp3', 'temp4', 'temp5', 'temp6', 'temp7'):
            golden_cleaned = golden_cleaned.drop(placeholder, axis=1)
        self.golden_csv_cleaned = golden_cleaned

        # Fill gaps with fixed dummy values so the string concatenation below has no holes.
        golden_defaults = (
            ('existing-or-new', 'EC-Existing'),
            ('city', 'TX'),
            ('zip', '11111'),
            ('website', 'www.google.com'),
        )
        for column, default in golden_defaults:
            self.golden_csv_cleaned[column] = self.golden_csv_cleaned[column].fillna(default)

        # Sample data: remove the unused address columns, then every row with a missing value.
        sample_cleaned = self.sample_csv
        for unused in ('eu_address2', 'eu_address3', 'eu_address4', 'eu_province'):
            sample_cleaned = sample_cleaned.drop(unused, axis=1)
        self.sample_csv_cleaned = sample_cleaned.dropna(axis=0, how='any')

        # One space-separated string per sample record.
        sample_parts = ('eu_name', 'eu_address1', 'eu_city', 'eu_zip_code', 'eu_state', 'eu_country')
        name_address = self.sample_csv_cleaned[sample_parts[0]]
        for part in sample_parts[1:]:
            name_address = name_address + ' ' + self.sample_csv_cleaned[part]
        self.sample_csv_cleaned['name_address'] = name_address

        # One space-separated string per golden record.
        golden_parts = ('name', 'address1', 'address2', 'city', 'zip', 'country')
        golden_text = self.golden_csv_cleaned[golden_parts[0]].values
        for part in golden_parts[1:]:
            golden_text = golden_text + ' ' + self.golden_csv_cleaned[part].values

        self.test_arr = []
        self.mcn_arr_test = list(self.sample_csv_cleaned['eu_mcn'].values)
        self.mcn_arr_golden = self.golden_csv_cleaned['mcn-id'].values.tolist()
        # Going through a list first keeps the string dtype the original element-wise loop produced.
        self.train_arr = np.array(list(self.sample_csv_cleaned['name_address'].values))

        np.savetxt(r'test_data_small.txt', self.train_arr, fmt='%s', encoding='utf-8')
        np.savetxt(r'golden_data_small.txt', golden_text, fmt='%s', encoding='utf-8')

        self.golden_data = golden_text.tolist()
        self.golden_flat_list = [item for sublist in self.golden_data for item in sublist]

        self.test_data = self.train_arr.tolist()

        # MCN -> record string lookups (a repeated MCN keeps its last record).
        self.test_dict = dict(zip(self.mcn_arr_test, self.train_arr))
        self.golden_dict = {
            row[0]: text for row, text in zip(self.mcn_arr_golden, self.golden_flat_list)
        }

        # Reduced dictionaries; slicing clamps to the available number of records.
        small_test = max(0, small_test_size)
        self.test_dict_small = dict(zip(self.mcn_arr_test[:small_test], self.train_arr[:small_test]))

        small_golden = max(0, small_golden_size)
        self.golden_dict_small = {
            row[0]: text
            for row, text in zip(self.mcn_arr_golden[:small_golden], self.golden_flat_list[:small_golden])
        }
    
    def by_size(self,words, size):
        return [word for word in words if len(word.split(' ')) >= size]
    
    def match_first_iteration(self):
        
        self.gc.collect()
        choices = self.golden_dict.values()
        self.score= 0
        found_flag=0
        self.not_found=[]
        for i in range(0, len(self.test_data)-1):   # change this to len(self.test_data)
            found_flag=0
            chlist=self.process.extract(self.test_data[i], choices, limit=self.first_iter_lim)
            print("{},   --test--{}".format(self.test_data[i], self.mcn_arr_test[i]))

            for mcn,value in self.golden_dict.items():
                for j in range(0,self.first_iter_lim):
                    if value == chlist[j][0]:
                        if self.mcn_arr_test[i] == mcn:
                            self.score+=1

                            print("{},   --MDM--{}".format(chlist[j][0], mcn))
                            print("===================================================")
                            found_flag=1
                            break
            if found_flag == 0:
                self.not_found.append(i)
        print("==/==/===/===/===/===/===/===/====/===/==/==/==/==/===/===/===/===/==")
        print("==/==/===/===/===/===/===/===/====/===/==/==/==/==/===/===/===/===/==")
        print("Total matches found are" ,self.score)        
        print('accuracy is', self.score/len(self.test_data))
        print('indexes of unmatched samples are', self.not_found)
        
    
    
    def mismatched_first_iter(self):
        
        self.not_matched = []
        for i in self.not_found:
            self.not_matched.append(self.train_arr[i])
            
        
        self.not_matched_test  = self.copy.copy(self.not_matched)
        
        for i in range(0, len(self.golden_flat_list)):
            self.not_matched.append(self.golden_flat_list[i])
            
        
    def get_matches_df(self,sparse_matrix, name_vector, top=1000):
        import numpy as np
        import pandas as pd
        non_zeros = sparse_matrix.nonzero()

        sparserows = non_zeros[0]
        sparsecols = non_zeros[1]

        if top:
            nr_matches = top
        else:
            nr_matches = sparsecols.size

        left_side = np.empty([nr_matches], dtype=object)
        right_side = np.empty([nr_matches], dtype=object)
        similairity = np.zeros(nr_matches)

        for index in range(0, nr_matches):
            left_side[index] = name_vector[sparserows[index]]
            right_side[index] = name_vector[sparsecols[index]]
            similairity[index] = sparse_matrix.data[index]

        return pd.DataFrame({'left_side': left_side,
                              'right_side': right_side,
                               'similairity': similairity})
    
    

    def ngrams_partial(self,string, n=None):
        if n is None:
            n= self.ngram_no
        string = self.re.sub(r'[,-./]|\sBD',r'', string)
        self.ngrams = zip(*[string[i:] for i in range(n)])
        return [''.join(self.ngram) for self.ngram in self.ngrams]


    def vectorize(self):
        
        self.mismatched_first_iter()
        self.company_names = self.not_matched
        vectorizer = self.TfidfVectorizer(min_df=1, analyzer=self.ngrams_partial)
        self.tf_idf_matrix_partial = vectorizer.fit_transform(self.company_names)
        
        
    
    def awesome_cossim_top(self,A, B, ntop, lower_bound=0):
        import numpy as np
        from scipy.sparse import csr_matrix
        import sparse_dot_topn.sparse_dot_topn as ct
        # force A and B as a CSR matrix.
        # If they have already been CSR, there is no overhead
        A = A.tocsr()
        B = B.tocsr()
        M, _ = A.shape
        _, N = B.shape

        idx_dtype = np.int32

        nnz_max = M*ntop

        indptr = np.zeros(M+1, dtype=idx_dtype)
        indices = np.zeros(nnz_max, dtype=idx_dtype)
        data = np.zeros(nnz_max, dtype=A.dtype)

        ct.sparse_dot_topn(
            M, N, np.asarray(A.indptr, dtype=idx_dtype),
            np.asarray(A.indices, dtype=idx_dtype),
            A.data,
            np.asarray(B.indptr, dtype=idx_dtype),
            np.asarray(B.indices, dtype=idx_dtype),
            B.data,
            ntop,
            lower_bound,
            indptr, indices, data)

        return csr_matrix((data,indices,indptr),shape=(M,N))

       
    def match_second_iteration(self):
        import pandas as pd
        t1 = self.time.time()
        self.vectorize()
        matches_partial = self.awesome_cossim_top(self.tf_idf_matrix_partial, self.tf_idf_matrix_partial.transpose(), 40, self.match_threshold_fast_string)
        t = self.time.time()-t1
        print("SELFTIMED:", t)
        
        self.matches_partial_df = self.get_matches_df(matches_partial, self.company_names,len(self.company_names))
        self.matches_partial_df = self.matches_partial_df[self.matches_partial_df['similairity'] <= 0.99999] # Remove all exact matches
        
        self.matches_partial_df1 = self.matches_partial_df[self.matches_partial_df['similairity']>=self.match_score_fast_string]
        self.matches_partial_df2 = self.matches_partial_df1.drop_duplicates('left_side', keep='first')
        self.matches_partial_filtered_df = self.matches_partial_df2.drop_duplicates('right_side', keep='first')
        
        self.golden_partial_mcn_list = []
        
        for i in range(0, len(self.matches_partial_filtered_df)):

            if self.matches_partial_filtered_df['right_side'].tolist()[i] in self.golden_dict.values():

                for mcn, value in self.golden_dict.items():

                    if (value == self.matches_partial_filtered_df['right_side'].tolist()[i]):

                        self.golden_partial_mcn_list.append(mcn)

                        break

            else:
                self.golden_partial_mcn_list.append(0)
                
        self.test_partial_mcn_list = []
        for i in range(0, len(self.matches_partial_filtered_df)):

            if self.matches_partial_filtered_df['left_side'].tolist()[i] in self.test_dict.values():

                for mcn, value in self.test_dict.items():

                    if (value == self.matches_partial_filtered_df['left_side'].tolist()[i]):

                        self.test_partial_mcn_list.append(mcn)

                        break
            
            else:
                self.test_partial_mcn_list.append(0)
                
                
        self.gc.collect()
        self.score_2 = 0
        self.score_list=[]
        self.matched_list_step2=[]
        for i in range(0, len(self.test_partial_mcn_list)):

            if self.test_partial_mcn_list[i] == self.golden_partial_mcn_list[i]:
                if self.test_partial_mcn_list[i] != 0:
                    self.score_2+=1
                    # score_list is for removing dumplicates from the final matched data
                    self.score_list.extend(([self.test_partial_mcn_list[i], self.matches_partial_filtered_df['left_side'].tolist()[i]]))
                    print("MCN Number in test data is {} and test sample is {}".format(self.test_partial_mcn_list[i], self.matches_partial_filtered_df['left_side'].tolist()[i]))
                    print("MCN Number in golden data is {} and golden sample is {}".format(self.golden_partial_mcn_list[i], self.matches_partial_filtered_df['right_side'].tolist()[i]))
                    print("score is {}".format(self.matches_partial_filtered_df['similairity'].tolist()[i]))
                    print('================================================================================') 
                    self.matched_list_step2.append(self.matches_partial_filtered_df['left_side'].tolist()[i])
        #         if test_partial_mcn_list[i] == test_partial_mcn_list[i+1]:

        #             continue

        # exclude zero from the list
        self.test_partial_mcn_nonzero_list = [num for num in self.test_partial_mcn_list if num] 
        try:
            self.accuracy = 0.5*len(set(self.score_list))/(len(self.test_partial_mcn_nonzero_list)) *100
        except:
            self.accuracy = 0
            

        print('====================================================')   
        print('====================================================')  
        print("Accuracy on mismatched data is {} %".format(self.accuracy))
        
    
    
    def mismatched_second_iter(self):
        
        self.matched_list_step2 = list(set(self.matched_list_step2))
        
        self.not_matched_test_step2 = [item for item in self.not_matched_test if item not in self.matched_list_step2]
        
        self.test_partial_mcn_list_step2 = []
        for i in range(0, len(self.not_matched_test_step2)):
            for mcn, value in self.test_dict.items():

                if (value == self.not_matched_test_step2[i]):


                    self.test_partial_mcn_list_step2.append(mcn)

                    break
        
    def match_third_iteration_ensemble(self):
        
        from collections import Counter
        t1 = self.time.time()
        self.mismatched_second_iter()
        # import pdb; pdb.set_trace()
        self.test_data_split_features = []
        for k in range(0, len(self.not_matched_test_step2)):
            self.test_data_split_features_k = []   
            z= self.not_matched_test_step2[k].split(' ')
            for i in range(0, len(z)+1):
                temp = z[0:-len(z)+i]
                if(i==len(z)):
                    temp = z
                temp = ' '.join(temp)
                self.test_data_split_features_k.append(temp)

            self.test_data_split_features_k = list(filter(None, self.test_data_split_features_k))
            self.test_data_split_features_k = self.by_size(self.test_data_split_features_k, self.ensemble_string_split_lim)
            self.test_data_split_features.append(self.test_data_split_features_k)
        print(self.test_data_split_features)

        choices = self.golden_dict_small.values()
        self.score_3= 0
        found_flag=0
        self.not_found={}
        self.accuracy_list=[]
        
        
        self.gc.collect()
        self.not_found_iteration = []
        for k in range(0, len(self.test_data_split_features)):

            self.chlists = []
            for i in range(0, len(self.test_data_split_features[k])-1):

                found_flag=0
                chlist=self.process.extract(self.test_data_split_features[k][i], choices, limit=self.top_suggestions)
                self.chlists.append(chlist)

            self.chlists_flat_list = [item for sublist in self.chlists for item in sublist]
            b  = dict(Counter(self.chlists_flat_list))
            # to get the max count value of dict in order to find most voted choice
            maxValue = max(b.values())  #<-- max of values
            self.screened_max_list = [key for key in b if b[key]==maxValue]

            # to get the most common count value of dict in order to find most frequent choice
            track={}
            for key,value in b.items():
                if value not in track:
                    track[value]=0
                else:
                    track[value]+=1

            common_val= max(track,key=track.get)
            self.screened_common_list = [key for key in b if b[key]==common_val]
        #     import pdb; pdb.set_trace()
            self.screened_list = [self.screened_common_list, self.screened_max_list]
            self.screened_list = [item for sublist in self.screened_list for item in sublist]
            #     print("{},   --test--{}".format(test_data[i], mcn_arr_test[i]))

            for mcn,value in self.golden_dict_small.items():
        #         import pdb;pdb.set_trace()
                for j in range(0,len(self.screened_list)):
                    if value == self.screened_list[j][0]:
                        if self.mcn_arr_test[k] == mcn:
                            self.score_3+=1
                            print('Done for test sample', k)
                            t = self.time.time()-t1
                            print("SELFTIMED:", t)

        #                     print("{},   --MDM--{}".format(chlist[j][0], mcn))
        #                     print("===================================================")
                            found_flag=1
                            break
            if found_flag == 0:
                self.not_found_iteration.append(k)
        #         not_found[k] = not_found_iteration

            self.accuracy = 100* self.score_3/len(self.test_data_split_features)
            self.accuracy_list.append(self.accuracy)

        print("Total matches found are {} out of {}".format(self.score_3, len(self.test_data_split_features)))        
        print('accuracy is', self.accuracy_list, '%')
        print('mismatched sample indexes in test data are:')
        print(self.not_found_iteration)
        print("===================================================")
        print('We are done with 1st level of matching')
        
        

                




In [98]:
# Build the linker from the two CSV files. Compared with the constructor defaults this run
# uses 3-character n-grams, a 0.75 acceptance score for the fast string match, prefixes of
# at least 2 words in the ensemble pass and 15 suggestions per lookup.
obj = Record_Linkage('test_data.csv', 'golden_data.csv',match_threshold_fast_string=0.45, 
                     first_iter_lim=1, ngram_no=3, match_score_fast_string=0.75, 
                     ensemble_string_split_lim=2, top_suggestions=15 )

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


In [100]:
# Pass 1: fuzzy-match each test record against the golden records and print the matches.
obj.match_first_iteration()

Barwick Cpa 218 Fulford Ave Bel Air 210140000 MD US,   --test--MCN-47996044
Little Apple Toyota Honda 2828 Amherst Ave Manhattan 665023096 KS US,   --test--MCN-05952813
Little Apple Toyota Honda 2828 Amherst Ave Manhattan KS 66502 US,   --MDM--MCN-05952813
Adp Network Solutions 76002379 16177 Sw 155 Ave Miami 331870000 FL US,   --test--MCN-67486028
Adp Network Solutions 76002379 16177 Sw 155 Ave Miami FL 33187 US,   --MDM--MCN-67486028
Prestige Subaru Suzuki08320005 585 Tunnel Rd Asheville 288050000 NC US,   --test--MCN-09000630
Prestige Subaru Suzuki 08320005 585 Tunnel Rd Asheville NC 28805 US,   --MDM--MCN-09000630
WOLBERG ELECTRICAL SUPPLY COMPANY I 35 INDUSTRIAL PARK ROAD ALBANY 12206 NY US,   --test--MCN-30243660
Wolberg Electrical Supply Company I 35 Industrial Park Road Albany NY 12206 US,   --MDM--MCN-30243660
ST FRANCIS COUNTY CIRCUIT CLERK 313 S IZARD ST FORREST CITY 72335 AR US,   --test--MCN-81965570
St Francis County Circuit Clerk 313 S Izard St Forrest City AR 72335 US, 

CAROLINA ORTHOPAEDIC & SPORTS 2345 COURT DRIVE GASTONIA 28054 NC US,   --test--MCN-24247020
Carolina Orthopaedic & Sports 2345 Court Drive Gastonia NC 28054 US,   --MDM--MCN-24247020
STAR CONCRETE PUMPING 24223 HUFSMITH KOHRVILLE ROAD TOMBALL 77375 TX US,   --test--MCN-71550775
Star Concrete Pumping 24223 Hufsmith Kohrville Road Tomball TX 77375 US,   --MDM--MCN-71550775
MECKLENBURG COUNTY PUBLIC SCHOOLS 105 MAYFIELD DR BOYDTON 23917 VA US,   --test--MCN-80115450
WHITESIDE MANAGEMENT 2 HENRY ADAMS ST SUITE M-3 SAN FRANCISCO 941030000 CA US,   --test--MCN-09256637
Whiteside Management 2 Henry Adams St Suite M-3 San Francisco CA 94103 US,   --MDM--MCN-09256637
Next Step HealthCare, LLC 392 MAIN ST SACO 04072-1521 ME US,   --test--MCN-59249559
BRANDYWINE REALTY TRUST 555 E LANCASTER AVE WAYNE 19087 PA US,   --test--MCN-83098269
Brandywine Realty & Trust 555 E Lancaster Ave Radnor PA 19087 US,   --MDM--MCN-83098269
Concentric Advisors-hq 4030 Lake Washington Blvd Ne Ste 20 Kirkland 9803300

SUNPOWER INC 430 INDIO WAY SUNNYVALE 94085 CA US,   --test--MCN-04055926
STAFFORD DEVELOPEMENT INC 1805 US HIGHWAY 82 W TIFTON 317938165 GA US,   --test--MCN-53026954
SPEEDWAY MOTORS INC 340 VICTORY LN LINCOLN 68528 NE US,   --test--MCN-69524634
GLOBAL VALUE COMMERCE INC 7320 ACC BLVD RALEIGH 27617 NC US,   --test--MCN-40479299
NEWSOM EYE & LASER CENTER INC 13904 N DALE MABRY HWY STE 200 TAMPA 33618 FL US,   --test--MCN-38802425
Newsom Eye & Laser Center Inc 13904 N Dale Mabry Hwy Ste 200Accounts Payable / Cdw Rwx7180 /cpo Tampa FL 33618 US,   --MDM--MCN-38802425
F.J O'HARA & SONS, INC 7 FID KENNEDY AVE BOSTON 22102321 MA US,   --test--MCN-89641398
F.j. O Hara & Sons 7 Fid Kennedy Ave. Boston MA 2210 US,   --MDM--MCN-89641398
CRAFTWORKS 301 W MAIN ST CHATTANOOGA 37408 TN US,   --test--MCN-27714047
PRAVIS LLC 113 SAVANNAH LOOP MOUNTAIN VIEW 940433871 CA US,   --test--MCN-76272798
AMERIPRISE BANK, FSB 207 E LIVINGSTON ST ORLANDO 32801 FL US,   --test--MCN-50529301
VIBE CREDIT UNION 44575

In [101]:
# Pass 2: TF-IDF / cosine-similarity matching of the records pass 1 left unmatched.
obj.match_second_iteration()

SELFTIMED: 0.11500000953674316
MCN Number in test data is MCN-56591783 and test sample is Harwood International Inc 3130 N. Harwood Street Dallas 752010000 TX US
MCN Number in golden data is MCN-56591783 and golden sample is Harwood International Inc 2501 N Harwood St Ste 1400 Dallas TX 75201 US
score is 0.7686105953893523
Accuracy on mismatched data is 100.0 %


In [102]:
# Pass 3: prefix-voting fuzzy match for the records still unmatched after pass 2.
obj.match_third_iteration_ensemble()

[['Barwick Cpa', 'Barwick Cpa 218', 'Barwick Cpa 218 Fulford', 'Barwick Cpa 218 Fulford Ave', 'Barwick Cpa 218 Fulford Ave Bel', 'Barwick Cpa 218 Fulford Ave Bel Air', 'Barwick Cpa 218 Fulford Ave Bel Air 210140000', 'Barwick Cpa 218 Fulford Ave Bel Air 210140000 MD', 'Barwick Cpa 218 Fulford Ave Bel Air 210140000 MD US'], ['THE DAIRY', 'THE DAIRY FARM', 'THE DAIRY FARM COMPANY', 'THE DAIRY FARM COMPANY 6', 'THE DAIRY FARM COMPANY 6 MAROON', 'THE DAIRY FARM COMPANY 6 MAROON EXT', 'THE DAIRY FARM COMPANY 6 MAROON EXT BONITA', 'THE DAIRY FARM COMPANY 6 MAROON EXT BONITA HOMES', 'THE DAIRY FARM COMPANY 6 MAROON EXT BONITA HOMES CONCEPCION', 'THE DAIRY FARM COMPANY 6 MAROON EXT BONITA HOMES CONCEPCION MARIKINA', 'THE DAIRY FARM COMPANY 6 MAROON EXT BONITA HOMES CONCEPCION MARIKINA 1800', 'THE DAIRY FARM COMPANY 6 MAROON EXT BONITA HOMES CONCEPCION MARIKINA 1800 METRO', 'THE DAIRY FARM COMPANY 6 MAROON EXT BONITA HOMES CONCEPCION MARIKINA 1800 METRO MANILA', 'THE DAIRY FARM COMPANY 6 MAROON

Done for test sample 36
SELFTIMED: 828.9828898906708
Done for test sample 50
SELFTIMED: 1153.0615842342377
Total matches found are 2 out of 58
accuracy is [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7241379310344827, 1.7241379310344827, 1.7241379310344827, 1.7241379310344827, 1.7241379310344827, 1.7241379310344827, 1.7241379310344827, 1.7241379310344827, 1.7241379310344827, 1.7241379310344827, 1.7241379310344827, 1.7241379310344827, 1.7241379310344827, 1.7241379310344827, 3.4482758620689653, 3.4482758620689653, 3.4482758620689653, 3.4482758620689653, 3.4482758620689653, 3.4482758620689653, 3.4482758620689653, 3.4482758620689653] %
mismatched sample indexes in test data are:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,

In [103]:
# Show the candidate limit the first pass was configured with.
obj.first_iter_lim

1

In [104]:
# Show the acceptance score the second pass was configured with.
obj.match_score_fast_string

0.75