# Wyszukiwanie podobnych tekstów w tabeli rekordów (ok. 0,6 mln wierszy)

### Wykorzystane narzędzia i techniki:
### - Pandas  https://en.wikipedia.org/wiki/Pandas_%28software%29
### - Wektoryzacja TF/IDF  https://pl.wikipedia.org/wiki/TFIDF
### - N-gramy https://pl.wikipedia.org/wiki/N-gram
### - Cosine similarity (podobieństwo kosinusowe) jako miara podobieństwa między wektorami https://en.wikipedia.org/wiki/Cosine_similarity


Na podstawie 
"Super Fast String Matching in Python"
https://bergvca.github.io/2017/10/14/super-fast-string-matching.html

Funkcja sparse_dot_topn jest napisana w Cythonie i wymaga lokalnej kompilacji:
https://github.com/ing-bank/sparse_dot_topn

W tym notatniku zdefiniowany jest tylko lokalny wrapper wokół tej funkcji.

In [1]:
import pandas as pd

# Show full company names instead of truncating long strings in displayed
# frames. `None` means "no limit"; the negative sentinel (-1) has been
# deprecated since pandas 1.0 in favor of `None`.
pd.set_option('display.max_colwidth', None)
names =  pd.read_csv('data/sec__edgar_company_info.csv')
print('The shape: %d x %d' % names.shape)
names.head()

The shape: 663000 x 3


Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
1,2,"#1 A LIFESAFER HOLDINGS, INC.",1509607
2,3,#1 ARIZONA DISCOUNT PROPERTIES LLC,1457512
3,4,#1 PAINTBALL CORP,1433777
4,5,$ LLC,1427189


In [2]:
import re

def ngrams(string, n=3):
    """Return every run of ``n`` consecutive characters of the cleaned input.

    Before slicing, the characters space, ',', '-', '.', '/' are removed
    (inside the character class ``,-.`` is a range that covers exactly
    ',', '-' and '.'), as is a whitespace character followed by "BD".
    A cleaned string shorter than ``n`` (or ``n`` <= 0) yields an empty list.
    """
    cleaned = re.sub(r'[ ,-./]|\sBD', r'', string)
    if n <= 0:
        return []
    window_count = len(cleaned) - n + 1
    return [cleaned[start:start + n] for start in range(window_count)]

# Sanity check: display the trigrams produced for a short sample word.
print('All 3-grams in "McDonalds":')
ngrams('McDonalds')

All 3-grams in "McDonalds":


['McD', 'cDo', 'Don', 'ona', 'nal', 'ald', 'lds']

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Vectorize every company name with TF-IDF, using the character n-grams
# returned by `ngrams` as the tokens (instead of words).
company_names = names['Company Name']
vectorizer = TfidfVectorizer(analyzer=ngrams, min_df=1)
tf_idf_matrix = vectorizer.fit_transform(company_names)

In [5]:
# Dimensions of the fitted matrix: one row per company name, one column per
# distinct n-gram in the vectorizer's vocabulary.
tf_idf_matrix.shape

(663000, 34860)

In [6]:
# Print the stored entries of the first name's TF-IDF vector.
print(tf_idf_matrix[0])

  (0, 12)	0.843178066011
  (0, 18655)	0.513342599768
  (0, 17933)	0.159781489106


In [7]:
from sklearn.metrics.pairwise import linear_kernel
# Compare the first company name against every name in the table, then list
# the four highest-scoring row positions (best first) with their scores.
first_row = tf_idf_matrix[0:1]
cosine_similarities = linear_kernel(first_row, tf_idf_matrix).flatten()
ranked_best_first = cosine_similarities.argsort()[::-1]
related_docs_indices = ranked_best_first[:4].flatten()
print(related_docs_indices)
print(cosine_similarities[related_docs_indices])


[     0 314941 315004 402247]
[ 1.          0.32937889  0.32505872  0.31667197]


In [8]:
# Show the rows found by the similarity search above. The positions come from
# `related_docs_indices` rather than literals copied from an earlier run's
# printed output, so the cell stays correct if the data or model changes.
# Matrix rows are positional, hence `iloc`.
names.iloc[related_docs_indices]

Unnamed: 0,Line Number,Company Name,Company CIK Key
0,1,!J INC,1438823
314941,314942,JIN LI,1566263
315004,315005,JINTI INC,1355948
402247,402248,"MONJ, INC.",1695884


In [9]:
import numpy as np
from scipy.sparse import csr_matrix
import sparse_dot_topn.sparse_dot_topn as ct

def awesome_cossim_top(A, B, ntop, lower_bound=0):
    """Sparse product ``A @ B`` that keeps at most ``ntop`` entries per row.

    Thin wrapper around the compiled ``sparse_dot_topn`` routine (reached
    through the module alias ``ct``): it converts both operands to CSR,
    allocates the output buffers, lets the compiled routine fill them and
    wraps the buffers in a CSR matrix.

    Parameters
    ----------
    A : scipy sparse matrix of shape (M, K)
    B : scipy sparse matrix of shape (K, N)
    ntop : int
        Maximum number of result entries stored per row.
    lower_bound : number, default 0
        Forwarded unchanged to ``sparse_dot_topn`` (presumably the score a
        product must exceed to be kept -- confirm against the library docs).

    Returns
    -------
    scipy.sparse.csr_matrix of shape (M, N)

    Raises
    ------
    ValueError
        If the inner dimensions of ``A`` and ``B`` differ.
    """
    # Force A and B into CSR; this is free when they already are CSR.
    A = A.tocsr()
    B = B.tocsr()
    M, inner_a = A.shape
    inner_b, N = B.shape

    # Reject mismatched operands here, before raw index arrays are handed to
    # compiled code.
    if inner_a != inner_b:
        raise ValueError(
            'inner dimensions do not match: A is %r, B is %r' % (A.shape, B.shape))

    idx_dtype = np.int32

    # Upper bound on stored results: ntop entries for each of the M rows.
    # (The original referenced an undefined name `ntopr` here.)
    nnz_max = M * ntop

    # Output buffers in CSR layout, filled in place by the compiled routine.
    indptr = np.zeros(M + 1, dtype=idx_dtype)
    indices = np.zeros(nnz_max, dtype=idx_dtype)
    data = np.zeros(nnz_max, dtype=A.dtype)

    ct.sparse_dot_topn(
        M, N, np.asarray(A.indptr, dtype=idx_dtype),
        np.asarray(A.indices, dtype=idx_dtype),
        A.data,
        np.asarray(B.indptr, dtype=idx_dtype),
        np.asarray(B.indices, dtype=idx_dtype),
        B.data,
        ntop,
        lower_bound,
        indptr, indices, data)

    return csr_matrix((data, indices, indptr), shape=(M, N))

In [10]:
import time
# Self-join: multiply the TF-IDF matrix by its own transpose, keeping up to
# 10 results per name with lower_bound=0.8, and report the wall-clock time.
t1 = time.time()
matches = awesome_cossim_top(
    tf_idf_matrix,
    tf_idf_matrix.T,
    ntop=10,
    lower_bound=0.8,
)
t = time.time() - t1
print("SELFTIMED:", t)


SELFTIMED: 6830.938807249069


In [12]:
def get_matches_df(sparse_matrix, name_vector, top=100):
    """Turn a sparse similarity matrix into a table of matched name pairs.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Entry (i, j) is the similarity between item i and item j.
    name_vector : sequence or pandas Series
        Names in the same positional order as the matrix rows/columns.
    top : int or None, default 100
        Maximum number of pairs to return, taken in the matrix's storage
        order. A falsy value (``0`` / ``None``) returns every stored pair.
        A value larger than the number of stored pairs is clamped instead
        of raising ``IndexError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_side``, ``right_side`` and ``similairity`` (the
        column name's spelling is kept because callers filter on it).
    """
    # Take coordinates and values from the same COO view and filter them with
    # the same mask, so a score can never be paired with the wrong (row, col)
    # when the matrix stores explicit zeros.
    coo = sparse_matrix.tocoo()
    is_nonzero = coo.data != 0
    sparserows = coo.row[is_nonzero]
    sparsecols = coo.col[is_nonzero]
    scores = coo.data[is_nonzero]

    available = scores.size
    if top:
        nr_matches = max(0, min(int(top), available))
    else:
        nr_matches = available

    # Matrix indices are positions, so look names up positionally.
    names_by_position = np.asarray(name_vector, dtype=object)

    return pd.DataFrame({'left_side': names_by_position[sparserows[:nr_matches]],
                         'right_side': names_by_position[sparsecols[:nr_matches]],
                         'similairity': scores[:nr_matches].astype(float)})

In [14]:
# Take the first 100000 matched pairs and keep only those whose similarity is
# below 0.99999, i.e. drop the exact matches.
matches_df = (
    get_matches_df(matches, company_names, top=100000)
    .loc[lambda pairs: pairs['similairity'] < 0.99999]
)
matches_df.head(20)

Unnamed: 0,left_side,right_side,similairity
15,"0210, LLC",90210 LLC,0.853456
19,03 ENTERTAINMENT GROUP INC,O3 ENTERTAINMENT GROUP INC,0.810585
168,1 800 MUTUALS ADVISOR SERIES,1 800 MUTUALS ADVISORS SERIES,0.947396
170,1 800 MUTUALS ADVISORS SERIES,1 800 MUTUALS ADVISOR SERIES,0.947396
179,1 FINANCIAL MARKETPLACE SECURITIES LLC /BD,"1 FINANCIAL MARKETPLACE SECURITIES, LLC",0.9507
181,"1 FINANCIAL MARKETPLACE SECURITIES, LLC",1 FINANCIAL MARKETPLACE SECURITIES LLC /BD,0.9507
192,1 USA V ACQUISITION CORP,SAV ACQUISITION CORP,0.858988
345,"1060 CAPITAL OPPORTUNITY FUND, LP","1060 CAPITAL OPPORTUNITY FUND, LTD",0.960713
347,"1060 CAPITAL OPPORTUNITY FUND, LTD","1060 CAPITAL OPPORTUNITY FUND, LP",0.960713
409,11 MADISON INVESTOR II LLC,11 MADISON INVESTOR LLC,0.848684


In [None]:
# Preview the first 20 near-duplicate pairs (same expression as the end of the
# preceding cell; this cell has no recorded execution).
matches_df.head(20)