In [1]:
import pandas as pd
from helper import *

# Read only the WIN_NAME column of the 2020 contract award notices.
df = pd.read_csv('data/TED - Contract award notices 2020.csv', usecols=['WIN_NAME'])

# Drop missing names, then split every cell on '---' and give each piece its
# own row (the exploded rows keep the index label of the row they came from).
winner_names = (
    df['WIN_NAME']
    .dropna()
    .str.split('---')
    .explode()
)

#### Simple Preprocess

In [2]:
preprocessed_winner_names = preprocess(winner_names)

# Show the 'mediplus' name variants and their frequencies, first for the raw
# names and then for the preprocessed ones, to see how many spellings merged.
for names in (winner_names, preprocessed_winner_names):
    mentions_mediplus = names.apply(lambda name: isinclude(name, 'mediplus'))
    print(names[mentions_mediplus].value_counts())

MEDIPLUS EXIM                5436
MEDIPLUS EXIM S.R.L.          692
Mediplus Exim                 287
Mediplus Solutions              6
MEDIPLUS SOLUTIONS SRL          5
Mediplus Ltd                    4
Mediplus Krzysztof Wypych       2
MEDIPLUS SOLUTIONS              1
MEDIPLUS EXIM SRL               1
S.C. Mediplus Exim S.R.L.       1
Name: WIN_NAME, dtype: int64
mediplus exim                5723
mediplus exim srl             693
mediplus solutions              7
mediplus solutions srl          5
mediplus ltd                    4
mediplus krzysztof wypych       2
sc mediplus exim srl            1
Name: WIN_NAME, dtype: int64


#### Removing Common Words

In [3]:
# Collect frequent words (second argument 10 — presumably how many to keep;
# confirm against helper.most_common) and pass them to extract_sw together
# with the preprocessed names.
most_common_words = most_common(preprocessed_winner_names, 10)
extracted_preprocessed_winner_names = extract_sw(preprocessed_winner_names, most_common_words)

# Same 'mediplus' frequency comparison as before: names before the extraction
# step, then names after it.
for names in (preprocessed_winner_names, extracted_preprocessed_winner_names):
    mentions_mediplus = names.apply(lambda name: isinclude(name, 'mediplus'))
    print(names[mentions_mediplus].value_counts())

100%|██████████| 824267/824267 [16:45<00:00, 819.87it/s]  


[('z', 90547), ('sp', 90210), ('oo', 75885), ('srl', 65798), ('gmbh', 59425), ('doo', 53136), ('sa', 44315), ('ltd', 35706), ('s', 35541), ('in', 32980)]
mediplus exim                5723
mediplus exim srl             693
mediplus solutions              7
mediplus solutions srl          5
mediplus ltd                    4
mediplus krzysztof wypych       2
sc mediplus exim srl            1
Name: WIN_NAME, dtype: int64
mediplus exim                6416
mediplus solutions             12
mediplus                        4
mediplus krzysztof wypych       2
sc mediplus exim                1
Name: WIN_NAME, dtype: int64


#### Vectorization

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Work on distinct names only, and keep those whose string form is longer
# than 3 characters; the index is renumbered 0..n-1 afterwards so labels and
# positions coincide.
unique_winner_names = pd.Series(extracted_preprocessed_winner_names.unique())
unique_winner_names = unique_winner_names[
    unique_winner_names.apply(lambda name: len(str(name)) > 3)
].reset_index(drop=True)

# TF-IDF over the tokens produced by the helper's `ngrams` analyzer.
vectorizer = TfidfVectorizer(analyzer=ngrams, min_df=1)
vectorizer_fitted = vectorizer.fit(unique_winner_names)

# Transform the names with the fitted vectorizer through the helper wrapper.
tf_idf_matrix = transform_tqdm(unique_winner_names, vectorizer_fitted)

222895


22it [00:01, 11.03it/s]


#### Building Search Index Using Sklearn

In [5]:
from sklearn.neighbors import NearestNeighbors

# Nearest-neighbour index over the TF-IDF rows: 5 neighbours per query, all
# available cores (n_jobs=-1), and the estimator's default distance metric
# since none is passed.
neigh = NearestNeighbors(n_jobs=-1, n_neighbors=5)
neigh.fit(X=tf_idf_matrix)

In [7]:
# Query the fitted neighbour index with the TF-IDF matrix through the helper.
# NOTE(review): the positional arguments 1_000 and 5 are presumably a batch
# size and the number of neighbours per row — confirm against
# helper.search_tqdm.
distances_list, indexes_list = search_tqdm(tf_idf_matrix, neigh, 1_000, 5)
# Disabled on-disk cache of the search results: the first two lines would
# save both arrays as CSV, the last two would load them back (indexes cast to
# int) instead of re-running the search. Uncommenting requires `np` in scope.
# np.savetxt(fname='distances.csv', X=distances_list, delimiter=',')
# np.savetxt(fname='indexes.csv', X=indexes_list, delimiter=',')
# distances_list = np.loadtxt('distances.csv', delimiter=',')
# indexes_list = np.loadtxt('indexes.csv', delimiter=',').astype(int)

#### Search

In [8]:
# Maximum neighbour distance for two names to be treated as a match.
threshold = .6

# Convert the search results to arrays exactly once. The previous version
# rebuilt np.array(indexes_list) inside the comprehension for every candidate
# pair, which is loop-invariant work repeated hundreds of thousands of times.
distances = np.asarray(distances_list)
neighbor_indexes = np.asarray(indexes_list)

# (row, neighbour-column) positions whose distance is below the threshold.
closer_candidates_matrix = np.argwhere(distances < threshold)
candidate_rows = closer_candidates_matrix[:, 0]
candidate_cols = closer_candidates_matrix[:, 1]

# Neighbour column 0 is skipped (presumably each name's own entry — TODO
# confirm). Each remaining position is paired with the neighbour index stored
# there, in the same row-major order the element-wise loop produced.
keep = candidate_cols != 0
mapper = list(zip(
    candidate_rows[keep],
    neighbor_indexes[candidate_rows[keep], candidate_cols[keep]],
))

# Translate index pairs back to names, dropping pairs that point at themselves.
matchings = [(unique_winner_names[src], unique_winner_names[des]) for (src, des) in mapper if src != des]
print([matching for matching in matchings if 'mediplus' in matching[0]])

100%|██████████| 308711/308711 [00:46<00:00, 6605.91it/s]


[('mediplus exim', 'sc mediplus exim'), ('sc mediplus exim', 'mediplus exim')]
