In [1]:
import datapackage
#import gensim
import pandas as pd
import re

package=datapackage.Package('http://next.obudget.org/datapackages/procurement/spending/latest-contract-spending/datapackage.json')
response = package.resources[0]
iterator = response.iter(keyed=True)

df=pd.DataFrame()

items = []
counter = 0
for row in iterator:
    for column_name in list(row.keys()):
        if column_name not in df.columns:
            df.insert(column=column_name,loc=len(df.columns),value=None)
    items.append(row)
    counter+=1
    if counter % 10000 == 0:
        print("downloaded: ",counter," items")
print("downloaded: ",counter," items")

df = pd.DataFrame(items)

# add a row index to df
df['doc_index'] = df.index    


downloaded:  10000  items
downloaded:  20000  items
downloaded:  30000  items
downloaded:  40000  items
downloaded:  50000  items
downloaded:  60000  items
downloaded:  70000  items
downloaded:  80000  items
downloaded:  90000  items
downloaded:  100000  items
downloaded:  110000  items
downloaded:  120000  items
downloaded:  130000  items
downloaded:  140000  items
downloaded:  150000  items
downloaded:  160000  items
downloaded:  170000  items
downloaded:  180000  items
downloaded:  190000  items
downloaded:  200000  items
downloaded:  210000  items
downloaded:  220000  items
downloaded:  230000  items
downloaded:  240000  items
downloaded:  250000  items
downloaded:  260000  items
downloaded:  270000  items
downloaded:  280000  items
downloaded:  290000  items
downloaded:  300000  items
downloaded:  310000  items
downloaded:  320000  items
downloaded:  330000  items
downloaded:  340000  items
downloaded:  350000  items
downloaded:  360000  items
downloaded:  370000  items
downloaded

In [2]:
# tokenize verbal column, add meta information without modifying it
def tokenize(row):
    # add here the columns you would like to index
    columns_list = ['explanation','purpose'] 
    meta_columns_list = ['budget_code','budget_title','buyer_description','exemption_reason','publisher','purchase_method','purchasing_unit','supplier_name']

    tokenized = []
    for column_name in columns_list:
        tokenized+=tokenize_text(row[column_name])
    for meta_column in meta_columns_list:
        tokenized+=[meta_column+": "+str(row[meta_column])]
    return ", ".join(tokenized)


def tokenize_text(text):    # tokenie textual documents
                            # get rid of non alpha numberics, split to words in selected columns
    regxlst = [re.compile(x) for x in [r'\d+',]]    # regular expressions list for clean ups: 
                                                             # 1. only digits    
    stringed_text = str(text)
    ignore_signs = ["\,","\:","\;","\.","\&","\$","\-","\=","\(","\)","\d+","\\n"]
    cleaned_1 = re.sub("|".join(ignore_signs),"",stringed_text) # remove non-alphanumberic characters
    cleaned_2 = re.sub("  "," ",cleaned_1)                # no more double spaces
    cleaned_3 = cleaned_2.split()                       # split into separate words list
    cleaned_4 =  [word for regex in regxlst for word in cleaned_3 if not(regex.match(word))] # filter by regular expressions
    cleaned_5 = [word for word in cleaned_4 if word is not None]
    return cleaned_5

# creata words list per doc, add to Dataframe ('tokenized')

df['tokenized'] = df.apply(tokenize,axis=1)

In [3]:
import gensim
import numpy
gen_docs = [[ word for word in doc.split(", ") if (word!= "" and word!='None')] for doc in df['tokenized']]
dictionary = gensim.corpora.Dictionary(gen_docs)

print("Number of words in dictionary:",len(dictionary))

corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)

Number of words in dictionary: 110783
TfidfModel(num_docs=422342, num_nnz=5331017)


In [4]:
sims = gensim.similarities.Similarity('',
                                      tf_idf[corpus],
                                      num_features=len(dictionary))

In [124]:
expected_results_num = 10 # set how many (maximum) similar entities are wanted

def entity_id_to_df_index(column_name, id_str):   #catch the index number of the desired record by a chosen column                   
    return df.index[df[column_name] == id_str]

def index_to_entity_id(column_name, row_index):   #catch the id (by a chosen column) of a row indexy                   
    return df.iloc[row_index][column_name]


def add_id_to_result(column_name,result):             #tranform 2 elements tuples (row index, rank) to 3 elements tuples (row index, id, rank)
    to_list = list(result)
    to_list.insert(1, index_to_entity_id(column_name,result[0]))
    return tuple(to_list)
    
def find_similiar(column_name,id): # choose a Dataframe row index to query
    query_index = entity_id_to_df_index(column_name,id)
    query_doc = tokenize(df.iloc[query_index]).split(", ")
    # print(query_doc)
    query_doc_bow = dictionary.doc2bow(query_doc)
    # print(query_doc_bow)
    query_doc_tf_idf = tf_idf[query_doc_bow]
    # print(query_doc_tf_idf)

    results = sims[query_doc_tf_idf]                   # get the similarity ranks
    results = sorted(enumerate(results), key=lambda item: -item[1])[0:expected_results_num] # sort by relevancy
    
    results = [add_id_to_result(column_name,result) for result in results if result[1] > 0]  # add the result entity id + filter out results with 0 correlation
    
    print(len(results),"results:\n ",results,"\n")
    
    return results

In [135]:
query_index = "4501007357"
results = find_similiar("order_id",query_index)



print("queried item: \n ----------")
print(df.iloc[results[0][0]])
print(" \n ---- similiar: ---- \n ")
print(df.iloc[results[2][0]])

5 results:
  [(63079, '4501089884', 0.5420446), (264417, '4501347241', 0.5420446), (6234, '4501007357', 0.5315435), (193116, '4501258714', 0.50989056), (193117, '4501258714', 0.48782334)] 

queried item: 
 ----------
amount_contracts                                                       3275.89
budget_code                                                         0024400910
budget_title                                                       רכש - קניות
buyer_description                                          [בריאות-גזברות ת"א]
company_id                                                                None
currency                                                                   ILS
end_date                                                            2015-11-30
entity_id                                                            511076572
entity_kind                                                            company
entity_name                                                 פלאפון תקשור

In [131]:
results[0][0]

63079

Int64Index([6234], dtype='int64')