In [1]:
from collections import defaultdict

import pandas as pd

# default value

word_freq = defaultdict(lambda: [0,0])

# how many times a word appears in the corpus
num_list = [200, 180, 170, 160,160]

# text
text_list = ['france', 'spain', 'spain beaches', 'france beaches', 'spain best beaches']



# loop over the text and the number
for text, num in zip(text_list, num_list):
    for word in text.split():
        word_freq[word][0] += 1
        word_freq[word][1] += num


columns = {0: 'abs_freq', 1:'wtd_freq'}

abs_wtd_df = pd.DataFrame.from_dict(word_freq,orient='index')\
            .rename(columns=columns) \
            .sort_values('wtd_freq', ascending=False) \
            .assign(rel_value=lambda df: df['wtd_freq'] / df['abs_freq'])\
            .round()


abs_wtd_df.insert(1, 'abs_perc', value=abs_wtd_df['abs_freq']/abs_wtd_df['abs_freq'].sum())
abs_wtd_df.insert(2, 'abs_perc_cum', abs_wtd_df['abs_perc'].cumsum())
abs_wtd_df.insert(4, 'wtd_freq_perc', abs_wtd_df['wtd_freq'] / abs_wtd_df['wtd_freq'].sum())
abs_wtd_df.insert(5, 'wtd_freq_perc_cum', abs_wtd_df['wtd_freq_perc'].cumsum())
abs_wtd_df.style.background_gradient(low=0, high=.8)


Unnamed: 0,abs_freq,abs_perc,abs_perc_cum,wtd_freq,wtd_freq_perc,wtd_freq_perc_cum,rel_value
spain,3,0.333333,0.333333,510,0.335526,0.335526,170
beaches,3,0.333333,0.666667,490,0.322368,0.657895,163
france,2,0.222222,0.888889,360,0.236842,0.894737,180
best,1,0.111111,1.0,160,0.105263,1.0,160


In [209]:
import pandas as pd

### load a datapackage and transform into a pandas dataframe 

import datapackage

package=datapackage.Package('http://next.obudget.org/datapackages/procurement/tenders/processed/datapackage.json')
response = package.resources[0]
iterator = response.iter(keyed=True)

df=pd.DataFrame()

items = []
counter = 0
for row in iterator:
    for column_name in list(row.keys()):
        if column_name not in df.columns:
            df.insert(column=column_name,loc=len(df.columns),value=None)
    items.append(row)
    counter+=1
    if counter % 10000 == 0:
        print("downloaded: ",counter," items")
print("downloaded: ",counter," items")

df = pd.DataFrame(items)




import json, re
from collections import defaultdict
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity



# add a row index to df
df['doc_index'] = df.index    

# get rid of non alpha numberics, split to words in selected columns
def tokenize(row):
    columns_list = (['description','entity_id']) 
    tokenized = []
    for column_name in columns_list:
        tokenized+=tokenize_text(row[column_name])
    return ", ".join(tokenized)




def tokenize_text(text):
    regxlst = [re.compile(x) for x in [r'\d+',]]    # regular expressions list for clean ups: 
                                                             # 1. only digits    
    stringed_text = str(text)
    ignore_signs = ["\,","\:","\;","\.","\&","\$","\-","\=","\(","\)","\d+","\\n"]
    cleaned_1 = re.sub("|".join(ignore_signs),"",stringed_text) # remove non-alphanumberic characters
    cleaned_2 = re.sub("  "," ",cleaned_1)                # no more double spaces
    cleaned_3 = cleaned_2.split(" ")                       # split into separate words list
    cleaned_4 =  [v for regex in regxlst for v in set(cleaned_3) if not(regex.match(v)) and not cleaned_3.remove(v)] # filter by regular expressions
    cleaned_5 = [word for word in cleaned_4 if word is not None]
    return cleaned_4

# creata words list per doc, add to Dataframe ('tokenized')

df['tokenized'] = df.apply(tokenize,axis=1)


downloaded:  10000  items
downloaded:  20000  items
downloaded:  30000  items
downloaded:  40000  items
downloaded:  50000  items
downloaded:  60000  items
downloaded:  70000  items
downloaded:  80000  items
downloaded:  90000  items
downloaded:  100000  items
downloaded:  110000  items
downloaded:  117414  items


In [210]:
### tfidf 
import operator


def create_words_index(texts_list):  # create words counter index
    def defaultvalue():   # set word counter to 0 
        return 0
    
    words_index = defaultdict(defaultvalue) # create an index with 0 as default
    
    for text in texts_list:
        words = text.split(", ")
        for word in words:
            words_index[word] += 1
    return words_index

# create a words counter index
data = df['tokenized'] 
words_index = create_words_index(data)

words_list = sorted(words_index.items(), key=operator.itemgetter(1), reverse=True) # words list by frequency order


In [83]:
# create a dataframe with words as features, each row is a doc vector
term_in_doc_index = pd.DataFrame(0, index=np.arange(len(df)), columns=words_index.keys())

def count_word(l, word):
    return l.count(word)

# count terms in corpus on index (terms_index)
for word in term_in_doc_index.keys():
    term_in_doc_index[word] = df['tokenized'].apply(count_word, args=[word])
    
    
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(terms_index)

tf_idf_matrix = tfidf.transform(terms_index)

def similiar_five_docs (row):  # check the 5 most similiar texts to the doc
    doc_index = row['doc_index']
    cosines = cosine_similarity(tf_idf_matrix[doc_index:doc_index+1], tf_idf_matrix)
    return list(np.argsort(-cosines, axis=1)[0][1:6]) #exclude the current doc from the results list

# add a a new column with a reference (row num) of similar 5 items 
df['similar_docs'] = df.apply(similiar_five_docs, axis=1)


MemoryError: 

In [211]:
def initials_clearance(words):                             # unify counts of returnning terms with initials
    initials = ["ה","ו","י","ב","כ","ל","מ","וב","ול","ומ","ש"]
    words_corpus = list(words_index.keys())

    def with_initials(word):                            # create list of intitial+word combinations
        combinations_to_check = []
        for initial in initials:
            combinations_to_check.append(initial+word)
        return combinations_to_check

    def without_initials(word):                         # create a list of word witout initial combinations
        combinations_to_check = []
        for initial in initials:
            if len(word) > 2 and word[0] == initial:
                combinations_to_check.append(word[1:])
        return combinations_to_check

    

    for word in words:
        justify_unification = [word]

        if words_index[word] > 0 :
                combinations_to_check = with_initials(word)     # add word+initials to combinations list  
                combinations_to_check += without_initials(word)    # add word - initials to combinations list

                for combination in combinations_to_check:
                    if combination in words_corpus:
                        justify_unification.append(combination)
        
        if len(justify_unification) > 2:     # we will unifiy key-values only if we have two exmaples of initial+word in the corpus

            justify_unification.sort(key = len)
            new_entry = justify_unification[0]    # the shortest combination (hopefully, the root word, to be used in the index)
            
            for combination in justify_unification:
                words_index[new_entry] += words_index[combination]
                words_index.pop(combination)

In [None]:
words = list(words_index.keys())
print("before: ",len(words))
initials_clearance(words)
print("after: ",len(words_index.keys()))

before:  93478


In [None]:
ignore_signs = ["\,","\:","\;","\.","\&","\$","\-","\=","\(","\)","\d+","\\n"]
"|".join(ignore_signs)