In [1]:
from collections import defaultdict

import pandas as pd

# Per-word accumulator: slot 0 counts occurrences, slot 1 sums phrase weights.
word_freq = defaultdict(lambda: [0, 0])

# Numeric weight of each phrase, positionally aligned with text_list.
num_list = [200, 180, 170, 160, 160]

# Phrases to break into words.
text_list = [
    'france',
    'spain',
    'spain beaches',
    'france beaches',
    'spain best beaches',
]

# Credit every word of a phrase with one occurrence and the phrase's weight.
for text, num in zip(text_list, num_list):
    for word in text.split():
        entry = word_freq[word]
        entry[0] += 1
        entry[1] += num

# Accumulator slot -> column name.
columns = {0: 'abs_freq', 1: 'wtd_freq'}

# One row per word, largest weighted frequency first.  rel_value is the
# weighted frequency per occurrence; round() then rounds the numeric columns
# to whole numbers.
abs_wtd_df = (
    pd.DataFrame.from_dict(word_freq, orient='index')
    .rename(columns=columns)
    .sort_values('wtd_freq', ascending=False)
    .assign(rel_value=lambda frame: frame['wtd_freq'] / frame['abs_freq'])
    .round()
)

# For each frequency column, insert its share of the column total and the
# running share right after it.  The second position (4) already accounts for
# the two columns inserted by the first pass.
for loc, source, perc_name, cum_name in (
    (1, 'abs_freq', 'abs_perc', 'abs_perc_cum'),
    (4, 'wtd_freq', 'wtd_freq_perc', 'wtd_freq_perc_cum'),
):
    abs_wtd_df.insert(loc, perc_name, abs_wtd_df[source] / abs_wtd_df[source].sum())
    abs_wtd_df.insert(loc + 1, cum_name, abs_wtd_df[perc_name].cumsum())
# Last expression of the cell, so the notebook displays it: the frequency table
# with a background colour gradient (low/high are passed to background_gradient).
abs_wtd_df.style.background_gradient(low=0, high=.8)


Unnamed: 0,abs_freq,abs_perc,abs_perc_cum,wtd_freq,wtd_freq_perc,wtd_freq_perc_cum,rel_value
spain,3,0.333333,0.333333,510,0.335526,0.335526,170
beaches,3,0.333333,0.666667,490,0.322368,0.657895,163
france,2,0.222222,0.888889,360,0.236842,0.894737,180
best,1,0.111111,1.0,160,0.105263,1.0,160


In [7]:
import json, re
from collections import defaultdict
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer



#https://next.obudget.org/api/query?query=select%20*%20from%20quarterly_contract_spending_reports%20Limit%2010


# Parse the local JSON file; the records are read from its 'rows' key.
# NOTE(review): presumably a saved response of the query URL noted above - confirm.
with open("example_json.json", encoding="utf-8") as file:
    raw_data = json.loads(file.read())


# Raw frame built from the parsed records.
df = pd.DataFrame(data=raw_data['rows'])


# Tokens to leave out of the word index (none configured).
ignore_list = list()


# Punctuation signs that are deleted before splitting: , : - ; . & $
# Written as a raw-string character class so no backslash escape is left to
# Python's string-literal parser.
_IGNORED_SIGNS = re.compile(r"[,:\-;.&$]")


def tokenize(doc):
    """Split ``doc`` into word tokens after deleting a fixed set of signs.

    The characters ``, : - ; . & $`` are removed outright (not replaced by a
    blank), so ``"c-d"`` becomes the single token ``"cd"``.  Any other
    character, including other punctuation, is kept.  The remaining text is
    split on runs of whitespace.

    Parameters
    ----------
    doc : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance.  Never contains empty strings; the
        list is empty when ``doc`` holds no words.
    """
    cleaned = _IGNORED_SIGNS.sub("", doc)
    # split() without a separator treats any run of whitespace as one break
    # and ignores leading/trailing whitespace, so repeated blanks (including
    # those left behind by a deleted sign) cannot produce "" tokens.
    return cleaned.split()



def defaultvalue():
    """Return 0, the starting count for a word that has not been counted yet."""
    return int()

def index_words(texts_list):
    """Count the words of every text into the module-level ``index`` mapping.

    Each text is split with ``tokenize``; every resulting word that is not in
    the module-level ``ignore_list`` increments ``index[word]`` by one.
    ``index`` is updated in place and must exist before the call.  Nothing is
    returned.
    """
    for text in texts_list:
        kept_words = (word for word in tokenize(text) if word not in ignore_list)
        for word in kept_words:
            index[word] += 1
            


# Tokenize every description and store the token list next to its row.
# Only the 'description' column is needed, so it is mapped directly instead of
# going through a row-wise DataFrame.apply.
data = df['description']
df['tokenized'] = data.map(tokenize)

# Corpus-wide word counter: index[word] -> number of occurrences over all
# descriptions.  index_words() fills this module-level mapping in place, so it
# has to be created before the call.
index = defaultdict(defaultvalue)
index_words(data)

# Document-term frame: one row per description, one zero-initialised column per
# distinct word.  It reuses df's row labels because the per-word counts derived
# from df['tokenized'] are assigned by label; with independent labels any
# mismatch would silently turn counts into NaN.
terms_index = pd.DataFrame(0, index=df.index, columns=list(index))


def count_word(l, word):
    """Return how many times ``word`` occurs in ``l``, as reported by ``l.count``."""
    occurrences = l.count(word)
    return occurrences

# Fill each word's column with the number of times the word occurs in each
# tokenized description.
token_lists = df['tokenized']
for word in index:
    terms_index[word] = token_lists.apply(count_word, args=(word,))


# Fit the transformer on the count frame, then convert the same counts into
# tf-idf rows (norm="l2" is passed to the transformer).
tfidf = TfidfTransformer(norm="l2")
tf_idf_matrix = tfidf.fit(terms_index).transform(terms_index)

print(tf_idf_matrix.todense())

[[0.31032948 0.28028886 0.56057772 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.254737   0.         0.         ... 0.         0.42224874 0.42224874]
 [0.254737   0.         0.         ... 0.         0.42224874 0.42224874]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [8]:
# Dimensions of the tf-idf matrix obtained by transforming terms_index.
print(tf_idf_matrix.shape)

(100, 483)


In [10]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between the first row of the tf-idf matrix (the 0:1 slice
# keeps it two-dimensional) and every row of the matrix.
similiar = cosine_similarity(tf_idf_matrix[0:1], tf_idf_matrix)