In [4]:
import pandas as pd 
import numpy as np 
import re 
import matplotlib.pyplot as plt 
import statistics 
from nltk.tokenize import TreebankWordTokenizer


In [5]:
#SQL set up
import duckdb, sqlalchemy
%load_ext sql

%config SqlMagic.autopandas = True
%config SqlMagic.feedback = False
%config SqlMagic.displaycon = False

%sql duckdb:///:memory:

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


In [6]:
treebank_tokenizer = TreebankWordTokenizer()

df = pd.read_csv('./archive/simplified_coffee.csv')
#placeholder until we get database

In [11]:
%sql reviews << SELECT review FROM df

Returning data to local variable reviews


In [12]:
%sql names << SELECT name FROM df

Returning data to local variable names


In [32]:
print(type(names))

<class 'pandas.core.frame.DataFrame'>


In [14]:
#hard code query for sake of demo
query = 'citrus chocolate bean'

In [23]:
def tokenize(text):
    """Returns a list of words that make up the text.
    Params: {text: String}
    Returns: List
    """
    return re.findall('[a-z]+', text.lower())

def tokenize_reviews(reviews, names):
    '''
    Returns a dictionary with all reviews and their tokenized words
    '''
    tokens = set()
    review_dict = dict()
    for i in range(len(reviews)):
        print(reviews[0])
        review_dict[names[i]] = tokenize(reviews[i])
    return review_dict


In [24]:
review_dict = tokenize_reviews(reviews, names)

1246


KeyError: 0

In [9]:
def jaccard(x,y):
    num =set(x).intersection(set(y))
    denom = len(x) + len(y) - len(num)
    return float(len(num) / denom)

def build_cbeans_sims_jac(n_cbeans, input_query_cats, input_data):
    """Returns a cbeans_sims_jac matrix of size (num_cbeans,num_cbeans) where for (i,j) :
        [i,j] should be the jaccard similarity between the category sets for cbeans i and j
        such that cbeans_sims_jac[i,j] = cbeans_sims_jac[j,i]. 
        
    
    Params: {n_bean: Integer, the number of coffeebeans,
            input_data: List<Dictionary>, a list of dictionaries where each dictionary 
                     represents the review_data including the script and the metadata of each movie script}
            input_query_cats: user's input query categories
    Returns: Numpy Array 
    """
    cbeans_sims_jac = np.ones((n_cbeans))
    for cbean_idx in range(n_mov):
        cat1 = input_data[cbean_idx]['categories']
        jac = jaccard(cat1, input_query_cats)
        cbeans_sims_jac[movie1_idx, movie2_idx] = jac
                
    return cbeans_sims_jac


In [10]:
#cosim
#assume reviews is a dict with bean: tokenized review
def build_inverted_index(review_dict):
    inverted_index = dict() #dictionary with word: list of tuples
    doc_id = 0
    for bean, review in review_dict.keys(): #go thru each dict 
        #create a temp dict for count of words in tokenized_dict
        temp_dict = {}
        for token in review:
            temp_dict[token] = temp_dict.get(token, 0) + 1 #get count of each token
        
        #go thru every word in temp_dict
        for word, count in temp_dict.items():
            if word in inverted_index:
                inverted_index[word].append( (doc_id, count))
            else: 
                inverted_index[word] = list() #initialize as list first idk if necessary
                inverted_index[word].append((doc_id, count))
        #move onto next doc
        doc_id += 1 
        
        #now add counts to overall dictionary 

    return inverted_index


In [11]:
def compute_idf(inv_idx, n_docs, min_df=10, max_df_ratio=0.95):
    """ Compute term IDF values from the inverted index."""
    
    idf_vals = dict()
    max_thresh = max_df_ratio * n_docs
    for term, docs in inv_idx.items():
        #print(type(docs))
        len_docs = len(docs)
        if len_docs<=max_thresh and len_docs>=10:
            pre_log_idf = (n_docs/(1+len_docs))
            idf = math.log2(pre_log_idf)
            idf_vals[term] = idf
    return idf_vals


In [12]:
def compute_doc_norms(index, idf, n_docs):
    """ Precompute the euclidean norm of each document.
    
    norms: np.array, size: n_docs
        norms[i] = the norm of document i.
    """

    
    norms = np.zeros(n_docs)
    for word in index:
        
        if word in idf:
            idf_weight = idf[word]
        else:
            idf_weight = 0 #prune to 0
        for doc in index[word]:
            tf_weight = doc[1]
            doc_id = doc[0]
            norms[doc_id] += (tf_weight * idf_weight) ** 2
    norms = np.sqrt(norms)
     #go thru all possible docs, find the word and its invertex index, 
     #keep sum of product of tf number of times the word i appears in document j * idf[word]
    return norms 


In [13]:
def accumulate_dot_scores(query_word_counts, index, idf):
    """ Perform a term-at-a-time iteration to efficiently compute the numerator term of cosine similarity across multiple documents.
   
    Returns
    =======
    
    doc_scores: dict
        Dictionary mapping from doc ID to the final accumulated score for that doc
    """

    doc_scores = dict() 
    
    for word, qf in query_word_counts.items(): 
        if word in index:
            documents = index[word]
            for doc in documents: 
                doc_id, tf = doc[0], doc[1]

                if word not in idf: 
                    idf_val = 0
                else:
                    idf_val = idf[word]

                acc = idf_val * qf * tf * idf_val
                if doc_id not in doc_scores:
                    doc_scores[doc_id] = acc
                else:
                    doc_scores[doc_id] = doc_scores[doc_id] + acc
    return doc_scores


In [36]:
inv_idx = build_inverted_index(review_dict) #TO DO : CHANGE IT TO A LIST

idf = compute_idf(inv_idx, len(review_dict),
                  min_df=10,
                  max_df_ratio=0.1)  # documents are very short so we can use a small value here
                                     # examine the actual DF values of common words like "the"
                                     # to set these values

inv_idx = {key: val for key, val in inv_idx.items()
           if key in idf}            # prune the terms left out by idf

doc_norms = compute_doc_norms(inv_idx, idf, len(review_dict))


TypeError: string indices must be integers

In [37]:
def index_search(query, index, idf, doc_norms, score_func=accumulate_dot_scores, tokenizer=TreebankWordTokenizer):
    """ Search the collection of documents for the given query
   
    Returns
    =======
    
    results, list of tuples (score, doc_id)
        Sorted list of results such that the first element has
        the highest score, and `doc_id` points to the document
        with the highest score.

    """
    query = query.lower() 
    query_tokens = tokenizer.tokenize(query)
    query_word_counts = dict()
    

    for word in query_tokens:
        query_word_counts[word] = query_word_counts.get(word, 0) + 1 
    results = list() 
    doc_scores = score_func(query_word_counts, index, idf)
    #q_norms
    q_norm = 0 
    for term, freq in query_word_counts.items():
        if term in idf:
            idf_weight = idf[term]
        else:
            idf_weight = 0 #prune to 0
        q_norm += ((freq  * idf_weight)  ** 2)
    q_norm = math.sqrt(q_norm)
    
    for doc_id, doc_score in doc_scores.items():
        cossim_val = doc_score / (doc_norms[doc_id] *  q_norm)
        results.append((cossim_val,doc_id))
        
    results = sorted(results, key=lambda x: x[0], reverse=True)
    return results[0:10] #return first top ten similar


In [38]:
output_scores, output_ids = index_search(query, inv_idx, idf, doc_norms) #score, doc id 
rel_beans = name_arry[output_ids]
rel_beans_revs = rev_array[output_ids]
#rel_beans should be top 10 most similar cbeans & reviews & what frontend displays 

NameError: name 'inv_idx' is not defined