In [130]:
import sys, os
sys.path.append(os.path.abspath(os.path.join('..')))

import json
from collections import Counter

import numpy as np

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop_words = set(stopwords.words('english'))
stop_words_es = set(stopwords.words('spanish'))
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

**Get all places**

In [131]:
def get_places():
    places_file_dir = "/Users/pabloblanco/Desktop/Places/server/dummies/places.json"
    data = {}
    
    with open(places_file_dir) as file:
        data = json.load(file)
        
    places = []
    for places_in_category in data.values():
        for place in places_in_category:
            places.append({
                "name": place.get("name", ""),
                "description": place.get("description", "")
            })
        
    return places

In [132]:
places = get_places()

**Tokenize**

In [133]:
for place in places:
    place["name"] = word_tokenize(place["name"])
    place["description"] = word_tokenize(place["description"])

**Remove stopwords, convert to lower case, remove punctuation**

In [134]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [135]:
def preprocess(words: list) -> list:
    final = []
    punctuation = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n'"
    
    for word, tag in words:
        # lowercase
        word = word.lower()
        
        # remove stopwords
        if word in stop_words or word in stop_words_es:
            continue
            
        # remove punctuation
        for letter in punctuation:
            word = word.replace(letter, "")
        
        # remove one-letter words
        if len(word) <= 1:
            continue
        
        # lemmatize word
        pos = get_wordnet_pos(tag)
        if pos is None:
            continue
            
        word = lemmatizer.lemmatize(word, pos)
        
        final.append(word)
    
    return final

In [136]:
for place in places:
    # preprocess description
    words = nltk.pos_tag(place["description"])
    place["description"] = preprocess(words)
    
    # preprocess title
    words = nltk.pos_tag(place["name"])
    place["name"] = preprocess(words)

**Create dictionary of document frequency (DF)**

In [137]:
df = {}
for i in range(len(places)):
    tokens = places[i]["description"]
    tokens.extend(places[i]["name"])
    for w in tokens:
        try:
            df[w].add(i)
        except KeyError:
            df[w] = {i}

df doesn't need the id's of the documents where the word is present. Hence we just take the total number of times it occurs.

In [138]:
for word in df.keys():
    df[word] = len(df[word])

In [139]:
total_vocab = df.keys()
total_vocab_size = len(total_vocab)
N = len(places)

In [140]:
def doc_freq(word):
    freq = 0
    try:
        freq = df[word]
    except KeyError:
        pass
    return freq

In [149]:
tf_idf_text = {}
tf_idf_title = {}
alpha = 0.3

In [150]:
# for each place calculate tf-idf for description
for i, place in enumerate(places):
    
    # get tokens of that place
    tokens = place["description"]

    # get word counts per document and total words
    counter = Counter(tokens)
    words_count = len(tokens)
    
    # for each word in document
    for token in np.unique(tokens):    
        
        # get term-frequency
        tf = counter[token]/words_count
        
        # get inverse document frequency
        idf = np.log(N/(doc_freq(token)+1))
        
        tf_idf_text[i, token] = tf*idf * alpha

In [151]:
# for each place calculate tf-idf for title
for i, place in enumerate(places):
    
    # get tokens of that place
    tokens = place["description"]

    # get word counts per document and total words
    counter = Counter(tokens)
    words_count = len(tokens)
    
    # for each word in document
    for token in np.unique(tokens):    
        
        # get term-frequency
        tf = counter[token]/words_count
        
        # get inverse document frequency
        idf = np.log(N/(doc_freq(token)+1))
        
        tf_idf_title[i, token] = tf*idf * (1-alpha)

In [153]:
tf_idf = {}
for key in tf_idf_text.keys():
    tf_idf[key] = tf_idf_text[key] + tf_idf_title[key]

In [164]:
def matching_score(query):
    tokenized_query = word_tokenize(query)
    words = nltk.pos_tag(tokenized_query)
    tokens = preprocess(words)
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)
    
    results = []
    for i in query_weights[:10]:
        results.append(i[0])
    
    return results

In [167]:
matching_score("good drinks")

[4, 2, 21, 13, 63, 9, 0, 37, 54, 48]