In [86]:
import sys, os
sys.path.append(os.path.abspath(os.path.join('..')))

import json
from collections import Counter

import numpy as np
import pandas as pd

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop_words = set(stopwords.words('english'))
stop_words_es = set(stopwords.words('spanish'))
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

**Get all places**

In [2]:
def get_places():
    places_file_dir = "/Users/pabloblanco/Desktop/Places/server/dummies/places.json"
    data = {}
    
    with open(places_file_dir) as file:
        data = json.load(file)
        
    places = []
    for places_in_category in data.values():
        for place in places_in_category:
            places.append({
                "name": place.get("name", ""),
                "description": place.get("description", "")
            })
        
    return places

In [42]:
def read_all_places():
    """
    Reads all places from json file and creates dict object containing them
    """
    places_file = "/Users/pabloblanco/Desktop/Places/server/ml/places.json"
    data = {}

    with open(places_file) as file:
        data = json.load(file)

    places = data["places"]

    return places

In [99]:
places = read_all_places()

**Tokenize**

In [100]:
for place in places:
    place["name"] = word_tokenize(place["name"])
    place["description"] = word_tokenize(place["description"])

**Remove stopwords, convert to lower case, remove punctuation**

In [101]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [102]:
def preprocess(words: list) -> list:
    final = []
    punctuation = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n'"
    
    for word, tag in words:
        # lowercase
        word = word.lower()
        
        # remove stopwords
        if word in stop_words or word in stop_words_es:
            continue
            
        # remove punctuation
        for letter in punctuation:
            word = word.replace(letter, "")
        
        # remove one-letter words
        if len(word) <= 1:
            continue
        
        # lemmatize word
        pos = get_wordnet_pos(tag)
        if pos is None:
            continue
            
        word = lemmatizer.lemmatize(word, pos)
        
        final.append(word)
    
    return final

In [103]:
for place in places:
    # preprocess description
    words = nltk.pos_tag(place["description"])
    place["description"] = preprocess(words)
    
    # preprocess title
    words = nltk.pos_tag(place["name"])
    place["name"] = preprocess(words)

**Create dictionary of document frequency (DF)**

In [104]:
df = {}
for i in range(len(places)):
    tokens = places[i]["description"]
    tokens.extend(places[i]["name"])
    for w in tokens:
        try:
            df[w].add(i)
        except KeyError:
            df[w] = {i}

df doesn't need the id's of the documents where the word is present. Hence we just take the total number of times it occurs.

In [105]:
for word in df.keys():
    df[word] = len(df[word])

In [106]:
total_vocab = list(df.keys())
total_vocab_size = len(total_vocab)
N = len(places)

In [107]:
def doc_freq(word):
    freq = 0
    try:
        freq = df[word]
    except KeyError:
        pass
    return freq

In [108]:
tf_idf_text = {}
tf_idf_title = {}
alpha = 0.3

In [109]:
# for each place calculate tf-idf for description
for i, place in enumerate(places):
    
    # get tokens of that place
    tokens = place["description"]

    # get word counts per document and total words
    counter = Counter(tokens)
    words_count = len(tokens)
    
    # for each word in document
    for token in np.unique(tokens):    
        
        # get term-frequency
        tf = counter[token]/words_count
        
        # get inverse document frequency
        idf = np.log(N/(doc_freq(token)+1))
        
        tf_idf_text[i, token] = tf*idf * alpha

In [110]:
# for each place calculate tf-idf for title
for i, place in enumerate(places):
    
    # get tokens of that place
    tokens = place["name"]

    # get word counts per document and total words
    counter = Counter(tokens)
    words_count = len(tokens)
    
    # for each word in document
    for token in np.unique(tokens):    
        
        # get term-frequency
        tf = counter[token]/words_count
        
        # get inverse document frequency
        idf = np.log(N/(doc_freq(token)+1))
        
        tf_idf_title[i, token] = tf*idf * (1-alpha)

In [111]:
# Create total tf_idf of all words of all documents
tf_idf = {}
for key in tf_idf_text.keys():
    tf_idf[key] = tf_idf_text[key] + tf_idf_title.pop(key, 0)
for key in tf_idf_title.keys():
    tf_idf[key] = tf_idf_title.get(key, 0)

In [112]:
def matching_score(query):
    tokenized_query = word_tokenize(query)
    words = nltk.pos_tag(tokenized_query)
    tokens = preprocess(words)
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] += tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)
    
    results = []
    for i in query_weights[:10]:
        results.append(i[0])
    
    return results

In [113]:
matching_score("best sushi")

[46, 41, 36, 71, 32, 54, 31, 12, 64, 6]

**Implement cosine similarity**

In [129]:
matrix = np.zeros((N, total_vocab_size))
for word in tf_idf:
    try:
        index = total_vocab.index(word[1])
        matrix[word[0]][index] = tf_idf[word]
    except KeyError:
        pass

**Create DataFrame with data**

In [130]:
data_frame = pd.DataFrame(matrix)

# Set index as objectId
data_frame["objectId"] = pd.Series([x["id"] for x in places])
data_frame = data_frame.set_index("objectId", drop=True)

In [131]:
new_cols = {}
for i, word in enumerate(total_vocab):
    new_cols.setdefault(i, word)

data_frame.rename(columns=new_cols)

Unnamed: 0_level_0,rustic,mediterraneanstyle,dining,room,checkered,tablecloth,specialise,pizza,pasta,trattoria,...,accept,crema,di,crowded,still,snack,mega,hotdog,reasonable,popeye
objectId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RikovbyDNm,0.098846,0.098846,0.087788,0.098846,0.098846,0.098846,0.098846,0.073856,0.061038,1.367365,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
jsBHY6yy0H,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
WkXldk0zqP,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
6vwlMUoQHC,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
qzknzFsH7g,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
t6WNqDjjES,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
yOTDvpqdsP,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
oYozICIEez,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
uWnJDy7xKr,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.038832,0.884512,0.884512,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [133]:
def preprocess_sentence(sentence: str) -> list:
    """Preprocess a sentence by next steps:
        1. Getting tokens
        2. Turning them to lower case
        3. Removing stopwords (english and spanish)
        4. Removing punctuation
        5. Removing one letter words
        6. Lemmatizing the words to get root

    Args:
        sentence (str): [description]

    Returns:
        list: [description]
    """
    tokens = word_tokenize(sentence)
    words = nltk.pos_tag(tokens)

    final = []
    punctuation = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n'"

    for word, tag in words:
        # lowercase
        word = word.lower()

        # remove stopwords
        if word in stop_words or word in stop_words_es:
            continue

        # remove punctuation
        for letter in punctuation:
            word = word.replace(letter, "")

        # remove one-letter words
        if len(word) <= 1:
            continue

        # lemmatize word
        pos = get_wordnet_pos(tag)
        if pos is None:
            continue

        word = lemmatizer.lemmatize(word, pos)

        final.append(word)

    return final

In [134]:
def gen_vector(text: str):

    tokens = preprocess_sentence(text)
    
    V = np.zeros((len(total_vocab)))
    
    counter = Counter(tokens)
    words_count = total_vocab_size

    query_weights = {}
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N)/(df+1))

        try:
            index = total_vocab.index(token)
            V[index] = tf*idf
        except:
            pass
    return V

In [135]:
def cosine_similarity(a, b):
    cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    return cos_sim

In [137]:
def cosine_search(query):
    d_cosines = []
    
    query_vector = gen_vector(query)
    
    for index, row in data_frame.iterrows():
        d_cosines.append(cosine_similarity(np.array(query_vector), (np.array(row))))
        
    out = np.array(d_cosines).argsort()
    out = out[::-1][:10]
    
    return [data_frame.index[x] for x in out]

In [138]:
cosine_search("best sushi and place. Good beer")

['XFwBIOBqFk',
 '6GDrn0l3nd',
 'tHtbefDJ6o',
 'WkXldk0zqP',
 'olBnrE0XAv',
 'yOTDvpqdsP',
 'FpGYTCEEf0',
 'XYtnuOvOrU',
 'CTx0UwvcUT',
 'Iu8Kqcbq9M']

In [136]:
data_frame.index

Index(['RikovbyDNm', 'jsBHY6yy0H', 'WkXldk0zqP', '6vwlMUoQHC', 'qzknzFsH7g',
       '4PWS5AlhBS', 'AhoMtdIvVQ', '67G7wVmxiH', 'tgbWcG5bcP', 'sx23GQkGS9',
       'uCg01ne0rU', 'mqTLx15zKL', 'olBnrE0XAv', 'hIAC9EYb18', 'zYeJ9hXCmV',
       'hlmD7YXRSZ', 'dFXSLkA5GH', 'FRzeo68LSt', '7MZDYdeQ90', 'ar0HuYZEJq',
       't4JVRsh7F2', 'uocGLUaNid', 'uPDdTM1vjO', 'HpJkHNZ5u8', 'nz1YWNPJHA',
       'hCOeNgcDVU', 'gnnVYOd2Vx', 'i3LndVCgud', 'NzPSN7A2aa', '1znr7xjJmc',
       'zpbaAHJPh3', 'reTyTA1JlH', 'Iu8Kqcbq9M', 'qFnXU8tVmq', '7rzHCPMaMn',
       '60aXlIln71', 'tHtbefDJ6o', 'N6CgzqclRR', 'MHXR6u2JJQ', 'WXt6KBXtIF',
       'HuadGVieXe', '6GDrn0l3nd', 'SbrxQe0Y7u', 'DMMb9IhAew', 'kMyV93ysyk',
       'HBa1yXo31Z', 'XFwBIOBqFk', '7a0cDFsv8P', 'sZJTdHPQEG', 'CMmwPQEuwD',
       '1a7o93N171', 'z2Xwml9PqV', '6oo32HUEgT', 'NKqrZHcwOK', '2roHEcjYRG',
       'O7pO1kcNPa', 'kGqDgwqvOs', 'CTx0UwvcUT', 'lfbqwdy2Dv', 'CHFk6CCry5',
       'B7LNsbs2UD', 'V04masGKPI', 'FpGYTCEEf0', '0m7LCaGjm8', 'XYtnuOvOrU',