In [2]:
%%capture
!wget https://www.dropbox.com/scl/fi/uznju0fkgnwz1yi4cmgol/review-Vermont.json.gz?rlkey=4l3mithogu5c08x5lxe6pbb6t&st=wcprbsjo&dl=0
!wget https://www.dropbox.com/scl/fi/16vy3q077mz01n4r2nml1/meta-Vermont.json.gz?rlkey=zu5joaocmaqdg0p17oj14z5lz&st=kih98lc6&dl=0
!mv review-Vermont.json.gz?rlkey=4l3mithogu5c08x5lxe6pbb6t review-Vermont.json.gz
!mv meta-Vermont.json.gz?rlkey=zu5joaocmaqdg0p17oj14z5lz meta-Vermont.json.gz
!pip install cornac

In [3]:
%%capture
!pip install pyreclab

In [224]:
import pandas as pd
import numpy as np
import json
import gzip
import cornac
import pyreclab
import random
from sklearn.model_selection import train_test_split
import scipy.sparse as sparse

In [5]:
def parse(path):
  g = gzip.open(path, 'r')
  for l in g:
    yield json.loads(l)

In [6]:
metadata = list(parse("meta-Vermont.json.gz"))
reviews = list(parse("review-Vermont.json.gz"))

In [7]:
data = {
    'user_id' : [],
    'gmap_id' : [],
    'rating' : [],
    'text' : [],
    'img_url' : []
}

for review in reviews:
    has_img = review['pics'] is not None
    has_text = review['text'] is not None

    if has_img and has_text:
        # business pic
        pics = review['pics']
        first_pic = pics[0]
        url_first_pic = first_pic['url'][0] # select the first pic

        # business text
        text = review['text']

        # reviews's rating
        rating = review['rating']

        # user id
        user_id = review['user_id']

        # business id (restaurant id)
        gmap_id = review['gmap_id']

        data['user_id'].append(user_id)
        data['gmap_id'].append(gmap_id)
        data['rating'].append(rating)
        data['text'].append(text)
        data['img_url'].append(url_first_pic)

df = pd.DataFrame(data)


In [18]:
df_uir = df[['user_id', 'gmap_id', 'rating']]

In [141]:
SIZE = 0.7
n_rows = int(df_uir.shape[0] * SIZE)

In [156]:
X_train, X_test = train_test_split(df_uir, test_size=0.2, random_state=42)

In [158]:
print(X_test.shape[0] + X_train.shape[0], df_uir.shape[0])

27167 27167


In [159]:
X_train.to_csv('reviews_train.csv', index=False)
X_test.to_csv('reviews_test.csv', index=False)

# Models

## Most popular

In [277]:
model_mp = pyreclab.MostPopular(
    dataset= 'reviews_train.csv',
    header = True,
    usercol = 0,
    itemcol = 1,
    ratingcol = 2
)

In [278]:
model_mp.train(progress=True)

In [279]:
reclist, map, ndcg = model_mp.testrec(
    input_file = 'reviews_test.csv',
    header = True,
    usercol = 0,
    itemcol = 1,
    ratingcol = 2,
    topn = 10,
    output_file = 'results.csv',
    relevance_threshold = 3.5,
    includeRated = False
)

In [280]:
def find_user(user_id):
    for review in reviews:
        review_uid = review['user_id']
        if review_uid != user_id: continue
        user_name = review['name']
        return user_name


In [281]:
users = list(reclist.keys())
user_1 = random.choice(users)
user_name = find_user(user_1)
user_reclist = reclist[user_1]
print(user_name)

ryan lyford


### Ejemplo

In [282]:
def show_recommendations(user, reclist):
    rec_business_cols = {
        'name' : [],
        'description' : [],
        'url' : [],
        'categories': []

    }
    for d in metadata:
        if d['gmap_id'] not in reclist: continue
        categories = ", ".join(d['category'])

        rec_business_cols['name'].append(d['name'])
        rec_business_cols['description'].append(d['description'])
        rec_business_cols['url'].append(d['url'])

        rec_business_cols['categories'].append(categories)

    reclist_df = pd.DataFrame(rec_business_cols)
    return reclist_df

In [283]:
show_recommendations(user_1, user_reclist)

Unnamed: 0,name,description,url,categories
0,Jay Peak Resort,"Ideal for skiing, this mountainside resort als...",https://www.google.com/maps/place//data=!4m2!3...,"Ski resort, Golf course, Ice skating rink, Res..."
1,Bennington Battle Monument,This 306-ft. stone obelisk offers an elevator ...,https://www.google.com/maps/place//data=!4m2!3...,"Monument, Historical place, Tourist attraction"
2,Okemo Mountain Resort,Resort with ample skiing terrain & modern amen...,https://www.google.com/maps/place//data=!4m2!3...,Resort hotel
3,Killington Ski Area,"Boasting the largest vertical drop, this vast ...",https://www.google.com/maps/place//data=!4m2!3...,"Ski resort, Tourist attraction"
4,Green Mountain National Forest,"Backcountry area with waterfalls, cliffs, pond...",https://www.google.com/maps/place//data=!4m2!3...,"National forest, Tourist attraction"
5,Ben & Jerry’s,"Iconic Vermont-based ice cream parlor chain, k...",https://www.google.com/maps/place//data=!4m2!3...,"Ice cream shop, Bakery, Candy store, Caterer, ..."
6,Smugglers' Notch Resort,Modern condo units in a scenic mountain settin...,https://www.google.com/maps/place//data=!4m2!3...,"Resort hotel, Ski resort, Summer camp, Water park"
7,Church Street Marketplace,"Local shops, eateries & entertainment in an ou...",https://www.google.com/maps/place//data=!4m2!3...,Market
8,Waterfront Park,Community park on the lake featuring a boardwa...,https://www.google.com/maps/place//data=!4m2!3...,"Park, Tourist attraction"
9,Prohibition Pig,"Hip joint with 20+ craft beers on tap, old-fas...",https://www.google.com/maps/place//data=!4m2!3...,"American restaurant, Brewery, Cocktail bar, Re..."


### Metricas

In [284]:
def metrics(user_id, reclist):
    parameters = (user_id, reclist, 10, 3.5, False)
    auc = round(model_mp.AUC(*parameters), 6)
    ndcg = model_mp.nDCG(*parameters)
    print(f'MAP@10: {map}')
    print(f'DCG@10: {ndcg}')


In [285]:
metrics(user_1, user_reclist)

MAP@10: 0.03595287842781492
DCG@10: 0.0


## Random

In [264]:
def precision_at_k(r, k):
    assert 1 <= k <= r.size
    return (np.asarray(r)[:k] != 0).mean()

def average_precision_at_k(r, k):
    r = np.asarray(r)
    score = 0.
    for i in range(min(k, r.size)):
        score += precision_at_k(r, i + 1)
    return score / k

def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.

def idcg_at_k(k):
    return dcg_at_k(np.ones(k), k)

def ndcg_at_k(r, k, max_relevant):
    idcg = idcg_at_k(min(k, max_relevant))
    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

def evaluate_model(random_reclist, n):
    mean_ap = 0.
    mean_ndcg = 0.
    for u in user_items_test.keys():
        reclist = random_reclist[u]
        rel_vector = [np.isin(user_items_test[u], reclist, assume_unique=True).astype(int)]
        mean_ap += average_precision_at_k(rel_vector, n)
        mean_ndcg += ndcg_at_k(rel_vector, n, len(user_items_test[u]))

    mean_ap /= len(user_items_test)
    mean_ndcg /= len(user_items_test)

    return mean_ap, mean_ndcg


In [265]:
user_items_train = {}
itemset = set()

for row in X_train.itertuples():
    if row[1] not in user_items_train:
        user_items_train[row[1]] = []

    user_items_train[row[1]].append(row[2])
    itemset.add(row[2])

itemset = np.sort(list(itemset))


user_items_test = {}

for row in X_test.itertuples():
    if row[1] not in user_items_test:
        user_items_test[row[1]] = []

    user_items_test[row[1]].append(row[2])

In [266]:
random_recommendations = {}

for user_id in X_test['user_id'].to_list():
    reclist = set(random.sample(X_train['gmap_id'].to_list(), k=10))
    random_recommendations[user_id] = reclist

### Ejemplo

In [267]:
user_2 = random.choice(list(user_items_test.keys()))

In [268]:
show_recommendations(user_2, random_recommendations[user_2])

Unnamed: 0,name,description,url,categories
0,Lye Brook Falls Trail,,https://www.google.com/maps/place//data=!4m2!3...,Hiking area
1,Mulligan's Irish Pub,,https://www.google.com/maps/place//data=!4m2!3...,Pub
2,Bennington Battle Monument,This 306-ft. stone obelisk offers an elevator ...,https://www.google.com/maps/place//data=!4m2!3...,"Monument, Historical place, Tourist attraction"
3,Killington Bear Mountain Lodge,,https://www.google.com/maps/place//data=!4m2!3...,Ski resort
4,Maple Sugar & Vermont Spice,All-day breakfasts & homey lunch classics prep...,https://www.google.com/maps/place//data=!4m2!3...,"Breakfast restaurant, Restaurant"
5,Camel's Hump State Park,This primitive timber management & wildlife pr...,https://www.google.com/maps/place//data=!4m2!3...,"State park, Park, Tourist attraction"
6,The Skinny Pancake,"Local chain featuring breakfast, crepes, panin...",https://www.google.com/maps/place//data=!4m2!3...,Restaurant
7,Quechee Gorge Bridge,,https://www.google.com/maps/place//data=!4m2!3...,"Bridge, Historical landmark, Tourist attraction"
8,Ben & Jerry’s,"Iconic Vermont-based ice cream parlor chain, k...",https://www.google.com/maps/place//data=!4m2!3...,"Ice cream shop, Bakery, Candy store, Caterer, ..."
9,The Friendly Toast,,https://www.google.com/maps/place//data=!4m2!3...,"Brunch restaurant, Restaurant"


### Metricas

In [271]:
mean_ap, mean_ndcg = evaluate_model(random_recommendations, n=10)
print("nDCG@10", mean_ndcg)
print("MAP@10", mean_ap)


nDCG@10 0.0
MAP@10 0.0
