# Preditor de Review

- A partir do texto do review, e conjunto de categorias, tenta predizer o score.
- Não funciona :'(

### References
- https://www.tensorflow.org/hub/modules/google/universal-sentence-encoder/2


In [1]:
# Install the latest Tensorflow version.
!pip install --quiet tensorflow-hub seaborn

In [2]:
from util import dataset
from collections import defaultdict, OrderedDict
from datetime import datetime
import itertools
import os

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Reduce logging output.
tf.logging.set_verbosity(tf.logging.ERROR)

In [3]:
def sort_dict_by_key(data, reverse=False):
    return OrderedDict([
        (k, v)
        for v, k in sorted([(v,k) for k, v in data.items()], reverse=reverse)
    ])

In [4]:
business_categories = {
    business['business_id']: business['categories']
    for business in dataset.read('business')
    if 'Restaurants' in business['categories'] or 'Food' in business['categories']
}

categories = defaultdict(int)
for biz_categories in business_categories.values():
    for category in biz_categories:
        categories[category] += 1
        
categories = sort_dict_by_key(categories, reverse=True)
categories

OrderedDict([('Restaurants', 57172),
             ('Food', 27117),
             ('Nightlife', 8394),
             ('Bars', 8030),
             ('Coffee & Tea', 6936),
             ('Sandwiches', 6912),
             ('Fast Food', 6812),
             ('American (Traditional)', 6659),
             ('Pizza', 6602),
             ('Burgers', 5126),
             ('Breakfast & Brunch', 5023),
             ('Italian', 4550),
             ('Mexican', 4419),
             ('Specialty Food', 4304),
             ('Chinese', 4247),
             ('American (New)', 4230),
             ('Bakeries', 3509),
             ('Grocery', 3380),
             ('Cafes', 3056),
             ('Desserts', 2991),
             ('Event Planning & Services', 2774),
             ('Ice Cream & Frozen Yogurt', 2613),
             ('Shopping', 2607),
             ('Japanese', 2566),
             ('Chicken Wings', 2538),
             ('Seafood', 2369),
             ('Salad', 2350),
             ('Sushi Bars', 2156),
         

In [5]:
category_scores = defaultdict(lambda : [0,0,0,0,0])
for i, review in enumerate(dataset.read('review')):   
    if review['business_id'] not in business_categories:
        continue
    for category in business_categories[review['business_id']]:
        category_scores[category][review['stars'] - 1] += 1

#Filter categories with a reasonable amount of data
#category_scores = {
#    category: scores
#    for category, scores in category_scores.items()
#    if sum(scores) > 100
#}
mean_category_scores = {
    category: sum((star+1)*count for star, count in enumerate(scores)) / sum(scores)
    for category, scores in category_scores.items()
}
sort_dict_by_key(mean_category_scores)

OrderedDict([('Boat Repair', 1.0),
             ('Damage Restoration', 1.0),
             ('Payroll Services', 1.0),
             ('Clowns', 1.3333333333333333),
             ('Accountants', 1.375),
             ('Roofing', 1.6666666666666667),
             ('Office Equipment', 1.6699346405228759),
             ('Aircraft Repairs', 1.7272727272727273),
             ('Airsoft', 1.7272727272727273),
             ('Flooring', 1.7647058823529411),
             ('Optometrists', 1.79375),
             ('Estate Liquidation', 2.0),
             ('Cosmetic Surgeons', 2.0526315789473686),
             ('Taxis', 2.0526315789473686),
             ('Air Duct Cleaning', 2.076923076923077),
             ('Airport Lounges', 2.1176470588235294),
             ('Haunted Houses', 2.130434782608696),
             ('Paintball', 2.130434782608696),
             ('Cosmetic Dentists', 2.142857142857143),
             ('General Dentistry', 2.142857142857143),
             ('Medical Transportation', 2.1428571428

In [6]:
main_categories = [
    category
    for category, business_count in categories.items()
    if business_count > 100 and sum(category_scores[category]) > 100
]
int_to_category = dict(enumerate(main_categories))
category_to_int = {v:k for k,v in int_to_category.items()}
category_to_int

{'Active Life': 97,
 'Afghan': 162,
 'African': 152,
 'American (New)': 15,
 'American (Traditional)': 7,
 'Arts & Entertainment': 46,
 'Asian Fusion': 33,
 'Automotive': 53,
 'Bagels': 70,
 'Bakeries': 16,
 'Barbeque': 36,
 'Bars': 3,
 'Beauty & Spas': 79,
 'Beer': 29,
 'Beer Bar': 95,
 'Bistros': 153,
 'Brasseries': 155,
 'Breakfast & Brunch': 10,
 'Breweries': 76,
 'British': 148,
 'Bubble Tea': 87,
 'Buffets': 57,
 'Burgers': 9,
 'Butcher': 122,
 'Cafes': 18,
 'Cajun/Creole': 107,
 'Canadian (New)': 32,
 'Candy Stores': 98,
 'Car Wash': 163,
 'Caribbean': 69,
 'Casinos': 160,
 'Caterers': 30,
 'Cheese Shops': 133,
 'Cheesesteaks': 129,
 'Chicken Shop': 75,
 'Chicken Wings': 24,
 'Chinese': 14,
 'Chocolatiers & Shops': 82,
 'Cocktail Bars': 52,
 'Coffee & Tea': 4,
 'Coffee Roasteries': 144,
 'Comfort Food': 61,
 'Convenience Stores': 35,
 'Cosmetics & Beauty Supply': 94,
 'Creperies': 112,
 'Cupcakes': 96,
 'Custom Cakes': 110,
 'Dance Clubs': 140,
 'Delicatessen': 151,
 'Delis': 31

In [7]:
def _categories_to_vec(categories):
    ret = [-1] * len(category_to_int)
    for category in categories:
        if category in category_to_int:
            ret[category_to_int[category]] = +1
    return ret

business_category_vector = {
    business_id: _categories_to_vec(biz_categories)
    for business_id, biz_categories in business_categories.items()
}
business_category_vector

{'marrqbEXmEgyY90Or2vqKw': [-1,
  1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1,
  -1

In [8]:
def gen_data_batches(batch_size=50):
    for chunk_num, batch in enumerate(dataset.read('review', batch_size=batch_size, filter=lambda x: x['business_id'] in business_category_vector)):
        chunk_stars      = [review['stars'] for review in batch]
        chunk_texts      = [review['text'] for review in batch]
        chunk_categories = [business_category_vector[review['business_id']] for review in batch]
        yield (
            np.array(chunk_stars, dtype=np.float32),
            np.array(chunk_texts, dtype=str),
            np.array(chunk_categories, dtype=np.float32),
        )
        
next(gen_data_batches(3))

(array([5., 1., 2.], dtype=float32),
 array(['I love this place! My fiance And I go here atleast once a week. The portions are huge! Food is amazing. I love their carne asada. They have great lunch specials... Leticia is super nice and cares about what you think of her restaurant. You have to try their cheese enchiladas too the sauce is different And amazing!!!',
        'Terrible. Dry corn bread. Rib tips were all fat and mushy and had no flavor. If you want bbq in this neighborhood go to john mulls roadkill grill. Trust me.',
        'Back in 2005-2007 this place was my FAVORITE thai place EVER. I\'d go here ALLLLL the time. I never had any complaints. Once they started to get more known and got busy, their service started to suck and their portion sizes got cut in half. I have a huge problem with paying MORE for way less food. The last time I went there I had the Pork Pad se Ew and it tasted good, but I finished my plate and was still hungry. I used to know the manager here and she 

In [15]:
def go_tensorflor():
    def build_predictor(stars_input, embedded_text, categories_input, is_train):
        #TODO: layer_norm?
        embedded_categories = categories_input
        embedded_categories = tf.layers.dropout(embedded_categories, rate=0.2, training=is_train)
        embedded_categories = tf.layers.dense(embedded_categories, units=64, activation=tf.nn.relu)
        embedded_categories = tf.layers.dropout(embedded_categories, rate=0.2, training=is_train)
        embedded_categories = tf.layers.dense(embedded_categories, units=32, activation=tf.nn.relu)
        embedded_all = tf.concat([embedded_text, embedded_categories], axis=-1)

        prediction = embedded_all
        prediction = tf.layers.dropout(prediction, rate=0.2, training=is_train)
        prediction = tf.layers.dense(prediction, units=256, activation=tf.nn.relu)
        prediction = tf.layers.dropout(prediction, rate=0.2, training=is_train)
        prediction = tf.layers.dense(prediction, units=64, activation=tf.nn.relu)
        prediction = tf.layers.dropout(prediction, rate=0.2, training=is_train)
        prediction = tf.layers.dense(prediction, units=1, activation=tf.nn.sigmoid)
        prediction = tf.layers.dense(prediction, units=1, activation=None)
        #prediction = .1*prediction + .9 * stars_input
        
        prediction_error = tf.subtract(prediction, stars_input, name='prediction_error')
        tf.summary.histogram('expected', stars_input)
        tf.summary.histogram('prediction', prediction)
        tf.summary.histogram('error', prediction_error)

        mse_loss = tf.reduce_mean(prediction_error**2, name='MSE')
        mae_loss = tf.reduce_mean(tf.abs(prediction_error), name='MAE')
        tf.summary.scalar('mse', mse_loss)
        tf.summary.scalar('mae', mae_loss)
        
        #for i in range(1, 5+1):
        #    tf.summary.histogram('prediction-%d-star' % (i), tf.boolean_mask(prediction, tf.equal(stars_input, 5)))
        
        return prediction, mse_loss, mae_loss

    tensorboard_dir = 'tensorboard_logs/' + datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S')
    os.makedirs(tensorboard_dir, exist_ok=True)
    with tf.Graph().as_default(), tf.Session() as session,tf.summary.FileWriter(tensorboard_dir, tf.get_default_graph()) as tensorboard_writer :
        with tf.name_scope("input"):
            is_train = tf.placeholder(tf.bool, shape=(), name='is_train')
            stars_input = tf.placeholder(tf.float32, shape=(None,), name="stars_input")
            text_input = tf.placeholder(tf.string, shape=(None,), name="text_input")
            categories_input = tf.placeholder(tf.float32, shape=(None, len(category_to_int)), name="categories_input")
            
        with tf.name_scope('text_embedding'):
            with tf.device('/cpu:0'):
                embed_sentence = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
                embedded_text = embed_sentence(text_input)

        with tf.name_scope("model"):
            prediction, mse, mae = build_predictor(stars_input, embedded_text, categories_input, is_train)

        train_step = tf.train.AdamOptimizer(learning_rate=.1).minimize(mse)

        iteration = tf.Variable(0, name='iteration', dtype=tf.int32, expected_shape=())
        iteration_inc = tf.assign_add(iteration, 1, name='iteration_increment')

        all_summaries = tf.summary.merge_all()

        session.run([tf.global_variables_initializer(), tf.tables_initializer()])

        for i, (batch_stars, batch_texts, batch_categories) in enumerate(gen_data_batches(batch_size=100)):
            if i % 50 == 0:
                # Run valiation and display metrics
                it_ret, mae_ret, mse_ret, prediction_ret, summaries_ret, z = session.run(
                    [iteration, mae, mse, prediction, all_summaries,
                    tf.boolean_mask(prediction, tf.equal(stars_input, 2))
                    ],
                    feed_dict={
                        stars_input: batch_stars,
                        text_input: batch_texts,
                        categories_input: batch_categories,
                        is_train: False
                    })
                print('Batch {it_ret:5} -- MSE={mse_ret:.3f} MAE={mae_ret:.3f}'.format(
                    it_ret=it_ret, 
                    mse_ret=mse_ret,
                    mae_ret=mae_ret))
                #print(z)
                tensorboard_writer.add_summary(summaries_ret, it_ret)
                
            #Do actual trainning
            session.run(
                [iteration_inc, train_step],
                feed_dict={
                    stars_input: batch_stars,
                    text_input: batch_texts,
                    categories_input: batch_categories,
                    is_train: True
                })
go_tensorflor()

Batch     0 -- MSE=9.698 MAE=2.809
Batch    50 -- MSE=1.754 MAE=1.183
Batch   100 -- MSE=2.140 MAE=1.294
Batch   150 -- MSE=2.006 MAE=1.252
Batch   200 -- MSE=3.484 MAE=1.623
Batch   250 -- MSE=2.257 MAE=1.326
Batch   300 -- MSE=2.235 MAE=1.278
Batch   350 -- MSE=2.339 MAE=1.361
Batch   400 -- MSE=1.862 MAE=1.155
Batch   450 -- MSE=1.889 MAE=1.208
Batch   500 -- MSE=2.322 MAE=1.398
Batch   550 -- MSE=2.087 MAE=1.261
Batch   600 -- MSE=2.529 MAE=1.411
Batch   650 -- MSE=1.626 MAE=1.084
Batch   700 -- MSE=2.107 MAE=1.211
Batch   750 -- MSE=2.468 MAE=1.421
Batch   800 -- MSE=1.973 MAE=1.254
Batch   850 -- MSE=2.132 MAE=1.265
Batch   900 -- MSE=2.601 MAE=1.440
Batch   950 -- MSE=2.254 MAE=1.321
Batch  1000 -- MSE=2.077 MAE=1.246
Batch  1050 -- MSE=1.790 MAE=1.181
Batch  1100 -- MSE=2.188 MAE=1.298
Batch  1150 -- MSE=2.312 MAE=1.307
Batch  1200 -- MSE=2.070 MAE=1.179
Batch  1250 -- MSE=2.170 MAE=1.292
Batch  1300 -- MSE=2.210 MAE=1.239
Batch  1350 -- MSE=2.049 MAE=1.245
Batch  1400 -- MSE=2

Batch 11750 -- MSE=2.038 MAE=1.233
Batch 11800 -- MSE=1.235 MAE=0.910
Batch 11850 -- MSE=1.709 MAE=1.007
Batch 11900 -- MSE=2.070 MAE=1.162
Batch 11950 -- MSE=2.411 MAE=1.414
Batch 12000 -- MSE=2.177 MAE=1.273
Batch 12050 -- MSE=1.624 MAE=1.046
Batch 12100 -- MSE=1.939 MAE=1.277
Batch 12150 -- MSE=2.036 MAE=1.199
Batch 12200 -- MSE=2.565 MAE=1.485
Batch 12250 -- MSE=1.614 MAE=1.002
Batch 12300 -- MSE=1.197 MAE=0.818
Batch 12350 -- MSE=1.614 MAE=1.098
Batch 12400 -- MSE=2.426 MAE=1.322
Batch 12450 -- MSE=2.087 MAE=1.150
Batch 12500 -- MSE=1.491 MAE=1.016
Batch 12550 -- MSE=1.794 MAE=1.095
Batch 12600 -- MSE=1.520 MAE=1.039
Batch 12650 -- MSE=1.500 MAE=0.984
Batch 12700 -- MSE=1.036 MAE=0.840
Batch 12750 -- MSE=1.465 MAE=1.007
Batch 12800 -- MSE=2.043 MAE=1.169
Batch 12850 -- MSE=2.309 MAE=1.219
Batch 12900 -- MSE=1.557 MAE=1.036
Batch 12950 -- MSE=1.898 MAE=1.183
Batch 13000 -- MSE=1.845 MAE=1.210
Batch 13050 -- MSE=1.826 MAE=1.116
Batch 13100 -- MSE=2.119 MAE=1.178
Batch 13150 -- MSE=1

Batch 23500 -- MSE=0.995 MAE=0.854
Batch 23550 -- MSE=2.011 MAE=1.156
Batch 23600 -- MSE=1.508 MAE=1.083
Batch 23650 -- MSE=1.411 MAE=0.994
Batch 23700 -- MSE=1.894 MAE=1.228
Batch 23750 -- MSE=1.980 MAE=1.182
Batch 23800 -- MSE=2.084 MAE=1.283
Batch 23850 -- MSE=1.506 MAE=0.990
Batch 23900 -- MSE=1.865 MAE=1.006
Batch 23950 -- MSE=1.459 MAE=0.944
Batch 24000 -- MSE=1.546 MAE=1.040
Batch 24050 -- MSE=2.727 MAE=1.395
Batch 24100 -- MSE=2.744 MAE=1.312
Batch 24150 -- MSE=2.143 MAE=1.197
Batch 24200 -- MSE=1.471 MAE=1.047
Batch 24250 -- MSE=1.208 MAE=0.923
Batch 24300 -- MSE=2.685 MAE=1.326
Batch 24350 -- MSE=2.123 MAE=1.224
Batch 24400 -- MSE=1.436 MAE=1.031
Batch 24450 -- MSE=1.508 MAE=1.019
Batch 24500 -- MSE=2.068 MAE=1.232
Batch 24550 -- MSE=1.323 MAE=0.916
Batch 24600 -- MSE=2.316 MAE=1.224
Batch 24650 -- MSE=2.806 MAE=1.304
Batch 24700 -- MSE=2.001 MAE=1.129
Batch 24750 -- MSE=1.469 MAE=1.034
Batch 24800 -- MSE=1.942 MAE=1.226
Batch 24850 -- MSE=1.790 MAE=1.134
Batch 24900 -- MSE=1

Batch 35250 -- MSE=1.986 MAE=1.162
Batch 35300 -- MSE=1.671 MAE=1.130
Batch 35350 -- MSE=1.402 MAE=1.013
Batch 35400 -- MSE=3.027 MAE=1.579
Batch 35450 -- MSE=1.705 MAE=1.112
Batch 35500 -- MSE=1.977 MAE=1.228
Batch 35550 -- MSE=1.432 MAE=0.875
Batch 35600 -- MSE=1.871 MAE=1.183
Batch 35650 -- MSE=2.588 MAE=1.406
Batch 35700 -- MSE=1.670 MAE=1.037
Batch 35750 -- MSE=1.852 MAE=1.163
Batch 35800 -- MSE=1.757 MAE=1.131
Batch 35850 -- MSE=2.054 MAE=1.075
Batch 35900 -- MSE=1.114 MAE=0.862
Batch 35950 -- MSE=1.333 MAE=0.960
Batch 36000 -- MSE=2.018 MAE=1.249
Batch 36050 -- MSE=1.666 MAE=1.092
Batch 36100 -- MSE=1.884 MAE=1.132
Batch 36150 -- MSE=0.685 MAE=0.619
Batch 36200 -- MSE=2.929 MAE=1.453
Batch 36250 -- MSE=1.974 MAE=1.163
Batch 36300 -- MSE=1.959 MAE=1.219
Batch 36350 -- MSE=0.946 MAE=0.806
Batch 36400 -- MSE=1.264 MAE=1.008
Batch 36450 -- MSE=1.762 MAE=1.172
Batch 36500 -- MSE=1.307 MAE=0.868
Batch 36550 -- MSE=2.281 MAE=1.368
Batch 36600 -- MSE=1.857 MAE=1.177
Batch 36650 -- MSE=1