# 0. Set Up

In [2]:
print("Importando librerias")
import random
from io import BytesIO
from requests import get
import concurrent.futures
import gzip
import json
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import time
from itertools import chain
print("* Librerias importadas * ")

Importando librerias
* Librerias importadas * 


# `Input: City`

En **Archivos** debe existir dos tipos de archivos:
1. meta-CITY_NAME.json.gz
2. reviews-CITY_NAME.json.gz

In [23]:
CITY_NAME = "Washington"

# 1. Load Metadata and Reviews

In [24]:
def parse(path, sample_ptge=100, take_sample=False):
  g = gzip.open(path, 'r')

  if take_sample:
    total_lines = sum(1 for _ in g)
    g.seek(0)  # Reiniciamos puntero del archivo
    sample_size = int(total_lines * (sample_ptge) / 100)

    # Calculamos la probabilidad de seleccionar una línea.
    prob = sample_ptge / 100.0

    iterator = tqdm(g, total=total_lines, desc="Cargando datos")
    # Seleccionamos aleatoriamente por probabilidad -> mas eficiente que sacar una muestra aleatoria

    count = 0
    for i, l in enumerate(iterator):
        if random.random() < prob:  # Seleccionamos la línea con la probabilidad calculada

            yield json.loads(l)
            count += 1
            if count >= sample_size:
              break
  else:
    # Creamos un iterador tqdm para mostrar el progreso
    iterator = tqdm(g, desc="Cargando datos")

    for l in iterator:
        yield json.loads(l)

# Input: Nombre de la ciudad y el porcentaje de la muestra.
# R = booleano para cargar reviews
# M = booleanos para cargar metadata.
def load_data(city, sample_ptge, R=True, M=True):
    REVIEW_PATH = f"review-{city}_10.json.gz"
    METADATA_PATH = f"meta-{city}.json.gz"

    result = [None, None]

    if not os.path.exists(REVIEW_PATH):
        print(f"El archivo {REVIEW_PATH} no existe.")
        return

    if not os.path.exists(METADATA_PATH):
        print(f"El archivo {METADATA_PATH} no existe.")
        return

    if M:
        print("Cargando Metada ... ")
        metadata = list(parse(METADATA_PATH))
        result[0] = metadata
        print("Metadata Cargada!")

    if R:
        print("Cargando Reviews ... ")
        reviews = list(parse(REVIEW_PATH, sample_ptge=sample_ptge, take_sample=True))
        result[1] = reviews
        print("Reviews Cargadas!")

    print("Finalizado!")
    return result


Tienen que subir los archivos de reviews y metadata en Archivos. Luego ingresen el nombre de la ciudad que corresponde.

In [25]:
metadata, reviews = load_data(CITY_NAME, 10)

Cargando Metada ... 


Cargando datos: 121304it [00:11, 10677.92it/s]


Metadata Cargada!
Cargando Reviews ... 


Cargando datos: 100%|██████████| 10192020/10192020 [00:40<00:00, 251944.58it/s]

Reviews Cargadas!
Finalizado!





In [26]:
len(reviews)

1018538

In [27]:
# Obtenemos las categorias ...
categories = list(business['category'] for business in metadata if business['category'] is not None)
categories = list(chain.from_iterable(categories))
categories = list(set(categories))

In [28]:
# Obtenemos las categorias asociadas a restaurantes.
restaurant_categories = []
for categ in categories:
    categ_low = categ.lower()
    if 'restaurant' in categ_low or 'restaurante' in categ_low:
        restaurant_categories.append(categ)

In [29]:
gmap_id_to_metadata = {business['gmap_id'] : business for business in metadata}

In [30]:
def verify_is_restaurant(categories):
    # si no presenta informacion, no lo agregamos
    if categories is None:
        return False
    # si es restaurante, retornamos true
    for categ in categories:
        categ_low = categ.lower()
        if 'restaurant' in categ_low:
            return True
    # en otro caso, no se agrega
    return False

# Guardamos todas las reviews asociadas a restaurantes.
restaurant_reviews = []

for rev in reviews:
    gmap_id = rev['gmap_id']
    category_list = gmap_id_to_metadata[gmap_id]['category']
    is_restaurant = verify_is_restaurant(category_list) # booleano
    if is_restaurant:
        restaurant_reviews.append(rev)

In [31]:
# Creamos el dataframe asociado.
data = {
    'user_id' : [],
    'gmap_id' : [],
    'rating' : [],
    'text' : [],
    'img_url' : []
}

for review in restaurant_reviews:
    gmap_id = review['gmap_id']
    has_img = review['pics'] is not None
    has_text = review['text'] is not None
    has_all_info = has_img and has_text
    is_duplicated = gmap_id in data['gmap_id']

    if not has_all_info: continue

    if is_duplicated: continue

    # business pic
    pics = review['pics']
    first_pic = pics[0]
    url_first_pic = first_pic['url'][0] # select the first pic

    # business text
    text = review['text']

    # reviews's rating
    rating = review['rating']

    # user id
    user_id = review['user_id']

    data['user_id'].append(user_id)
    data['gmap_id'].append(gmap_id)
    data['rating'].append(rating)
    data['text'].append(text)
    data['img_url'].append(url_first_pic)

df = pd.DataFrame(data)

In [32]:
df.head()

Unnamed: 0,user_id,gmap_id,rating,text,img_url
0,115912106958485835917,0x549014f358189e23:0x7e6b6cb7dc73611,5,My favorite Cafe in Seattle for almost a decad...,https://lh5.googleusercontent.com/p/AF1QipOIp9...
1,117414084847255331315,0x5490695aceaf3363:0xd32e2f939b7713df,5,"Amazing food , clean,super friendly stuff, hap...",https://lh5.googleusercontent.com/p/AF1QipPsZP...
2,110436980949770086658,0x549057a7db40d70d:0xe085b6f6463ca410,5,Everything I've tried from Kim's has been deli...,https://lh5.googleusercontent.com/p/AF1QipMTBT...
3,103135870946137978485,0x549e192202a1824d:0xcfe740022f6bbc69,4,Great restaurant. Their Bruschettas are really...,https://lh5.googleusercontent.com/p/AF1QipPF1W...
4,115403328553351118144,0x54906ad592d30191:0xd9116a4a65dda919,5,👌🏼👌🏼,https://lh5.googleusercontent.com/p/AF1QipOFiu...


In [33]:
df.shape

(6238, 5)

In [34]:
image_data = df[['gmap_id', 'img_url']]
indexes_to_drop = []

"""
fuente: https://medium.com/analytics-vidhya/how-to-download-images-faster-using-multithreading-in-python-a31110468770
"""

def download_image(row, folder_name):
    gmap_id, img_url = row.gmap_id, row.img_url
    specific_data = df[(df.gmap_id == gmap_id) & (df.img_url == img_url)]
    index = specific_data.index[0]
    prefix = folder_name
    img_path = f'{prefix}/{gmap_id}.png'

    if os.path.isfile(img_path): # caso en que exista
        print(f"- [⚠️] La imagen {img_path} ya esta en la carpeta")
        return
    response = get(img_url)

    if response.status_code == 200:
        curr_images = len(os.listdir(folder_name))
        print(f"- [✅] ({curr_images}/{image_data.shape[0]}) Descargando y guardando imagen {gmap_id}.png con exito")
        img_bytes = response.content
        with open(img_path, 'wb') as img_file:
            img_file.write(img_bytes)

        df.at[index, 'img_url'] = img_path
    else:
        print(f"- [❌] Imagen {img_url} no puede ser descargada")
        indexes_to_drop.append(index)

def create_folder(folder_name):
    print(f"Creando carpeta '{folder_name}/'")
    os.makedirs(folder_name, exist_ok=True)

    t1 = time.perf_counter()

    with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor: # Pueden alterar la cantidad de workers segun el rendimiento
        for row in image_data.itertuples():
            executor.submit(download_image, row, folder_name)

    t2 = time.perf_counter()

    print(f'Finished in {t2-t1} seconds')


# 2. Images Requests

In [35]:
FOLDER_NAME = CITY_NAME

In [36]:
create_folder(FOLDER_NAME)

[1;30;43mSe truncaron las últimas líneas 5000 del resultado de transmisión.[0m
- [✅] (1211/6238) Descargando y guardando imagen 0x54858fa1fac6bd8d:0x3c4c1efa27394e52.png con exito
- [✅] (1215/6238) Descargando y guardando imagen 0x54906abba2d8b401:0xdac01c6c6cc04ff4.png con exito
- [✅] (1216/6238) Descargando y guardando imagen 0x549878d73cd55ed1:0x8ba3e5384317e4c4.png con exito
- [✅] (1216/6238) Descargando y guardando imagen 0x54906ab0f714f6c3:0xd1e3fda983b1a256.png con exito
- [✅] (1212/6238) Descargando y guardando imagen 0x54a1cadc75b1138d:0xd0989e32ab9c9f7d.png con exito
- [✅] (1212/6238) Descargando y guardando imagen 0x5485871e3896179d:0x608c46b7a90706de.png con exito
- [✅] (1220/6238) Descargando y guardando imagen 0x54923a0013f118fb:0x1e4cd46e6bc35b61.png con exito
- [✅] (1221/6238) Descargando y guardando imagen 0x54906cfe6cf18133:0x91d139316ba8f9b3.png con exito
- [✅] (1212/6238) Descargando y guardando imagen 0x548512c045e7f7a3:0x6c0a230dbe5feac7.png con exito
- [✅] (122

In [37]:
# ojo, probablemente no todos los request sean efectivos,
# por lo tanto, es probable que algunas imagenes no se descarguen
len(os.listdir(FOLDER_NAME))

6136

In [74]:
%%capture
!zip -r Washington.zip Washington # Cambiar al nombre de la ciudad

In [38]:
def image_in_folder(row):
    img_url = row.gmap_id + '.png'
    if img_url not in os.listdir(FOLDER_NAME):
        return False
    return True

df['image_in_folder'] = df.apply(image_in_folder, axis=1)

In [39]:
df_filtered = df[df.image_in_folder]

In [40]:
df_filtered.drop(columns=['image_in_folder'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=['image_in_folder'], inplace=True)


In [41]:
for row in df_filtered.itertuples():
    filename = row.gmap_id + '.png'
    if filename not in os.listdir(FOLDER_NAME):
        print(filename)

In [42]:
df_filtered['img_url'] = df_filtered['gmap_id'] + '.png'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['img_url'] = df_filtered['gmap_id'] + '.png'


# 3. Final Dataframe

In [45]:
print('Reviews de restaurantes vs Total (%): ', df_filtered.shape[0] / len(reviews  * 100))
print('Cantidad total de reviews de restaurantes: ', df_filtered.shape[0])
print('Cantidad total de reviews: ', len(reviews))

Reviews de restaurantes vs Total (%):  6.0243211348030215e-05
Cantidad total de reviews de restaurantes:  6136
Cantidad total de reviews:  1018538


In [46]:
df_filtered.head()

Unnamed: 0,user_id,gmap_id,rating,text,img_url
0,115912106958485835917,0x549014f358189e23:0x7e6b6cb7dc73611,5,My favorite Cafe in Seattle for almost a decad...,0x549014f358189e23:0x7e6b6cb7dc73611.png
1,117414084847255331315,0x5490695aceaf3363:0xd32e2f939b7713df,5,"Amazing food , clean,super friendly stuff, hap...",0x5490695aceaf3363:0xd32e2f939b7713df.png
2,110436980949770086658,0x549057a7db40d70d:0xe085b6f6463ca410,5,Everything I've tried from Kim's has been deli...,0x549057a7db40d70d:0xe085b6f6463ca410.png
3,103135870946137978485,0x549e192202a1824d:0xcfe740022f6bbc69,4,Great restaurant. Their Bruschettas are really...,0x549e192202a1824d:0xcfe740022f6bbc69.png
4,115403328553351118144,0x54906ad592d30191:0xd9116a4a65dda919,5,👌🏼👌🏼,0x54906ad592d30191:0xd9116a4a65dda919.png


In [47]:
df_filtered.to_csv(f'Reviews{FOLDER_NAME}.csv', index=False)

In [49]:
def save_reviews(filename, reviews, metadata):
    # Guardamos las reviews filtradas con su metadata.
    for review in reviews:
        gmap_id = review['gmap_id']
        rev_metadata = gmap_id_to_metadata[gmap_id]
        review['metadata'] = rev_metadata

    with gzip.open(f'{filename}_reviews_and_metadata.json.gz', 'wt') as f:
        json.dump(reviews, f)

In [50]:
save_reviews(FOLDER_NAME, restaurant_reviews, metadata)