# 0. Set Up

In [None]:
print("Importando librerias")
import random
from io import BytesIO
from requests import get
import concurrent.futures
import gzip
import json
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import time
from itertools import chain
print("* Librerias importadas * ")

Importando librerias


# `Input: City`

En **Archivos** deben existir dos archivos, con los nombres que espera `load_data`:
1. meta-CITY_NAME.json.gz
2. review-CITY_NAME_10.json.gz

In [None]:
# Name used to build the input file names, the image folder name and the
# 'state' column of the final DataFrame.
CITY_NAME = "Florida"

# 1. Load Metadata and Reviews

In [None]:
def parse(path, sample_ptge=100, take_sample=False):
    """Yield the JSON records stored one per line in a gzip file.

    Args:
        path: Path to a gzip-compressed file holding one JSON document per line.
        sample_ptge: Percentage (0-100) of the lines to yield when
            ``take_sample`` is true. Ignored otherwise.
        take_sample: When true, yield a uniform random sample of
            ``int(total_lines * sample_ptge / 100)`` lines instead of every line.

    Yields:
        The object decoded by ``json.loads`` for each selected line, in file
        order.
    """
    # The ``with`` block closes the file once the generator finishes or is
    # closed, instead of leaking the handle.
    with gzip.open(path, 'r') as g:
        if not take_sample:
            # tqdm shows the loading progress.
            for line in tqdm(g, desc="Cargando datos"):
                yield json.loads(line)
            return

        # First pass only counts the lines so the sample size can be computed.
        total_lines = sum(1 for _ in g)
        g.seek(0)  # rewind for the second pass
        sample_size = int(total_lines * sample_ptge / 100)
        sample_size = max(0, min(sample_size, total_lines))

        # Choose the line numbers up front so the sample covers the whole file
        # uniformly and has exactly ``sample_size`` records.
        chosen = set(random.sample(range(total_lines), sample_size))

        remaining = sample_size
        iterator = tqdm(g, total=total_lines, desc="Cargando datos")
        for i, line in enumerate(iterator):
            if remaining == 0:
                break  # nothing left to pick; skip the rest of the file
            if i in chosen:
                yield json.loads(line)
                remaining -= 1

def load_data(city, sample_ptge, R=True, M=True):
    """Load the metadata and/or a sample of the reviews of a city.

    Args:
        city: Name used to build the input file names
            (``meta-{city}.json.gz`` and ``review-{city}_10.json.gz``).
        sample_ptge: Percentage of the review lines to load.
        R: Whether to load the reviews.
        M: Whether to load the metadata.

    Returns:
        ``[metadata, reviews]``, where an entry is ``None`` if it was not
        requested. Returns ``None`` (after printing a message) when a
        requested file does not exist.
    """
    REVIEW_PATH = f"review-{city}_10.json.gz"
    METADATA_PATH = f"meta-{city}.json.gz"

    result = [None, None]

    # Only the files that will actually be read are required to exist.
    if R and not os.path.exists(REVIEW_PATH):
        print(f"El archivo {REVIEW_PATH} no existe.")
        return None

    if M and not os.path.exists(METADATA_PATH):
        print(f"El archivo {METADATA_PATH} no existe.")
        return None

    if M:
        print("Cargando Metada ... ")
        result[0] = list(parse(METADATA_PATH))
        print("Metadata Cargada!")

    if R:
        print("Cargando Reviews ... ")
        result[1] = list(
            parse(REVIEW_PATH, sample_ptge=sample_ptge, take_sample=True)
        )
        print("Reviews Cargadas!")

    print("Finalizado!")
    return result


Tienen que subir los archivos de reviews y metadata a Archivos. Luego ingresen el nombre de la ciudad correspondiente en `CITY_NAME`.

In [None]:
# Load the metadata and the reviews for CITY_NAME, passing sample_ptge=1.
metadata, reviews = load_data(CITY_NAME, 1)

In [None]:
# Number of review records that were loaded.
len(reviews)

In [None]:
# Collect every distinct category label found in the metadata, skipping
# businesses whose 'category' is None.
categories = list({
    label
    for business in metadata
    if business['category'] is not None
    for label in business['category']
})

In [None]:
# Keep the category labels related to restaurants. Matching 'restaurant'
# (case-insensitive) also covers the Spanish 'restaurante', which contains it.
restaurant_categories = [
    categ for categ in categories if 'restaurant' in categ.lower()
]

In [None]:
# Index the metadata records by their 'gmap_id' for direct lookup.
gmap_id_to_metadata = {entry['gmap_id']: entry for entry in metadata}

In [None]:
def verify_is_restaurant(categories):
    """Return True when any category label contains 'restaurant'.

    The match is case-insensitive. ``None`` (no category information) is
    treated as "not a restaurant".
    """
    if categories is None:
        return False
    return any('restaurant' in label.lower() for label in categories)

# Keep only the reviews whose business is categorized as a restaurant,
# looking the business up by the review's 'gmap_id'.
restaurant_reviews = [
    rev
    for rev in reviews
    if verify_is_restaurant(gmap_id_to_metadata[rev['gmap_id']]['category'])
]

In [None]:
# Build the DataFrame: one row per business, taken from the first review of
# that business that has both pictures and text.
data = {
    'user_id' : [],
    'gmap_id' : [],
    'rating' : [],
    'text' : [],
    'img_url' : [],
    'img_filename' : [],
    'state': [],
}

# gmap_ids already stored; a set keeps the duplicate check O(1) instead of
# scanning data['gmap_id'] for every review.
seen_gmap_ids = set()

for review in restaurant_reviews:
    gmap_id = review['gmap_id']
    pics = review['pics']
    text = review['text']

    # Skip reviews without pictures (None or an empty list) or without text.
    if not pics or text is None:
        continue

    # Only the first complete review of each business is kept.
    if gmap_id in seen_gmap_ids:
        continue

    # Read every field before appending so a malformed review cannot leave
    # the columns with different lengths.
    url_first_pic = pics[0]['url'][0]  # first URL of the first picture
    rating = review['rating']
    user_id = review['user_id']

    seen_gmap_ids.add(gmap_id)
    data['state'].append(CITY_NAME)
    data['user_id'].append(user_id)
    data['gmap_id'].append(gmap_id)
    data['rating'].append(rating)
    data['text'].append(text)
    data['img_url'].append(url_first_pic)
    data['img_filename'].append(None)  # filled in after the download

df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the restaurant DataFrame.
df.head()

In [None]:
# (rows, columns) of the restaurant DataFrame.
df.shape

In [None]:
# Columns needed to download each image: the business id and the image URL.
image_data = df[['gmap_id', 'img_url']]
# DataFrame indexes of the rows whose image could not be downloaded
# (appended to by download_image).
indexes_to_drop = []

# Source of the multithreaded download approach used below.
"""
fuente: https://medium.com/analytics-vidhya/how-to-download-images-faster-using-multithreading-in-python-a31110468770
"""

def download_image(row, folder_name, timeout=30):
    """Download the image of one row and save it as ``<folder_name>/<gmap_id>.png``.

    On success the saved path is written to ``df['img_filename']`` for the
    matching row. On failure (non-200 status or a network error) the row's
    index is appended to ``indexes_to_drop``. Nothing is downloaded when the
    target file already exists.

    Args:
        row: Object exposing ``gmap_id`` and ``img_url`` attributes.
        folder_name: Existing directory where the image is stored.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block a worker forever.
    """
    # Local import: only ``get`` is imported at the top of the file.
    from requests.exceptions import RequestException

    gmap_id, img_url = row.gmap_id, row.img_url
    # Locate the row in the global DataFrame to know which index to update.
    matches = df[(df.gmap_id == gmap_id) & (df.img_url == img_url)]
    index = matches.index[0]
    img_path = f'{folder_name}/{gmap_id}.png'

    if os.path.isfile(img_path):  # already downloaded
        print(f"- [⚠️] La imagen {img_path} ya esta en la carpeta")
        return

    try:
        response = get(img_url, timeout=timeout)
    except RequestException:
        # Network failures are handled like a failed download instead of
        # being lost inside the worker thread.
        response = None

    if response is not None and response.status_code == 200:
        curr_images = len(os.listdir(folder_name))
        print(f"- [✅] ({curr_images}/{image_data.shape[0]}) Descargando y guardando imagen {gmap_id}.png con exito")
        with open(img_path, 'wb') as img_file:
            img_file.write(response.content)

        df.at[index, 'img_filename'] = img_path
    else:
        print(f"- [❌] Imagen {img_url} no puede ser descargada")
        indexes_to_drop.append(index)

def create_folder(folder_name, max_workers=50):
    """Create ``folder_name`` and download every image of ``image_data`` into it.

    Downloads run in a thread pool, one ``download_image`` call per row. The
    elapsed time is printed at the end.

    Args:
        folder_name: Directory to create (if missing) and fill with images.
        max_workers: Number of download threads; tune it to the machine and
            network.
    """
    print(f"Creando carpeta '{folder_name}/'")
    os.makedirs(folder_name, exist_ok=True)

    t1 = time.perf_counter()

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(download_image, row, folder_name)
            for row in image_data.itertuples()
        ]
        # Exceptions raised inside a worker stay hidden in its future; report
        # them instead of dropping them silently.
        for future in futures:
            error = future.exception()
            if error is not None:
                print(f"- [❌] Error inesperado durante la descarga: {error!r}")

    t2 = time.perf_counter()

    print(f'Finished in {t2-t1} seconds')


# 2. Images Requests

In [None]:
# The image folder is named after the city.
FOLDER_NAME = CITY_NAME

In [None]:
# Create the folder and download the images into it.
create_folder(FOLDER_NAME)

In [None]:
# Note: probably not every request succeeds, so some images may not have
# been downloaded. This counts the files present in the folder.
len(os.listdir(FOLDER_NAME))

In [None]:
%%capture
# Zip the image folder. IPython expands {FOLDER_NAME} inside the shell command,
# so the city name no longer has to be edited by hand here.
!zip -r "{FOLDER_NAME}.zip" "{FOLDER_NAME}"

In [None]:
def image_in_folder(row):
    """Return True when ``<FOLDER_NAME>/<row.gmap_id>.png`` exists."""
    filename = row.gmap_id + '.png'
    # A single existence check avoids listing the whole folder for every row.
    return os.path.exists(os.path.join(FOLDER_NAME, filename))

# Flag, row by row, whether the image file is present in the folder.
df['image_in_folder'] = df.apply(image_in_folder, axis=1)

In [None]:
# Keep only the rows whose image is present in the folder.
df_filtered = df[df['image_in_folder']]

In [None]:
# Remove the helper column. Rebinding the result (instead of inplace=True)
# avoids pandas' SettingWithCopyWarning on the filtered slice of df.
df_filtered = df_filtered.drop(columns=['image_in_folder'])

In [None]:
# Sanity check: print every image file that df_filtered expects but that is
# missing from the folder. The folder is listed once, not once per row.
existing_files = set(os.listdir(FOLDER_NAME))
for row in df_filtered.itertuples():
    filename = row.gmap_id + '.png'
    if filename not in existing_files:
        print(filename)

# 3. Final Dataframe

In [None]:
# Share of the loaded reviews that ended up as usable restaurant rows.
total_reviews = len(reviews)
restaurant_rows = df_filtered.shape[0]
# The ratio is multiplied by 100 (not the list), and an empty sample is
# guarded to avoid dividing by zero.
restaurant_pct = restaurant_rows / total_reviews * 100 if total_reviews else 0.0
print('Reviews de restaurantes vs Total (%): ', restaurant_pct)
print('Cantidad total de reviews de restaurantes: ', restaurant_rows)
print('Cantidad total de reviews: ', total_reviews)

In [None]:
# Preview the first rows of the final DataFrame.
df_filtered.head()

In [None]:
# Save the final DataFrame as Reviews<FOLDER_NAME>.csv, without the index column.
df_filtered.to_csv(f'Reviews{FOLDER_NAME}.csv', index=False)