In [41]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [42]:
metadata = pd.read_csv('datasets/raw/metadata-sitios.csv')

In [43]:
# Contabilizar numero de filas duplicadas tanto para 'name' como 'adress'
numero_filas_duplicadas = metadata.duplicated(['name', 'address']).sum()

In [44]:
# Eliminar filas duplicadas tanto para 'name' como 'adress'
metadata = metadata.drop_duplicates(['name', 'address'])

In [45]:
# Contabilizar filas nulas
cantidad_filas_nulas = metadata.isnull().sum()
cantidad_filas_nulas

name                  5
address               0
gmap_id               0
latitude              0
longitude             0
category           1368
avg_rating            0
num_of_reviews        0
hours             49366
url                   0
dtype: int64

In [46]:
# Filtrar columnas deseadas
metadata = metadata[['name', 'address', 'gmap_id', 'latitude', 'longitude','category', 'avg_rating', 'num_of_reviews', 'hours', 'url']]

In [47]:
# Elimnar nulos para la columna 'category'
metadata = metadata.dropna(subset=['category'])

In [48]:
lengths = metadata['category'].apply(lambda x: len(x))
conteo = lengths.value_counts()
conteo

category
20     11187
15     10863
14     10692
16      8469
22      7510
       ...  
310        1
320        1
281        1
289        1
271        1
Name: count, Length: 289, dtype: int64

In [49]:
metadata.head()

Unnamed: 0,name,address,gmap_id,latitude,longitude,category,avg_rating,num_of_reviews,hours,url
0,NTK OUTDOOR,"NTK OUTDOOR, 2315 NW 107th Ave #1B18, Miami, F...",0x88d9beb4fe0532c1:0xef0555c169299d6,25.795204,-80.366038,['Corporate office'],5.0,35,"[['Thursday', '9AM–6PM'], ['Friday', '9AM–6PM'...",https://www.google.com/maps/place//data=!4m2!3...
1,Cruises Inc. - Connie Stewart,"Cruises Inc. - Connie Stewart, 6602 52nd Ln, P...",0x88c2e49b79f06c31:0x4ed8c8ce27e926e0,27.832187,-82.704805,['Cruise agency'],5.0,2,"[['Thursday', '9AM–5PM'], ['Friday', '9AM–5PM'...",https://www.google.com/maps/place//data=!4m2!3...
3,First Impressions Barbershop Inc.,"First Impressions Barbershop Inc., 577 Deltona...",0x88e711bd2244fe3b:0x8406dd780f0574d1,28.877581,-81.281276,['Barber shop'],4.0,8,"[['Thursday', '8AM–8PM'], ['Friday', '8AM–8PM'...",https://www.google.com/maps/place//data=!4m2!3...
4,"Brian Shaheen, MD","Brian Shaheen, MD, 2421 Thomas Dr, Panama City...",0x8893863ea87bd5dd:0x9383ebf973e74abb,30.159982,-85.752277,"['Family practice physician', 'General practit...",4.2,18,"[['Thursday', '8AM–5PM'], ['Friday', '8AM–5PM'...",https://www.google.com/maps/place//data=!4m2!3...
5,Cape Seafood Shack,"Cape Seafood Shack, 603 Del Prado Blvd S, Cape...",0x88db4147b1d9e6f3:0x943dbd10a92ba1b1,26.641377,-81.940545,['Restaurant'],5.0,1,,https://www.google.com/maps/place//data=!4m2!3...


In [50]:
def extraer_nombre_y_direccion(address, name):
    import re
    # Verificar si address o name es NaN
    if pd.isna(address) or pd.isna(name):
        return None
    
    # Escapar caracteres especiales en el nombre
    escaped_name = re.escape(name)
    
    # Utilizamos expresiones regulares para encontrar el nombre seguido de una coma y un espacio en la dirección
    match = re.search(rf"\b{escaped_name}, \b", address)
    if match:
        # Si se encuentra el nombre seguido de una coma y un espacio en la dirección, extraemos la parte de la dirección después de eso
        parte_direccion = address[match.end():].strip()
        # Devolvemos la parte de la dirección después del nombre y la coma seguida de un espacio
        return parte_direccion
    
# Aplicar la funcion a la columna address corrigiendo los valores existentes
metadata['address'] = metadata.apply(lambda x: extraer_nombre_y_direccion(x['address'], x['name']), axis=1)

# Visualizar resultado
metadata.head()

Unnamed: 0,name,address,gmap_id,latitude,longitude,category,avg_rating,num_of_reviews,hours,url
0,NTK OUTDOOR,"2315 NW 107th Ave #1B18, Miami, FL 33172",0x88d9beb4fe0532c1:0xef0555c169299d6,25.795204,-80.366038,['Corporate office'],5.0,35,"[['Thursday', '9AM–6PM'], ['Friday', '9AM–6PM'...",https://www.google.com/maps/place//data=!4m2!3...
1,Cruises Inc. - Connie Stewart,"6602 52nd Ln, Pinellas Park, FL 33781",0x88c2e49b79f06c31:0x4ed8c8ce27e926e0,27.832187,-82.704805,['Cruise agency'],5.0,2,"[['Thursday', '9AM–5PM'], ['Friday', '9AM–5PM'...",https://www.google.com/maps/place//data=!4m2!3...
3,First Impressions Barbershop Inc.,"577 Deltona Blvd suite #9, Deltona, FL 32725",0x88e711bd2244fe3b:0x8406dd780f0574d1,28.877581,-81.281276,['Barber shop'],4.0,8,"[['Thursday', '8AM–8PM'], ['Friday', '8AM–8PM'...",https://www.google.com/maps/place//data=!4m2!3...
4,"Brian Shaheen, MD","2421 Thomas Dr, Panama City, FL 32408",0x8893863ea87bd5dd:0x9383ebf973e74abb,30.159982,-85.752277,"['Family practice physician', 'General practit...",4.2,18,"[['Thursday', '8AM–5PM'], ['Friday', '8AM–5PM'...",https://www.google.com/maps/place//data=!4m2!3...
5,Cape Seafood Shack,"603 Del Prado Blvd S, Cape Coral, FL 33990",0x88db4147b1d9e6f3:0x943dbd10a92ba1b1,26.641377,-81.940545,['Restaurant'],5.0,1,,https://www.google.com/maps/place//data=!4m2!3...


In [51]:
# Eliminamos el campo 'pais' de address ya que solo trabajamos con Estados Unidos
metadata['address'] = metadata['address'].str.replace(', United States', '').str.strip()

In [52]:
# Dividir la cadena de dirección en partes utilizando la coma como delimitador
address_parts = metadata['address'].str.rsplit(', ', n=2, expand=True)

# Asignar las partes a las columnas correspondientes
metadata['address'] = address_parts[0]
metadata['city'] = address_parts[1]
metadata['state'] = address_parts[2]

# Dividir la cadena de dirección en partes utilizando la coma como delimitador
state_parts = metadata['state'].str.split(' ', expand=True)

# Asignar las partes a las columnas correspondientes
metadata['state'] = state_parts[0]
metadata['postal_code'] = state_parts[1]

In [53]:
# Renombrar columna 'avg_rating' a 'stars'
metadata.rename(columns={'avg_rating': 'stars'}, inplace=True)

# Renombrar columna 'num_of_reviews' a 'review_count'
metadata.rename(columns={'num_of_reviews': 'review_count'}, inplace=True)

# Reordenar columnas
metadata = metadata[['gmap_id','name', 'address','city','state','category', 'postal_code','latitude','longitude','stars','review_count','hours','url']]

In [54]:
# Convertir valores a float
metadata['latitude'] = metadata['latitude'].astype(float)
metadata['longitude'] = metadata['longitude'].astype(float)
metadata['stars'] = metadata['stars'].astype(float)

In [55]:
# Filtrar las filas donde el estado es Florida
metadata = metadata.loc[metadata['state'] == 'FL']

In [56]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 207866 entries, 0 to 215615
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   gmap_id       207866 non-null  object 
 1   name          207866 non-null  object 
 2   address       207866 non-null  object 
 3   city          207866 non-null  object 
 4   state         207866 non-null  object 
 5   category      207866 non-null  object 
 6   postal_code   207857 non-null  object 
 7   latitude      207866 non-null  float64
 8   longitude     207866 non-null  float64
 9   stars         207866 non-null  float64
 10  review_count  207866 non-null  int64  
 11  hours         163874 non-null  object 
 12  url           207866 non-null  object 
dtypes: float64(3), int64(1), object(9)
memory usage: 22.2+ MB


# Cambio de filtros y re categorizacion

In [57]:
# Lista de palabras clave gatronomia
gastronomia = ['Restaurant','Restaurants','Café','Coffeehouse','Bistro','Tavern','Buffet','Brewpub','Pub','Brasserie','Specialty Coffee','Pub','Churrería','Dinner','Dining','Teahouse','Tea Room','Coffee']

# Filtrar los registros que contienen al menos una palabra clave en 'tags'
metadata_filtrado = metadata[metadata['category'].str.contains('|'.join(gastronomia))]

In [58]:
# Eliminar nulos para 'category'
metadata_filtrado = metadata_filtrado.dropna(subset=['category'])

In [59]:
metadata_filtrado['category'].unique()

array(["['Restaurant']", "['Buffet restaurant']",
       "['Coffee shop', 'Bubble tea store', 'Cafe']", ...,
       "['Health food restaurant', 'Brazilian restaurant', 'Coffee shop', 'Nuevo Latino restaurant', 'Vegan restaurant']",
       "['Restaurant', 'Chicken shop', 'Fried chicken takeaway', 'Delivery Restaurant']",
       "['Gift shop', 'Cafe', 'Restaurant']"], dtype=object)

In [60]:
# Función para encontrar palabras clave en la columna 'tags' y simplificar clasificacion restaurantes
def encontrar_palabra_clave(tags):

    # Lista de palabras clave que determian categorias mas abarcativas/comunes
    palabras_clave = ['Latin','Asian','European','Others','American','Italian','Chinese','Mexican']
    # Diccionario para determinar categoria segun palabras claves secundarias
    sub_tags = {'European' : ['Italian','Greek','Pasta','French','German','Spanish','Catalan','Basque','Galician','Sicilian','English','Scottish','Welsh','Irish','Dutch','Swedish','Polish','Portuguese','Irish Pub'],
                'Italian' : ['Pasta'],
                'Latin' : ['Empanadas','Lomo','Asado','Locro','Dulce de Leche','Choripan','Argentinian','Peruvian','Ceviche','Pisco','Mexican','Tacos','Burritos','Enchiladas','Colombian','Arepas','Brazilian'],
                'Asian' : ['Sushi','Ramen','Pan-Asian','Korean','Kimchi','Japanese','Sashimi','Tempura','Indian','Thai','Vietnamese'],
                'American' : ['Steak','Fast Food','Hamburger','Fast','Chicken wings','Cheesesteak','Sandwich','Fried chicken','Buffet','Restaurant','Brasserie'],
                'Cafe': ['Tea','Dinner','Cafe','Coffee'],
                'Beverages':['Beer','Caterer','Cocktail','Pub','Brewpub'],
                'Others' : ['Healthy','Vegetarian','Vegan','Casual','Seafood','Bistro','African','Middle Eastern','Hummus','Falafel','Shawarma','Tabbouleh','Israeli','Shakshuka','Lebanese','Iranian''Jewish','Turkish']
                }

 
    # Buscar en palabras_clave
    for palabra in palabras_clave:
        if palabra in tags:
            return palabra
    
    # Buscar en sub_tags
    for clave, valor in sub_tags.items():
        if any(sub_categoria in tags for sub_categoria in valor):
            return clave
        
    # Para aquellos registros donde se etiquete como desconocido, revisar columna tags para agregar etiquetas al diccionario sub_categories y poder reclasificarlo
    return 'Desconocido'


In [61]:
# Renombrar columna 'category' a 'tags'
metadata_filtrado = metadata_filtrado.copy()
metadata_filtrado.rename(columns={'category': 'tags'}, inplace=True)

In [62]:
# Encontrar el índice de la columna 'tags'
tags_index = metadata_filtrado.columns.get_loc('tags')

# Insertar la nueva columna 'category' antes de 'tags'
metadata_filtrado.insert(tags_index - 1, 'category', metadata_filtrado['tags'].apply(encontrar_palabra_clave))

# Completar la columna category con la palabra clave encontrada
metadata_filtrado.loc[:, 'category'] = metadata_filtrado['tags'].apply(encontrar_palabra_clave)


In [63]:
metadata_filtrado['category'].value_counts()

category
American     6724
Cafe         1229
Beverages    1033
Asian         268
Italian       260
Latin         168
Mexican       144
Chinese       138
European      122
Others         31
Name: count, dtype: int64

In [64]:
# Control de etiquetas de categoria 'Desconocido' para recategorizar segun corresponda
metadata_filtrado[metadata_filtrado['category'] == 'Desconocido']['tags'].unique()

array([], dtype=object)

In [65]:
# eliminar aquellos registros donde category se 'Desconocido'
metadata_filtrado = metadata_filtrado[metadata_filtrado['tags'] != 'Desconocido']


In [66]:
lengths_filtrado = metadata_filtrado['category'].apply(lambda x: len(x))
conteo_filtrado = lengths_filtrado.value_counts()
conteo_filtrado

category
8    6846
4    1229
9    1033
7     542
5     436
6      31
Name: count, dtype: int64

In [67]:
filtro = metadata_filtrado['tags'] == 'supply store'

# Elimina las filas que cumplen con el filtro
metadata_filtrado = metadata_filtrado.drop(metadata_filtrado[filtro].index)

In [68]:
metadata_filtrado = metadata_filtrado.reset_index(drop=True)

In [69]:
metadata_filtrado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10117 entries, 0 to 10116
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   gmap_id       10117 non-null  object 
 1   name          10117 non-null  object 
 2   address       10117 non-null  object 
 3   city          10117 non-null  object 
 4   category      10117 non-null  object 
 5   state         10117 non-null  object 
 6   tags          10117 non-null  object 
 7   postal_code   10117 non-null  object 
 8   latitude      10117 non-null  float64
 9   longitude     10117 non-null  float64
 10  stars         10117 non-null  float64
 11  review_count  10117 non-null  int64  
 12  hours         8861 non-null   object 
 13  url           10117 non-null  object 
dtypes: float64(3), int64(1), object(10)
memory usage: 1.1+ MB


In [70]:
null_counts = metadata_filtrado.isnull().sum(axis=0)
null_counts

gmap_id            0
name               0
address            0
city               0
category           0
state              0
tags               0
postal_code        0
latitude           0
longitude          0
stars              0
review_count       0
hours           1256
url                0
dtype: int64

In [71]:
null_counts = metadata_filtrado.isnull().sum(axis=0)
null_counts

gmap_id            0
name               0
address            0
city               0
category           0
state              0
tags               0
postal_code        0
latitude           0
longitude          0
stars              0
review_count       0
hours           1256
url                0
dtype: int64

In [72]:
# Eliminar la columna 'hours'
metadata_filtrado = metadata_filtrado.drop('hours', axis=1)

In [73]:
metadata_filtrado = metadata_filtrado.reset_index(drop=True)

In [74]:
metadata_filtrado.to_parquet('datasets/procesados/metadata-restaurants-FL.parquet')