# ETL Google Maps datasets

In [48]:
# Importacion de librerias necesarias para procesamiento
import pandas as pd
import json
import datetime
from datetime import datetime
from textblob import TextBlob

## Metadata_sitios

In [49]:
metadata = pd.read_csv('metadata_raw.csv')

In [50]:
# Contabilizar numero de filas duplicadas tanto para 'name' como 'adress'
numero_filas_duplicadas = metadata.duplicated(['name', 'address']).sum()

In [51]:
# Eliminar filas duplicadas tanto para 'name' como 'adress'
metadata = metadata.drop_duplicates(['name', 'address'])

In [52]:
# Contabilizar filas nulas
cantidad_filas_nulas = metadata.isnull().sum()
cantidad_filas_nulas

name                  5
address               0
gmap_id               0
latitude              0
longitude             0
category           1368
avg_rating            0
num_of_reviews        0
hours             49366
url                   0
dtype: int64

#### Limpieza y transformacion de datos

In [53]:
# Filtrar columnas deseadas
metadata = metadata[['name', 'address', 'gmap_id', 'latitude', 'longitude','category', 'avg_rating', 'num_of_reviews', 'hours', 'url']]

In [54]:
# Elimnar nulos para la columna 'category'
metadata = metadata.dropna(subset=['category'])

In [55]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 214248 entries, 0 to 215615
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   name            214243 non-null  object 
 1   address         214248 non-null  object 
 2   gmap_id         214248 non-null  object 
 3   latitude        214248 non-null  float64
 4   longitude       214248 non-null  float64
 5   category        214248 non-null  object 
 6   avg_rating      214248 non-null  float64
 7   num_of_reviews  214248 non-null  int64  
 8   hours           166042 non-null  object 
 9   url             214248 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 18.0+ MB


In [56]:
def extraer_nombre_y_direccion(address, name):
    import re
    # Verificar si address o name es NaN
    if pd.isna(address) or pd.isna(name):
        return None
    
    # Escapar caracteres especiales en el nombre
    escaped_name = re.escape(name)
    
    # Utilizamos expresiones regulares para encontrar el nombre seguido de una coma y un espacio en la dirección
    match = re.search(rf"\b{escaped_name}, \b", address)
    if match:
        # Si se encuentra el nombre seguido de una coma y un espacio en la dirección, extraemos la parte de la dirección después de eso
        parte_direccion = address[match.end():].strip()
        # Devolvemos la parte de la dirección después del nombre y la coma seguida de un espacio
        return parte_direccion
    
# Aplicar la funcion a la columna address corrigiendo los valores existentes
metadata['address'] = metadata.apply(lambda x: extraer_nombre_y_direccion(x['address'], x['name']), axis=1)

# Visualizar resultado
metadata.head()

Unnamed: 0,name,address,gmap_id,latitude,longitude,category,avg_rating,num_of_reviews,hours,url
0,NTK OUTDOOR,"2315 NW 107th Ave #1B18, Miami, FL 33172",0x88d9beb4fe0532c1:0xef0555c169299d6,25.795204,-80.366038,['Corporate office'],5.0,35,"[['Thursday', '9AM–6PM'], ['Friday', '9AM–6PM'...",https://www.google.com/maps/place//data=!4m2!3...
1,Cruises Inc. - Connie Stewart,"6602 52nd Ln, Pinellas Park, FL 33781",0x88c2e49b79f06c31:0x4ed8c8ce27e926e0,27.832187,-82.704805,['Cruise agency'],5.0,2,"[['Thursday', '9AM–5PM'], ['Friday', '9AM–5PM'...",https://www.google.com/maps/place//data=!4m2!3...
3,First Impressions Barbershop Inc.,"577 Deltona Blvd suite #9, Deltona, FL 32725",0x88e711bd2244fe3b:0x8406dd780f0574d1,28.877581,-81.281276,['Barber shop'],4.0,8,"[['Thursday', '8AM–8PM'], ['Friday', '8AM–8PM'...",https://www.google.com/maps/place//data=!4m2!3...
4,"Brian Shaheen, MD","2421 Thomas Dr, Panama City, FL 32408",0x8893863ea87bd5dd:0x9383ebf973e74abb,30.159982,-85.752277,"['Family practice physician', 'General practit...",4.2,18,"[['Thursday', '8AM–5PM'], ['Friday', '8AM–5PM'...",https://www.google.com/maps/place//data=!4m2!3...
5,Cape Seafood Shack,"603 Del Prado Blvd S, Cape Coral, FL 33990",0x88db4147b1d9e6f3:0x943dbd10a92ba1b1,26.641377,-81.940545,['Restaurant'],5.0,1,,https://www.google.com/maps/place//data=!4m2!3...


In [57]:
# Eliminamos el campo 'pais' de address ya que solo trabajamos con Estados Unidos
metadata['address'] = metadata['address'].str.replace(', United States', '').str.strip()

In [58]:
# Dividir la cadena de dirección en partes utilizando la coma como delimitador
address_parts = metadata['address'].str.rsplit(', ', n=2, expand=True)

# Asignar las partes a las columnas correspondientes
metadata['address'] = address_parts[0]
metadata['city'] = address_parts[1]
metadata['state'] = address_parts[2]

# Dividir la cadena de dirección en partes utilizando la coma como delimitador
state_parts = metadata['state'].str.split(' ', expand=True)

# Asignar las partes a las columnas correspondientes
metadata['state'] = state_parts[0]
metadata['postal_code'] = state_parts[1]

In [59]:
# Renombrar columna 'avg_rating' a 'stars'
metadata.rename(columns={'avg_rating': 'stars'}, inplace=True)

# Renombrar columna 'category' a 'tags'
metadata.rename(columns={'category': 'tags'}, inplace=True)

# Renombrar columna 'num_of_reviews' a 'review_count'
metadata.rename(columns={'num_of_reviews': 'review_count'}, inplace=True)

# Reordenar columnas
metadata = metadata[['gmap_id','name', 'address','city','state','postal_code','latitude','longitude','stars','review_count','tags','hours','url']]

In [60]:
cantidad_filas_nulas = metadata.isnull().sum()
cantidad_filas_nulas

gmap_id             0
name                5
address           390
city              570
state            5449
postal_code      5465
latitude            0
longitude           0
stars               0
review_count        0
tags                0
hours           48206
url                 0
dtype: int64

In [61]:
# Convertir valores a float
metadata['latitude'] = metadata['latitude'].astype(float)
metadata['longitude'] = metadata['longitude'].astype(float)
metadata['stars'] = metadata['stars'].astype(float)

#### Filtros

In [62]:
# Filtrar las filas donde el estado es Florida
metadata = metadata.loc[metadata['state'] == 'FL']

In [63]:
# Capitalizar todas las palabras en la columna 'tags'
metadata['tags'] = metadata['tags'].str.title()

In [64]:
# Lista de palabras clave gatronomia
gastronomia = ['Restaurant', 'Restaurants','Food','Bars','Bar','Café','Coffeehouse','Bistro','Tavern','Buffet','Brewpub','Pub','Brasserie','Specialty Coffee Shop','Pub','Churrería','Diner','Dining','Teahouse','Tea Room','Gas Station', 'Gas','Fuel Station','Fuel']

# Filtrar los registros que contienen al menos una palabra clave en 'tags'
metadata_gastronomia = metadata[metadata['tags'].str.contains('|'.join(gastronomia))]

# Lista de palabras clave comida ethnica
comida_ethnica = ['Chinese','Indian','Thai','Italian','Greek','Helthy','Helth','Latin','Mexican','Tacos','Burritos','Enchiladas','Argentinian','Vegan','Vegetarian','Peruvian','Ceviche','Lomo','Pisco','Colombian','Empanadas','Arepas','Asian','Japanese','Sushi','Ramen','Sashimi','Tempura','Korean','Kimchi','Vietnamese','African','Ethiopian','Nigerian','Middle Eastern','Lebanese','Hummus','Falafel','Shawarma','Tabbouleh','Israeli','Shakshuka','Falafel','Hummus','Iranian','Healthy','Vegetarian','Vegan','Gas Station', 'Gas','Fuel Station','Fuel']

# Filtrar los registros que contienen al menos una palabra clave en 'tags'
df_metadata_ethnica = metadata_gastronomia[metadata_gastronomia['tags'].str.contains('|'.join(comida_ethnica))]

In [65]:
# Filtrar los registros donde 'name' se repite más de una vez
# Estos registros se consideraran como franquicias al identificarse con el mismo nombre y encontrarse en distintas locaciones
metadata = df_metadata_ethnica[df_metadata_ethnica.duplicated(subset='name', keep=False)]

In [66]:
# Función para encontrar palabras clave en la columna 'tags' y simplificar clasificacion restaurantes

def encontrar_palabra_clave(tags):

    # Lista de palabras clave que determian categorias mas abarcativas/comunes
    palabras_clave = ['Chinese','Indian','Thai','Italian','Greek','Mexican','Argentinian','Peruvian','Colombian','Japanese','Korean','Vietnamese','Ethiopian','Nigerian','Lebanese','Israeli','Iranian','Latin','Asian','African','Middle Eastern','Healthy','Vegetarian','Vegan','Casual','Gas Station']
    # Diccionario para determinar categoria segun palabras claves secundarias
    sub_tags = {  'Mexican' : ['Tacos','Burritos','Enchiladas'],
                        'Perubian' : ['Ceviche','Pisco'],
                        'Argentinian' : ['Asado','Locro','Dulce de Leche','Choripan'],
                        'Colombian' : ['Arepas'],
                        'Latin' : ['Empanadas','Lomo'],
                        'Asian' : ['Sushi','Ramen','Pan-Asian'],
                        'Korean' : ['Kimchi'],
                        'Japanese' : ['Sashimi','Tempura'],
                        'Middle_estern' : ['Hummus','Falafel','Shawarma','Tabbouleh'],
                        'Israeli' : ['Shakshuka'],
                        'Gas Station' : ['Gas', 'Gas Station', 'Convenience Store', 'Atm', 'Coffee Shop','Fuel Supplier','Fuel']}


    # Buscar en palabras_clave
    for palabra in palabras_clave:
        if palabra in tags:
            return palabra
    
    # Buscar en sub_tags
    for clave, valor in sub_tags.items():
        if any(sub_categoria in tags for sub_categoria in valor):
            return clave
        
    # Para aquellos registros donde se etiquete como desconocido, revisar columna tags para agregar etiquetas al diccionario sub_categories y poder reclasificarlo
    return 'Desconocido'


In [67]:
# Encontrar el índice de la columna 'tags'
tags_index = metadata.columns.get_loc('tags')

# Insertar la nueva columna 'categories' antes de 'tags'
metadata.insert(tags_index - 1, 'categories', metadata['tags'].apply(encontrar_palabra_clave))

# Completar la columna categories con la palabra clave encontrada
metadata.loc[:, 'categories'] = metadata['tags'].apply(encontrar_palabra_clave)

metadata.reset_index(drop=True)

Unnamed: 0,gmap_id,name,address,city,state,postal_code,latitude,longitude,stars,categories,review_count,tags,hours,url
0,0x88e635378f43352f:0xa1b53c63436fa428,Shell,"15877 E, FL-40",Silver Springs,FL,34488,29.183272,-81.889965,1.8,Gas Station,5,"['Gas Station', 'Atm', 'Convenience Store', 'R...","[['Thursday', 'Open 24 hours'], ['Friday', 'Op...",https://www.google.com/maps/place//data=!4m2!3...
1,0x88d901cf9b30e949:0x45ea9286f8c596c2,Exxon,621 W Broward Blvd,Fort Lauderdale,FL,33312,26.122500,-80.150300,5.0,Gas Station,1,['Gas Station'],"[['Thursday', 'Open 24 hours'], ['Friday', 'Op...",https://www.google.com/maps/place//data=!4m2!3...
2,0x88e8a13da9b3fea3:0x59a99647bb951969,Circle K,4310 SE Hawthorne Rd,Gainesville,FL,32641,29.635547,-82.272016,3.5,Gas Station,4,"['Convenience Store', 'Atm', 'Coffee Shop', 'G...","[['Thursday', 'Open 24 hours'], ['Friday', 'Op...",https://www.google.com/maps/place//data=!4m2!3...
3,0x88db592776a2303f:0x217bffde2d2e30c8,7 Eleven,26920 Jones Loop Rd,Punta Gorda,FL,33982,26.896282,-81.995669,5.0,Gas Station,1,['Gas Station'],,https://www.google.com/maps/place//data=!4m2!3...
4,0x88e76156b262d32b:0x1db1a887284e0fd8,Speedway,10024 Lee Vista Blvd,Orlando,FL,32829,28.477886,-81.246483,4.1,Gas Station,17,"['Convenience Store', 'Gas Station']","[['Wednesday', 'Open 24 hours'], ['Thursday', ...",https://www.google.com/maps/place//data=!4m2!3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3464,0x88dd3fdb842040c5:0xe68b63c06e077638,Mobil,1315 US-98,Lakeland,FL,33801,28.026260,-81.927710,4.3,Gas Station,8,['Gas Station'],"[['Saturday', 'Open 24 hours'], ['Sunday', 'Op...",https://www.google.com/maps/place//data=!4m2!3...
3465,0x88dc5595622d279b:0x25248868a3fad9ad,Chevron,1412 Memorial Dr,Avon Park,FL,33825,27.595469,-81.493968,4.5,Gas Station,18,['Gas Station'],,https://www.google.com/maps/place//data=!4m2!3...
3466,0x88e6d78ad375c7bf:0x85f0d88e4e13e6bc,Speedway,4590 S Ridgewood Ave,Port Orange,FL,32127,29.137442,-80.984760,3.6,Gas Station,8,"['Convenience Store', 'Gas Station']","[['Saturday', '5AM–11PM'], ['Sunday', '5AM–11P...",https://www.google.com/maps/place//data=!4m2!3...
3467,0x88db39443c074c89:0xb20f9cd6006f0bc2,Circle K,20021 Summerlin Rd,Fort Myers,FL,33908,26.492749,-81.968294,3.4,Gas Station,7,"['Convenience Store', 'Atm', 'Coffee Shop', 'C...","[['Saturday', 'Open 24 hours'], ['Sunday', 'Op...",https://www.google.com/maps/place//data=!4m2!3...


## Review

In [68]:
df_review_state = pd.read_csv('review_state_raw.csv')

#### Limpieza y transformacion de datos

In [69]:
# Funcion para limpiar dataframe
def limpiar_dataframe(df):
    # Conservar solo las columnas especificadas
    df = df[['user_id','name','time','rating','text','gmap_id']]
    
    # Eliminar filas duplicadas
    df = df.drop_duplicates()
    
    # Eliminar filas completamente nulas
    df = df.dropna(how='all')
    
    # Resetear el índice
    df = df.reset_index(drop=True)
    
    return df

In [70]:
# Aplicar la funcion limpiar_dataframe a df_review_state
df_review_state = limpiar_dataframe(df_review_state)

In [71]:
# Obtener los 'gmap_id' únicos de metadata
gmap_ids_metadata = metadata['gmap_id'].unique()

# Filtrar los registros de df_review_Florida que coinciden con los 'gmap_id' de metadata
df_review_state = df_review_state[df_review_state['gmap_id'].isin(gmap_ids_metadata)]

In [72]:
def convertir_timestamp(timestamp):
    import datetime
    # Convertir milisegundos a segundos y luego crear objeto de fecha y hora
    fecha_hora = datetime.datetime.fromtimestamp(timestamp / 1000)
    # Redondear para mostrar solo hora, minuto y segundo
    fecha_hora_redondeada = fecha_hora.replace(microsecond=0)
    return fecha_hora_redondeada

In [73]:
# Aplicar la función a la columna 'time' del DataFrame df_review
df_review_state['time'] = df_review_state['time'].apply(convertir_timestamp)
df_review_state['time']

2803      2019-08-23 10:03:30
2804      2019-06-20 17:17:01
2805      2017-02-07 19:25:47
2806      2017-06-14 17:03:59
2807      2017-09-07 21:22:14
                  ...        
2729422   2019-10-20 18:36:25
2729423   2018-05-21 16:56:51
2729424   2018-04-09 00:38:20
2729425   2018-04-07 12:36:01
2729426   2019-01-22 00:27:36
Name: time, Length: 92104, dtype: datetime64[ns]

In [74]:
# Dividir la columna 'time' en 'date' y 'HH'
df_review_state['date'] = pd.to_datetime(df_review_state['time']).dt.date
df_review_state['hour'] = pd.to_datetime(df_review_state['time']).dt.strftime('%H:%M:%S')

# Eliminar la columna 'time' 
del df_review_state['time']

# Renombrar columna 'rating' a 'stars'
df_review_state.rename(columns={'rating': 'stars'}, inplace=True)

# Convertir valores 'stars' a float
df_review_state['stars'].astype(float)

# Mostrar el DataFrame resultante
df_review_state.head()

Unnamed: 0,user_id,name,stars,text,gmap_id,date,hour
2803,1.035138e+20,Vanessa Calderon,1,I just came in and wished I would have went to...,0x88e76156b262d32b:0x1db1a887284e0fd8,2019-08-23,10:03:30
2804,1.121645e+20,Georgimar Trujillo Briceno,1,The service of air is bad and tpi is only for ...,0x88e76156b262d32b:0x1db1a887284e0fd8,2019-06-20,17:17:01
2805,1.092032e+20,Chase Jackson,4,Not a bad place at all.. Now that Wawa is acro...,0x88e76156b262d32b:0x1db1a887284e0fd8,2017-02-07,19:25:47
2806,1.160331e+20,Carlos A Bonano,5,Excellent customer service.cheap price.,0x88e76156b262d32b:0x1db1a887284e0fd8,2017-06-14,17:03:59
2807,1.173752e+20,Manuel Camacho,4,,0x88e76156b262d32b:0x1db1a887284e0fd8,2017-09-07,21:22:14


In [75]:
# Convertir la cadena '2021-01-01' a un objeto datetime.date
date_threshold = datetime.strptime('2021-01-01', '%Y-%m-%d').date()

# Filtrar las filas a partir del 2021 
df_review_state = df_review_state[df_review_state['date'] >= date_threshold].reset_index(drop=True)

#### Sentiment Analysis

In [76]:
# Definir función para obtener la puntuación de sentimiento
def get_sentiment_score(text):
    if pd.isnull(text) or text == "":
        return 1  # Valor neutral si el texto está vacío o es NaN
    elif isinstance(text, str):
        # Crear un objeto TextBlob
        blob = TextBlob(text)

        # Obtener la puntuación de sentimiento
        sentiment_score = blob.sentiment.polarity

        # Escalar la puntuación entre 1 y 5
        scaled_score = int(round(5 * sentiment_score))

        # Asignar la puntuación
        if scaled_score <= 2:
            return 1
        elif scaled_score <= 3:
            return 2
        elif scaled_score <= 4:
            return 3
        else:
            return 4
    else:
        return 1  # Valor neutral para datos que no son de tipo cadena


In [77]:
# Asegurarse de que la columna 'text' sea de tipo cadena
df_review_state["text"] = df_review_state["text"].astype(str)

# Aplicar la función de análisis de sentimiento a la columna 'text' y crear una nueva columna 'sentiment_analysis'
df_review_state["sentiment_analysis"] = df_review_state["text"].apply(get_sentiment_score)

# Convertir la columna 'sentiment_analysis' al tipo de dato int
df_review_state["sentiment_analysis"] = df_review_state["sentiment_analysis"].astype(int)

# Elimnar columna 'text'
df_review_state.drop(["text"], axis=1, inplace=True)

In [78]:
df_review_state.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9519 entries, 0 to 9518
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             9519 non-null   float64
 1   name                9519 non-null   object 
 2   stars               9519 non-null   int64  
 3   gmap_id             9519 non-null   object 
 4   date                9519 non-null   object 
 5   hour                9519 non-null   object 
 6   sentiment_analysis  9519 non-null   int32  
dtypes: float64(1), int32(1), int64(1), object(4)
memory usage: 483.5+ KB


## Exportacion de Datos

In [79]:
# Exportacion de datasets resultantes
metadata.to_csv('datasets/procesados/metadata_procesado_GoogleMaps.csv', index=False)

df_review_state.to_csv('datasets/procesados/review_state_procesado_GoogleMaps.csv', index=False)