In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.0/774.0 kB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: regex, nltk
Successfully installed nltk-3.8.1 regex-2023.12.25


# ETL Google Maps datasets

In [3]:
from google.cloud import storage
import pyarrow.parquet as pq
import pandas as pd
import json
import nltk
nltk.download("vader_lexicon")
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...


In [4]:
storage_client = storage.Client()

## Metadata_sitios

In [5]:
def read_parquet_from_gcs(bucket_name, file_path):

    # Especifica la ruta completa del archivo en GCS
    gcs_path = f'gs://{bucket_name}/{file_path}'

    # Lee directamente desde GCS sin descargar localmente
    table = pq.read_table(gcs_path)

    # (Opcional) Convierte los datos a DataFrame de Pandas
    df = table.to_pandas()

    return df

In [6]:
bucket_name = 'project_yelp_parquet'
file_path_user = 'metadata-sitios.parquet'
metadata = read_parquet_from_gcs(bucket_name, file_path_user)
metadata.head()

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,Porter Pharmacy,"Porter Pharmacy, 129 N Second St, Cochran, GA ...",0x88f16e41928ff687:0x883dad4fd048e8f8,,32.3883,-83.3571,[Pharmacy],4.9,16,,"[[Friday, 8AM–6PM], [Saturday, 8AM–12PM], [Sun...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 6PM,"[0x88f16e41929435cf:0x5b2532a2885e9ef6, 0x88f1...",https://www.google.com/maps/place//data=!4m2!3...
1,City Textile,"City Textile, 3001 E Pico Blvd, Los Angeles, C...",0x80c2c98c0e3c16fd:0x29ec8a728764fdf9,,34.018891,-118.21529,[Textile exporter],4.5,6,,,,Open now,"[0x80c2c624136ea88b:0xb0315367ed448771, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
2,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,,34.058092,-118.29213,[Korean restaurant],4.4,18,,"[[Thursday, 6:30AM–6PM], [Friday, 6:30AM–6PM],...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 6PM,"[0x80c2c78249aba68f:0x35bf16ce61be751d, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
3,Nova Fabrics,"Nova Fabrics, 2200 E 11th St, Los Angeles, CA ...",0x80c2c89923b27a41:0x32041559418d447,,34.023669,-118.23293,[Fabric store],3.3,6,,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...","{'Accessibility': None, 'Activities': None, 'A...",Open ⋅ Closes 5PM,"[0x80c2c8811477253f:0x23a8a492df1918f7, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
4,Nobel Textile Co,"Nobel Textile Co, 719 E 9th St, Los Angeles, C...",0x80c2c632f933b073:0xc31785961fe826a6,,34.036694,-118.249421,[Fabric store],4.3,7,,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...","{'Accessibility': None, 'Activities': None, 'A...",Open ⋅ Closes 5PM,"[0x80c2c62c496083d1:0xdefa11317fe870a1, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...


In [7]:
# Contabilizar numero de filas duplicadas tanto para 'name' como 'adress'
numero_filas_duplicadas = metadata.duplicated(['name', 'address']).sum()

In [8]:
# Eliminar filas duplicadas tanto para 'name' como 'adress'
metadata = metadata.drop_duplicates(['name', 'address'])

In [9]:
# Contabilizar filas nulas
cantidad_filas_nulas = metadata.isnull().sum()
cantidad_filas_nulas

name                      1
address               77169
gmap_id                   0
description         2740110
latitude                  0
longitude                 0
category              17171
avg_rating                0
num_of_reviews            0
price               2719501
hours                777733
MISC                 680269
state                737124
relative_results     289875
url                       0
dtype: int64

In [10]:
# Filtrar columnas deseadas
metadata = metadata[['name', 'address', 'gmap_id', 'latitude', 'longitude','category', 'avg_rating', 'num_of_reviews', 'hours', 'url']]

In [11]:
# Elimnar nulos para la columna 'category'
metadata = metadata.dropna(subset=['category'])

In [12]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2975760 entries, 0 to 3025010
Data columns (total 10 columns):
 #   Column          Dtype  
---  ------          -----  
 0   name            object 
 1   address         object 
 2   gmap_id         object 
 3   latitude        float64
 4   longitude       float64
 5   category        object 
 6   avg_rating      float64
 7   num_of_reviews  int64  
 8   hours           object 
 9   url             object 
dtypes: float64(3), int64(1), object(6)
memory usage: 249.7+ MB


In [14]:
metadata["category"] = metadata["category"].apply(lambda x: ' '.join(word.capitalize() for word in str(x).split()))

In [15]:
print(metadata['category'])

0                                               ['pharmacy']
1                                       ['textile Exporter']
2                                      ['korean Restaurant']
3                                           ['fabric Store']
4                                           ['fabric Store']
                                 ...                        
3025006    ['steak House' 'fine Dining Restaurant' 'seafo...
3025007                                 ['chevrolet Dealer']
3025008                                     ['veterinarian']
3025009                                ['animal Feed Store']
3025010                                          ['lodging']
Name: category, Length: 2975760, dtype: object


In [16]:
import re

def extraer_nombre_y_direccion(address, name):
    # Verificar si address o name es NaN
    if pd.isna(address) or pd.isna(name):
        return None
    
    # Escapar caracteres especiales en el nombre
    escaped_name = re.escape(name)
    
    # Utilizamos expresiones regulares para encontrar el nombre seguido de una coma y un espacio en la dirección
    match = re.search(rf"\b{escaped_name}, \b", address)
    if match:
        # Si se encuentra el nombre seguido de una coma y un espacio en la dirección, extraemos la parte de la dirección después de eso
        parte_direccion = address[match.end():].strip()
        # Devolvemos la parte de la dirección después del nombre y la coma seguida de un espacio
        return parte_direccion
    
# Aplicar la funcion a la columna address corrigiendo los valores existentes
metadata['address'] = metadata.apply(lambda x: extraer_nombre_y_direccion(x['address'], x['name']), axis=1)

In [17]:
metadata.head()

Unnamed: 0,name,address,gmap_id,latitude,longitude,category,avg_rating,num_of_reviews,hours,url
0,Porter Pharmacy,"129 N Second St, Cochran, GA 31014",0x88f16e41928ff687:0x883dad4fd048e8f8,32.3883,-83.3571,['pharmacy'],4.9,16,"[[Friday, 8AM–6PM], [Saturday, 8AM–12PM], [Sun...",https://www.google.com/maps/place//data=!4m2!3...
1,City Textile,"3001 E Pico Blvd, Los Angeles, CA 90023",0x80c2c98c0e3c16fd:0x29ec8a728764fdf9,34.018891,-118.21529,['textile Exporter'],4.5,6,,https://www.google.com/maps/place//data=!4m2!3...
2,San Soo Dang,"761 S Vermont Ave, Los Angeles, CA 90005",0x80c2c778e3b73d33:0xbdc58662a4a97d49,34.058092,-118.29213,['korean Restaurant'],4.4,18,"[[Thursday, 6:30AM–6PM], [Friday, 6:30AM–6PM],...",https://www.google.com/maps/place//data=!4m2!3...
3,Nova Fabrics,"2200 E 11th St, Los Angeles, CA 90021",0x80c2c89923b27a41:0x32041559418d447,34.023669,-118.23293,['fabric Store'],3.3,6,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...",https://www.google.com/maps/place//data=!4m2!3...
4,Nobel Textile Co,"719 E 9th St, Los Angeles, CA 90021",0x80c2c632f933b073:0xc31785961fe826a6,34.036694,-118.249421,['fabric Store'],4.3,7,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...",https://www.google.com/maps/place//data=!4m2!3...


In [18]:
metadata['address'] = metadata['address'].str.replace(', United States', '').str.strip()

In [19]:
# Dividir la cadena de dirección en partes utilizando la coma como delimitador
address_parts = metadata['address'].str.rsplit(', ', n=2, expand=True)

# Asignar las partes a las columnas correspondientes
metadata['address'] = address_parts[0]
metadata['city'] = address_parts[1]
metadata['state'] = address_parts[2]

# Dividir la cadena de dirección en partes utilizando la coma como delimitador
state_parts = metadata['state'].str.split(' ', expand=True)

# Asignar las partes a las columnas correspondientes
metadata['state'] = state_parts[0]
metadata['postal_code'] = state_parts[1]

In [20]:
# Convertir valores a enteros
#metadata['avg_rating'] = metadata['avg_rating'].round().astype(int)

# Renombrar columna 'avg_rating' a 'stars'
metadata.rename(columns={'avg_rating': 'stars'}, inplace=True)

# Renombrar columna 'category' a 'categories'
metadata.rename(columns={'category': 'categories'}, inplace=True)

# Renombrar columna 'num_of_reviews' a 'review_count'
metadata.rename(columns={'num_of_reviews': 'review_count'}, inplace=True)

# Reordenar columnas
metadata = metadata[['name', 'address','city','state','postal_code','latitude','longitude','stars','review_count','categories','hours','gmap_id','url']]

In [23]:
cantidad_filas_nulas = metadata.isnull().sum()
cantidad_filas_nulas

name                 1
address          85336
city            113332
state           203228
postal_code     203574
latitude             0
longitude            0
stars                0
review_count         0
categories           0
hours           763701
gmap_id              0
url                  0
dtype: int64

In [21]:
metadata

Unnamed: 0,name,address,city,state,postal_code,latitude,longitude,stars,review_count,categories,hours,gmap_id,url
0,Porter Pharmacy,129 N Second St,Cochran,GA,31014,32.388300,-83.357100,4.9,16,['pharmacy'],"[[Friday, 8AM–6PM], [Saturday, 8AM–12PM], [Sun...",0x88f16e41928ff687:0x883dad4fd048e8f8,https://www.google.com/maps/place//data=!4m2!3...
1,City Textile,3001 E Pico Blvd,Los Angeles,CA,90023,34.018891,-118.215290,4.5,6,['textile Exporter'],,0x80c2c98c0e3c16fd:0x29ec8a728764fdf9,https://www.google.com/maps/place//data=!4m2!3...
2,San Soo Dang,761 S Vermont Ave,Los Angeles,CA,90005,34.058092,-118.292130,4.4,18,['korean Restaurant'],"[[Thursday, 6:30AM–6PM], [Friday, 6:30AM–6PM],...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,https://www.google.com/maps/place//data=!4m2!3...
3,Nova Fabrics,2200 E 11th St,Los Angeles,CA,90021,34.023669,-118.232930,3.3,6,['fabric Store'],"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...",0x80c2c89923b27a41:0x32041559418d447,https://www.google.com/maps/place//data=!4m2!3...
4,Nobel Textile Co,719 E 9th St,Los Angeles,CA,90021,34.036694,-118.249421,4.3,7,['fabric Store'],"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...",0x80c2c632f933b073:0xc31785961fe826a6,https://www.google.com/maps/place//data=!4m2!3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3025006,Steak 48,260 S Broad St,Philadelphia,PA,19102,39.947254,-75.164953,4.6,308,['steak House' 'fine Dining Restaurant' 'seafo...,"[[Monday, 4–10PM], [Tuesday, 4–10PM], [Wednesd...",0x89c6c74f43a49b55:0x6be6995921c58b12,https://www.google.com/maps/place//data=!4m2!3...
3025007,"Jack Mcnerney Chevrolet, Inc.",363 NY-281,Tully,NY,13159,42.788636,-76.122120,4.4,143,['chevrolet Dealer'],"[[Monday, 8:30AM–7PM], [Tuesday, 8:30AM–7PM], ...",0x89da1787d3fdc4a5:0xce92ea6e3cd4d1cc,https://www.google.com/maps/place//data=!4m2!3...
3025008,Central Ny Spay Neuter Assista,17 Salisbury St,Cortland,NY,13045,42.603831,-76.165286,4.3,88,['veterinarian'],,0x89da6b9ce832726f:0xbd0d3630f1e3b42e,https://www.google.com/maps/place//data=!4m2!3...
3025009,Ok Feed Store,16300 SW 296th St,Homestead,FL,33033,25.491641,-80.456076,4.9,13,['animal Feed Store'],"[[Monday, 9AM–6PM], [Tuesday, 9AM–6PM], [Wedne...",0x88d9e872a9e39f3d:0x5de29f50507e5d73,https://www.google.com/maps/place//data=!4m2!3...


In [24]:
# Convertir valores a float
metadata['latitude'] = metadata['latitude'].astype(float)
metadata['longitude'] = metadata['longitude'].astype(float)
metadata['stars'] = metadata['stars'].astype(float)

Filtros

In [25]:
# Filtrar las filas donde el estado es Florida
metadata = metadata.loc[metadata['state'] == 'FL']

In [26]:
# Lista de palabras clave gatronomia
gastronomia = ['Restaurant', 'Restaurants','Food','Bars','Bar','Café','Coffeehouse','Bistro','Tavern','Buffet','Brewpub','Pub','Brasserie','Specialty Coffee Shop','Pub','Churrería','Diner','Dining','Teahouse','Tea Room','Gas Station', 'Gas','Fuel Station','Fuel']

# Filtrar los registros que contienen al menos una palabra clave en 'categories'
metadata_gastronomia = metadata[metadata['categories'].str.contains('|'.join(gastronomia))]

# Lista de palabras clave comida ethnica
comida_ethnica = ['Chinese','Indian','Thai','Italian','Greek','Helthy','Helth','Latin','Mexican','Tacos','Burritos','Enchiladas','Argentinian','Vegan','Vegetarian','Peruvian','Ceviche','Lomo','Pisco','Colombian','Empanadas','Arepas','Asian','Japanese','Sushi','Ramen','Sashimi','Tempura','Korean','Kimchi','Vietnamese','African','Ethiopian','Nigerian','Middle Eastern','Lebanese','Hummus','Falafel','Shawarma','Tabbouleh','Israeli','Shakshuka','Falafel','Hummus','Iranian','Healthy','Vegetarian','Vegan','Gluten-free','Gluten-Free','Fresh','Seasonal','Casual','Gas Station', 'Gas','Fuel Station','Fuel']

# Filtrar los registros que contienen al menos una palabra clave en 'categories'
df_metadata_ethnica = metadata_gastronomia[metadata_gastronomia['categories'].str.contains('|'.join(comida_ethnica))]

In [27]:
df_metadata_ethnica

Unnamed: 0,name,address,city,state,postal_code,latitude,longitude,stars,review_count,categories,hours,gmap_id,url
8818,Circle K,100 Gateway Cir,Jacksonville,FL,32259,30.063509,-81.505388,1.8,8,['convenience Store' 'atm' 'diesel Fuel Suppli...,"[[Wednesday, Open 24 hours], [Thursday, Open 2...",0x88e42d57522561e1:0x2aabd91210f232e9,https://www.google.com/maps/place//data=!4m2!3...
10030,Kangaroo Express,1209 Monument Rd,Jacksonville,FL,32225,30.342057,-81.537668,3.1,26,['convenience Store' 'atm' 'coffee Shop' 'dies...,"[[Wednesday, Open 24 hours], [Thursday, Open 2...",0x88e5b4a1d816111b:0x38d7071333903369,https://www.google.com/maps/place//data=!4m2!3...
10061,Trastes,323 US 17 92 Highway N,Haines City,FL,33844,28.112577,-81.617701,5.0,2,['nuevo Latino Restaurant' 'restaurant'],"[[Wednesday, Closed], [Thursday, 12–6PM], [Fri...",0x88dd734f09df440d:0x868d00fdf7efd200,https://www.google.com/maps/place//data=!4m2!3...
16060,Kangaroo,392 N Hathaway Ave,Bronson,FL,32621,29.447731,-82.642001,3.6,7,['convenience Store' 'coffee Shop' 'diesel Fue...,"[[Wednesday, 6AM–11PM], [Thursday, 6AM–11PM], ...",0x88e8ec2cbe4ec8e3:0x33cd95344d6390a3,https://www.google.com/maps/place//data=!4m2!3...
19557,Texaco,7148 Philips Hwy,Jacksonville,FL,32256,30.243029,-81.599758,1.3,8,['gas Station' 'convenience Store' 'diesel Fue...,,0x88e5cafa069f1f99:0x9872a19ecfdab3db,https://www.google.com/maps/place//data=!4m2!3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3009432,Circle K,6120 Van Dyke Rd,Lutz,FL,33558,28.127648,-82.542244,3.1,18,['convenience Store' 'atm' 'coffee Shop' 'coff...,"[[Monday, Open 24 hours], [Tuesday, Open 24 ho...",0x88c2bfb97dc032f5:0x3890eb2a89f08db4,https://www.google.com/maps/place//data=!4m2!3...
3015421,China King,3801 W Lake Mary Blvd #133,Lake Mary,FL,32746,28.755528,-81.345292,3.7,168,['delivery Chinese Restaurant'],"[[Monday, 11AM–10PM], [Tuesday, 11AM–10PM], [W...",0x88e772a5dea9255b:0xa65936cb4269f473,https://www.google.com/maps/place//data=!4m2!3...
3016386,Circle K,10595 66th St N,Pinellas Park,FL,33782,27.868198,-82.728213,3.1,18,['convenience Store' 'atm' 'coffee Shop' 'coff...,"[[Monday, Open 24 hours], [Tuesday, Open 24 ho...",0x88c2e4d6a36e72c3:0x5a66ffaf05211a10,https://www.google.com/maps/place//data=!4m2!3...
3024509,Cochic Gourmet,1631 SE 3rd Ct,Deerfield Beach,FL,33441,26.314034,-80.083215,4.6,225,['brazilian Restaurant' 'brunch Restaurant' 'b...,"[[Monday, Closed], [Tuesday, 12–9PM], [Wednesd...",0x88d8e263b35016c9:0x4e5fbaced9eef545,https://www.google.com/maps/place//data=!4m2!3...


In [28]:
# Filtrar los registros donde 'name' se repite más de una vez
# Estos registros se consideraran como franquicias al identificarse con el mismo nombre y encontrarse en distintas locaciones
metadata = df_metadata_ethnica[df_metadata_ethnica.duplicated(subset='name', keep=False)]
metadata

Unnamed: 0,name,address,city,state,postal_code,latitude,longitude,stars,review_count,categories,hours,gmap_id,url
8818,Circle K,100 Gateway Cir,Jacksonville,FL,32259,30.063509,-81.505388,1.8,8,['convenience Store' 'atm' 'diesel Fuel Suppli...,"[[Wednesday, Open 24 hours], [Thursday, Open 2...",0x88e42d57522561e1:0x2aabd91210f232e9,https://www.google.com/maps/place//data=!4m2!3...
10030,Kangaroo Express,1209 Monument Rd,Jacksonville,FL,32225,30.342057,-81.537668,3.1,26,['convenience Store' 'atm' 'coffee Shop' 'dies...,"[[Wednesday, Open 24 hours], [Thursday, Open 2...",0x88e5b4a1d816111b:0x38d7071333903369,https://www.google.com/maps/place//data=!4m2!3...
16060,Kangaroo,392 N Hathaway Ave,Bronson,FL,32621,29.447731,-82.642001,3.6,7,['convenience Store' 'coffee Shop' 'diesel Fue...,"[[Wednesday, 6AM–11PM], [Thursday, 6AM–11PM], ...",0x88e8ec2cbe4ec8e3:0x33cd95344d6390a3,https://www.google.com/maps/place//data=!4m2!3...
19557,Texaco,7148 Philips Hwy,Jacksonville,FL,32256,30.243029,-81.599758,1.3,8,['gas Station' 'convenience Store' 'diesel Fue...,,0x88e5cafa069f1f99:0x9872a19ecfdab3db,https://www.google.com/maps/place//data=!4m2!3...
57779,Kangaroo Express,100 W Miller St,Fruitland Park,FL,34731,28.861915,-81.907142,3.3,8,['convenience Store' 'atm' 'coffee Shop' 'coff...,"[[Tuesday, Open 24 hours], [Wednesday, Open 24...",0x88e7c0e2d7f1ab67:0xb5f9c4c5983a5055,https://www.google.com/maps/place//data=!4m2!3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2996185,Circle K,10030 University Blvd,Orlando,FL,32817,28.596891,-81.244226,2.9,8,['convenience Store' 'atm' 'coffee Shop' 'coff...,"[[Monday, Open 24 hours], [Tuesday, Open 24 ho...",0x88e768b657b1fadb:0xd145ad3192dea3fe,https://www.google.com/maps/place//data=!4m2!3...
2998114,Kangaroo Express,7590 E Hwy 25,Belleview,FL,34420,29.058680,-82.027716,4.0,14,['convenience Store' 'atm' 'coffee Shop' 'coff...,"[[Monday, Open 24 hours], [Tuesday, Open 24 ho...",0x88e7ce3b7ebd1233:0x437c333afec965ac,https://www.google.com/maps/place//data=!4m2!3...
3005983,Circle K,1591 S Woodland Blvd,DeLand,FL,32724,28.999583,-81.302173,4.0,108,['convenience Store' 'atm' 'coffee Shop' 'coff...,"[[Monday, Open 24 hours], [Tuesday, Open 24 ho...",0x88e71b022de8e289:0xc6d28435dc83a045,https://www.google.com/maps/place//data=!4m2!3...
3009432,Circle K,6120 Van Dyke Rd,Lutz,FL,33558,28.127648,-82.542244,3.1,18,['convenience Store' 'atm' 'coffee Shop' 'coff...,"[[Monday, Open 24 hours], [Tuesday, Open 24 ho...",0x88c2bfb97dc032f5:0x3890eb2a89f08db4,https://www.google.com/maps/place//data=!4m2!3...


## Reviews

In [29]:
# Importacion de dataset review-Florida
bucket_name = 'project_yelp_parquet'
file_path_user = 'reviews-estados/review-Florida.parquet'
df_review_state = read_parquet_from_gcs(bucket_name, file_path_user)

In [30]:
def limpiar_dataframe(df):
    # Conservar solo las columnas especificadas
    df = df[['user_id', 'name', 'time', 'rating', 'gmap_id']]
    
    # Eliminar filas duplicadas
    df = df.drop_duplicates()
    
    # Eliminar filas completamente nulas
    df = df.dropna(how='all')
    
    # Resetear el índice
    df = df.reset_index(drop=True)
    
    return df

In [31]:
df_review_state = limpiar_dataframe(df_review_state)

In [32]:
# Obtener los 'gmap_id' únicos de metadata
gmap_ids_metadata = metadata['gmap_id'].unique()

# Filtrar los registros de df_review_Florida que coinciden con los 'gmap_id' de metadata
df_review_state = df_review_state[df_review_state['gmap_id'].isin(gmap_ids_metadata)]

In [33]:
import datetime

def convertir_timestamp(timestamp):
    # Convertir milisegundos a segundos y luego crear objeto de fecha y hora
    fecha_hora = datetime.datetime.fromtimestamp(timestamp / 1000)
    # Redondear para mostrar solo hora, minuto y segundo
    fecha_hora_redondeada = fecha_hora.replace(microsecond=0)
    return fecha_hora_redondeada

In [34]:
# Aplicar la función a la columna 'time' del DataFrame df_review
df_review_state['time'] = df_review_state['time'].apply(convertir_timestamp)

In [35]:
df_review_state['time']

3549      2021-04-24 17:29:22
3550      2020-09-26 12:31:24
3551      2019-12-11 01:09:44
3552      2017-05-23 23:57:29
3553      2020-02-12 15:07:40
                  ...        
2563145   2018-11-19 19:45:37
2563146   2020-06-09 00:14:17
2563147   2021-03-01 11:45:24
2563148   2019-01-08 19:22:34
2563149   2018-10-08 08:30:14
Name: time, Length: 5821, dtype: datetime64[ns]

In [36]:
# Dividir la columna 'time' en 'date' y 'HH'
df_review_state['date'] = pd.to_datetime(df_review_state['time']).dt.date
df_review_state['hour'] = pd.to_datetime(df_review_state['time']).dt.strftime('%H:%M:%S')

# Eliminar la columna 'time' 
del df_review_state['time']

# Renombrar columna 'rating' a 'stars'
df_review_state.rename(columns={'rating': 'stars'}, inplace=True)

# Convertir valores 'stars' a float
df_review_state['stars'].astype(float)

# Mostrar el DataFrame resultante
df_review_state.head()

Unnamed: 0,user_id,name,stars,text,gmap_id,date,hour
3549,107911502804324987451,Heather Diekman,1,The cashier was the first person I ever met. A...,0x88e5b4a1d816111b:0x38d7071333903369,2021-04-24,17:29:22
3550,103474868622180529503,Heather M,3,Cashier acted as though she was bothered by cu...,0x88e5b4a1d816111b:0x38d7071333903369,2020-09-26,12:31:24
3551,118146142741420564479,Mark Burns,4,Small and convenient gas station.,0x88e5b4a1d816111b:0x38d7071333903369,2019-12-11,01:09:44
3552,108947419249227108145,Gary Daughtry,4,As good as any other older style convenience s...,0x88e5b4a1d816111b:0x38d7071333903369,2017-05-23,23:57:29
3553,116973803927966787920,Cody Miller,1,Employees roam outside on smoke breaks all day...,0x88e5b4a1d816111b:0x38d7071333903369,2020-02-12,15:07:40


In [37]:
# Definir función para obtener la puntuación de sentimiento
def get_sentiment_score(text):
    # Instanciar el modelo de análisis de sentimiento
    sia = SentimentIntensityAnalyzer()
    
    if pd.isnull(text) or text == "":
        return 1  # Valor neutral si el texto está vacío o es NaN
    elif isinstance(text, str):
        # Realizar análisis de sentimiento
        sentiment = sia.polarity_scores(text)
        compound_score = sentiment["compound"]

        # Escalar la puntuación entre 1 y 4
        score = int(round(5 * compound_score))

        # Asignar la puntuación
        if score <= 2:
            return 1
        elif score <= 3:
            return 2
        elif score <= 4:
            return 3
        else:
            return 4
    else:
        return 1  # Valor neutral para datos que no son de tipo cadena


In [38]:
# Asegurarse de que la columna 'text' sea de tipo cadena
df_review_state["text"] = df_review_state["text"].astype(str)

# Aplicar la función de análisis de sentimiento a la columna 'text' y crear una nueva columna 'sentiment_analysis'
df_review_state["sentiment_analysis"] = df_review_state["text"].apply(get_sentiment_score)

# Convertir la columna 'sentiment_analysis' al tipo de dato int
df_review_state["sentiment_analysis"] = df_review_state["sentiment_analysis"].astype(int)

In [39]:
df_review_state.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5821 entries, 3549 to 2563149
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   user_id             5821 non-null   object
 1   name                5821 non-null   object
 2   stars               5821 non-null   int64 
 3   text                5821 non-null   object
 4   gmap_id             5821 non-null   object
 5   date                5821 non-null   object
 6   hour                5821 non-null   object
 7   sentiment_analysis  5821 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 409.3+ KB


Exportar dataframes

In [40]:
from gcsfs import GCSFileSystem
def df_to_csv(dataframe, file_name):

    # Bucket name y file path
    bucket_name = 'project_yelp_parquet'
    file_path_user = 'archivos_csv/'

    # Ruta completa del archivo en GCS
    gcs_path = f'gs://{bucket_name}/{file_path_user}{file_name}'

    # Guardar el DataFrame como un archivo CSV en GCS
    with GCSFileSystem().open(gcs_path, 'w') as f:
        dataframe.to_csv(f, index=False, encoding='utf-8')

In [41]:
# Exportacion de datasets resultantes
metadata = df_to_csv(metadata, 'archivos_csv/datasets/procesados/df_metadata_procesado.csv')

florida = df_to_csv(df_review_state, 'archivos_csv/datasets/procesados/df_review_state_procesado.csv')