# ETL_Google

Importamos las librerías necesarias para comenzar con el proceso de ETL.

In [80]:
import os
import json 
import numpy as np
import pandas as pd
import herramientas
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

Leemos los archivos json de la carpeta "California"

In [81]:
# Reader for a folder of review files (used here for California; the state is chosen via folder_path)
def read_json_files(folder_path):
    """Load every JSON-lines file in a folder into a single DataFrame.

    Each ``*.json`` file directly inside ``folder_path`` is read line by line.
    Every non-blank line is parsed as one JSON document and becomes one row.
    Lines that are not valid JSON are skipped (best-effort load).

    Args:
        folder_path: Directory that contains the ``.json`` files.

    Returns:
        pandas.DataFrame with one row per successfully parsed line; empty if
        nothing could be parsed.
    """
    records = []
    # os.listdir returns names in arbitrary order; sorting makes the row order reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Explicit UTF-8: the platform default codec can fail on non-ASCII review text (e.g. emoji).
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError:
                    # Malformed line: drop it and keep loading.
                    continue

    return pd.DataFrame(records)

Leemos cada json y lo convertimos en un dataframe y lo agregamos a una lista de dataframe

In [82]:
# Read all California review files and start the list of frames to concatenate.
folder_path = "data/google/reviews-estados/review-California"
df_california = read_json_files(folder_path)
df_list = [df_california]

Unimos todos los dataframes generados en uno

In [83]:
# Combine the frames collected in df_list into one DataFrame with a fresh 0..n-1 index.
df_california_concat = pd.concat(df_list, ignore_index=True)

In [84]:
# Display the combined California reviews (raw columns, before cleaning).
df_california_concat

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,108991152262655788985,Song Ro,1609909927056,5,Love there korean rice cake.,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
1,111290322219796215751,Rafa Robles,1612849648663,5,Good very good,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
2,112640357449611959087,David Han,1583643882296,4,They make Korean traditional food very properly.,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
3,117440349723823658676,Anthony Kim,1551938216355,5,Short ribs are very delicious.,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
4,100580770836123539210,Mario Marzouk,1494910901933,5,Great food and prices the portions are large,,,0x80c2c778e3b73d33:0xbdc58662a4a97d49
...,...,...,...,...,...,...,...,...
2699995,111342337087018931578,Byunguk Kim,1573362674409,5,,,,0x80c2bea30829f279:0x39aa953ee93734ed
2699996,108081820251273699976,Ceveda Craytonhooks,1580608853411,5,Beautiful salon. The staff and atmosphere were...,,,0x8094675073616747:0x9f935a9b9046a9ba
2699997,117591383135333249676,Natacha Thompson,1528597806165,5,Flo is a great beautician. She is very patie...,[{'url': ['https://lh5.googleusercontent.com/p...,"{'time': 1528598932831, 'text': 'Thank you! I...",0x8094675073616747:0x9f935a9b9046a9ba
2699998,112426610655792883265,Eleanor Aikins,1551203379295,5,I Been going to have a variety of hair style a...,,,0x8094675073616747:0x9f935a9b9046a9ba


Elimino las columnas 'pics' y 'resp'

In [85]:
# Drop the columns that are not used downstream.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell re-runnable:
# a second execution no longer raises KeyError for columns that are already gone.
df_california_concat = df_california_concat.drop(columns=["pics", "resp"], errors="ignore")

Cambiamos el formato de la columna 'time'

In [86]:
# Convert the "time" column from integer epoch milliseconds to pandas datetimes.
# NOTE(review): the result is timezone-naive; presumably the epoch values are UTC, so the
# 'hour'/'date' columns derived from it would be UTC rather than local time — confirm.
df_california_concat['time'] = pd.to_datetime(df_california_concat['time'], unit='ms')

Creamos las columnas  'date' y 'hour'

In [87]:
# Derive two string columns from "time": 'hour' (HH:MM:SS) and 'date' (YYYY-MM-DD).
# "time" is already datetime64 at this point, so the .dt accessor is used directly; no
# re-parse with pd.to_datetime is needed (re-parsing an unconverted integer column without
# unit='ms' would silently read the millisecond values as nanoseconds).
df_california_concat['hour'] = df_california_concat['time'].dt.strftime('%H:%M:%S')
df_california_concat['date'] = df_california_concat['time'].dt.strftime('%Y-%m-%d')

Elimino la columna 'time'

In [88]:
# 'hour' and 'date' now carry the timestamp information, so "time" is removed.
# Reassignment with errors="ignore" keeps the cell safe to re-run.
df_california_concat = df_california_concat.drop(columns=["time"], errors="ignore")

Verifico valores nulos

In [89]:
# Report the Python types and the null share/count of every column (project helper module `herramientas`).
herramientas.verifica_tipo_y_nulos(df_california_concat)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,user_id,[<class 'str'>],100.0,0.0,0
1,name,[<class 'str'>],100.0,0.0,0
2,rating,[<class 'int'>],100.0,0.0,0
3,text,"[<class 'str'>, <class 'NoneType'>]",56.63,43.37,1170964
4,gmap_id,[<class 'str'>],100.0,0.0,0
5,hour,[<class 'str'>],100.0,0.0,0
6,date,[<class 'str'>],100.0,0.0,0


Observamos el dataframe

In [90]:
# Display the cleaned California reviews: 'hour' and 'date' replace 'time'; 'pics' and 'resp' are gone.
df_california_concat

Unnamed: 0,user_id,name,rating,text,gmap_id,hour,date
0,108991152262655788985,Song Ro,5,Love there korean rice cake.,0x80c2c778e3b73d33:0xbdc58662a4a97d49,05:12:07,2021-01-06
1,111290322219796215751,Rafa Robles,5,Good very good,0x80c2c778e3b73d33:0xbdc58662a4a97d49,05:47:28,2021-02-09
2,112640357449611959087,David Han,4,They make Korean traditional food very properly.,0x80c2c778e3b73d33:0xbdc58662a4a97d49,05:04:42,2020-03-08
3,117440349723823658676,Anthony Kim,5,Short ribs are very delicious.,0x80c2c778e3b73d33:0xbdc58662a4a97d49,05:56:56,2019-03-07
4,100580770836123539210,Mario Marzouk,5,Great food and prices the portions are large,0x80c2c778e3b73d33:0xbdc58662a4a97d49,05:01:41,2017-05-16
...,...,...,...,...,...,...,...
2699995,111342337087018931578,Byunguk Kim,5,,0x80c2bea30829f279:0x39aa953ee93734ed,05:11:14,2019-11-10
2699996,108081820251273699976,Ceveda Craytonhooks,5,Beautiful salon. The staff and atmosphere were...,0x8094675073616747:0x9f935a9b9046a9ba,02:00:53,2020-02-02
2699997,117591383135333249676,Natacha Thompson,5,Flo is a great beautician. She is very patie...,0x8094675073616747:0x9f935a9b9046a9ba,02:30:06,2018-06-10
2699998,112426610655792883265,Eleanor Aikins,5,I Been going to have a variety of hair style a...,0x8094675073616747:0x9f935a9b9046a9ba,17:49:39,2019-02-26


Exporto los datos en un archivo csv

In [91]:
# Write the cleaned California reviews to CSV, without the index column.
df_california_concat.to_csv(
    path_or_buf='california.csv',
    index=False,
    escapechar='\\',
)

Leemos los archivos json de la carpeta "Florida"

In [92]:
# Reader for a folder of review files (used here for Florida).
# NOTE: the function is state-agnostic (folder_path selects the state) and duplicates the
# definition given earlier in this notebook.
def read_json_files(folder_path):
    """Load every JSON-lines file in a folder into a single DataFrame.

    Each ``*.json`` file directly inside ``folder_path`` is read line by line.
    Every non-blank line is parsed as one JSON document and becomes one row.
    Lines that are not valid JSON are skipped (best-effort load).

    Args:
        folder_path: Directory that contains the ``.json`` files.

    Returns:
        pandas.DataFrame with one row per successfully parsed line; empty if
        nothing could be parsed.
    """
    records = []
    # os.listdir returns names in arbitrary order; sorting makes the row order reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Explicit UTF-8: the platform default codec can fail on non-ASCII review text (e.g. emoji).
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError:
                    # Malformed line: drop it and keep loading.
                    continue

    return pd.DataFrame(records)

Leemos cada json y lo convertimos en un dataframe y lo agregamos a una lista de dataframe

In [93]:
# Read all Florida review files and start the list of frames to concatenate.
folder_path = "data/google/reviews-estados/review-Florida"
df_florida = read_json_files(folder_path)
df_list = [df_florida]

Unimos todos los dataframes generados en uno

In [94]:
# Combine the frames collected in df_list into one DataFrame with a fresh 0..n-1 index.
df_florida_concat = pd.concat(df_list, ignore_index=True)

In [95]:
# Display the combined Florida reviews (raw columns, before cleaning).
df_florida_concat

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,101471856155148729010,Julie A. Gerber,1628003250740,1,Update: Their “reply” to my review amounted to...,,"{'time': 1627042799532, 'text': 'Thank you for...",0x8893863ea87bd5dd:0x9383ebf973e74abb
1,115477234789038326051,Martin Sheffield,1595031217005,5,He's a knowledgeable doctor but the way he run...,,"{'time': 1582464056733, 'text': 'Thank you for...",0x8893863ea87bd5dd:0x9383ebf973e74abb
2,101805010244892834381,Brian Truett,1522924253567,5,"Best doctor I've ever had, I never wait to be ...",,,0x8893863ea87bd5dd:0x9383ebf973e74abb
3,106344422881493743981,Tina Sun,1467907819586,1,I was told he is a good doctor. I was trying t...,,,0x8893863ea87bd5dd:0x9383ebf973e74abb
4,100875113069561776529,James Haynes,1480683415081,5,Takes the time to actually get to know his pat...,,,0x8893863ea87bd5dd:0x9383ebf973e74abb
...,...,...,...,...,...,...,...,...
2849995,100287714800809186330,James Rudolph,1617305583696,5,,,"{'time': 1617561671234, 'text': 'Thank you!'}",0x8890966585e36d3f:0x131d47c2c60a8d31
2849996,114098933097423358884,Vincent Alexander,1581817848416,3,,,,0x8890966585e36d3f:0x131d47c2c60a8d31
2849997,112344804812177444770,Brett Owen,1551240010030,5,,,,0x8890966585e36d3f:0x131d47c2c60a8d31
2849998,105446471831430265409,ashly kindle,1521152570004,5,,,"{'time': 1516914199348, 'text': 'Thank you, As...",0x8890966585e36d3f:0x131d47c2c60a8d31


Elimino las columnas 'pics' y 'resp'

In [96]:
# Drop the columns that are not used downstream.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell re-runnable:
# a second execution no longer raises KeyError for columns that are already gone.
df_florida_concat = df_florida_concat.drop(columns=["pics", "resp"], errors="ignore")

Cambiamos el formato de la columna 'time'

In [97]:
# Convert the "time" column from integer epoch milliseconds to pandas datetimes.
# NOTE(review): the result is timezone-naive; presumably the epoch values are UTC, so the
# 'hour'/'date' columns derived from it would be UTC rather than local time — confirm.
df_florida_concat['time'] = pd.to_datetime(df_florida_concat['time'], unit='ms')

Creamos las columnas  'date' y 'hour'

In [98]:
# Derive two string columns from "time": 'hour' (HH:MM:SS) and 'date' (YYYY-MM-DD).
# "time" is already datetime64 at this point, so the .dt accessor is used directly; no
# re-parse with pd.to_datetime is needed (re-parsing an unconverted integer column without
# unit='ms' would silently read the millisecond values as nanoseconds).
df_florida_concat['hour'] = df_florida_concat['time'].dt.strftime('%H:%M:%S')
df_florida_concat['date'] = df_florida_concat['time'].dt.strftime('%Y-%m-%d')

Elimino la columna 'time'

In [99]:
# 'hour' and 'date' now carry the timestamp information, so "time" is removed.
# Reassignment with errors="ignore" keeps the cell safe to re-run.
df_florida_concat = df_florida_concat.drop(columns=["time"], errors="ignore")

In [100]:
# Report the Python types and the null share/count of every column (project helper module `herramientas`).
herramientas.verifica_tipo_y_nulos(df_florida_concat)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,user_id,[<class 'str'>],100.0,0.0,0
1,name,[<class 'str'>],100.0,0.0,0
2,rating,[<class 'int'>],100.0,0.0,0
3,text,"[<class 'str'>, <class 'NoneType'>]",62.12,37.88,1079510
4,gmap_id,[<class 'str'>],100.0,0.0,0
5,hour,[<class 'str'>],100.0,0.0,0
6,date,[<class 'str'>],100.0,0.0,0


In [101]:
# Write the cleaned Florida reviews to CSV, without the index column.
df_florida_concat.to_csv(
    path_or_buf='florida.csv',
    index=False,
    escapechar='\\',
)

Leemos los archivos json de la carpeta "Illinois"

In [102]:
# Reader for a folder of review files (used here for Illinois).
# NOTE: the function is state-agnostic (folder_path selects the state) and duplicates the
# definition given earlier in this notebook.
def read_json_files(folder_path):
    """Load every JSON-lines file in a folder into a single DataFrame.

    Each ``*.json`` file directly inside ``folder_path`` is read line by line.
    Every non-blank line is parsed as one JSON document and becomes one row.
    Lines that are not valid JSON are skipped (best-effort load).

    Args:
        folder_path: Directory that contains the ``.json`` files.

    Returns:
        pandas.DataFrame with one row per successfully parsed line; empty if
        nothing could be parsed.
    """
    records = []
    # os.listdir returns names in arbitrary order; sorting makes the row order reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Explicit UTF-8: the platform default codec can fail on non-ASCII review text (e.g. emoji).
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError:
                    # Malformed line: drop it and keep loading.
                    continue

    return pd.DataFrame(records)

Leemos cada json y lo convertimos en un dataframe y lo agregamos a una lista de dataframe

In [103]:
# Read all Illinois review files and start the list of frames to concatenate.
folder_path = "data/google/reviews-estados/review-Illinois"
df_illinois = read_json_files(folder_path)
df_list = [df_illinois]

Unimos todos los dataframes generados en uno

In [104]:
# Combine the frames collected in df_list into one DataFrame with a fresh 0..n-1 index.
df_illinois_concat = pd.concat(df_list, ignore_index=True)

In [105]:
# Display the combined Illinois reviews (raw columns, before cleaning).
df_illinois_concat

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,109442577269176737834,Jim LeBurkien,1571842572977,5,Great selection of used and collectible toys. ...,,,0x880f0f1b32155555:0x8347e0c971acb955
1,100924480938771474648,Steve Muscat,1561679275695,5,Great visit as usual. 👍😊,,,0x880f0f1b32155555:0x8347e0c971acb955
2,100813976190002245469,Fletcher Oliver,1572719516502,5,,,,0x880f0f1b32155555:0x8347e0c971acb955
3,110168499961967096526,Daniel Chavez,1572641473424,5,,,,0x880f0f1b32155555:0x8347e0c971acb955
4,108990598812734370097,David Ferdinand,1561483351910,4,,,,0x880f0f1b32155555:0x8347e0c971acb955
...,...,...,...,...,...,...,...,...
2099995,105552273253338237445,Cindy Neath,1546825827465,5,Like them,,,0x8808bbdd68f8761b:0x8c518beb56b4f3ca
2099996,102035592083831756668,David Mauer,1541157502008,5,Loved It!,,,0x8808bbdd68f8761b:0x8c518beb56b4f3ca
2099997,100212235055035923068,Bloodkisses333,1517804540149,5,Local and awesome!,,,0x8808bbdd68f8761b:0x8c518beb56b4f3ca
2099998,109400103612749522118,Kevin Sircloumb,1553131495401,4,Very convenient to get in and out,,,0x8808bbdd68f8761b:0x8c518beb56b4f3ca


Elimino las columnas 'pics' y 'resp'

In [106]:
# Drop the columns that are not used downstream.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell re-runnable:
# a second execution no longer raises KeyError for columns that are already gone.
df_illinois_concat = df_illinois_concat.drop(columns=["pics", "resp"], errors="ignore")

Cambiamos el formato de la columna 'time'

In [107]:
# Convert the "time" column from integer epoch milliseconds to pandas datetimes.
# NOTE(review): the result is timezone-naive; presumably the epoch values are UTC, so the
# 'hour'/'date' columns derived from it would be UTC rather than local time — confirm.
df_illinois_concat['time'] = pd.to_datetime(df_illinois_concat['time'], unit='ms')

Creamos las columnas  'date' y 'hour'

In [108]:
# Derive two string columns from "time": 'hour' (HH:MM:SS) and 'date' (YYYY-MM-DD).
# "time" is already datetime64 at this point, so the .dt accessor is used directly; no
# re-parse with pd.to_datetime is needed (re-parsing an unconverted integer column without
# unit='ms' would silently read the millisecond values as nanoseconds).
df_illinois_concat['hour'] = df_illinois_concat['time'].dt.strftime('%H:%M:%S')
df_illinois_concat['date'] = df_illinois_concat['time'].dt.strftime('%Y-%m-%d')

Elimino la columna 'time'

In [109]:
# 'hour' and 'date' now carry the timestamp information, so "time" is removed.
# Reassignment with errors="ignore" keeps the cell safe to re-run.
df_illinois_concat = df_illinois_concat.drop(columns=["time"], errors="ignore")

In [110]:
# Report the Python types and the null share/count of every column (project helper module `herramientas`).
herramientas.verifica_tipo_y_nulos(df_illinois_concat)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,user_id,[<class 'str'>],100.0,0.0,0
1,name,[<class 'str'>],100.0,0.0,0
2,rating,[<class 'int'>],100.0,0.0,0
3,text,"[<class 'str'>, <class 'NoneType'>]",56.56,43.44,912218
4,gmap_id,[<class 'str'>],100.0,0.0,0
5,hour,[<class 'str'>],100.0,0.0,0
6,date,[<class 'str'>],100.0,0.0,0


In [111]:
# Write the cleaned Illinois reviews to CSV, without the index column.
df_illinois_concat.to_csv(
    path_or_buf='illinois.csv',
    index=False,
    escapechar='\\',
)

Leemos los archivos json de la carpeta "New York"

In [112]:
# Reader for a folder of review files (used here for New York).
# NOTE: the function is state-agnostic (folder_path selects the state) and duplicates the
# definition given earlier in this notebook.
def read_json_files(folder_path):
    """Load every JSON-lines file in a folder into a single DataFrame.

    Each ``*.json`` file directly inside ``folder_path`` is read line by line.
    Every non-blank line is parsed as one JSON document and becomes one row.
    Lines that are not valid JSON are skipped (best-effort load).

    Args:
        folder_path: Directory that contains the ``.json`` files.

    Returns:
        pandas.DataFrame with one row per successfully parsed line; empty if
        nothing could be parsed.
    """
    records = []
    # os.listdir returns names in arbitrary order; sorting makes the row order reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Explicit UTF-8: the platform default codec can fail on non-ASCII review text (e.g. emoji).
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError:
                    # Malformed line: drop it and keep loading.
                    continue

    return pd.DataFrame(records)

Leemos cada json y lo convertimos en un dataframe y lo agregamos a una lista de dataframe

In [113]:
# Read all New York review files and start the list of frames to concatenate.
folder_path = "data/google/reviews-estados/review-New_York"
df_new_york = read_json_files(folder_path)
df_list = [df_new_york]

Unimos todos los dataframes generados en uno

In [114]:
# Combine the frames collected in df_list into one DataFrame with a fresh 0..n-1 index.
df_new_york_concat = pd.concat(df_list, ignore_index=True)

In [115]:
# Display the combined New York reviews (raw columns, before cleaning).
df_new_york_concat

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,113722104692308235141,Alvin Martinez,1603494795361,5,I'm late to posting this but this store especi...,[{'url': ['https://lh5.googleusercontent.com/p...,,0x89c25fc9494dce47:0x6d63c807b59a55
1,107293441492109320298,Johnnie Jackson,1620157037403,1,Very dissatisfied I did not get my phone the p...,,"{'time': 1620268360920, 'text': 'We pride ours...",0x89c25fc9494dce47:0x6d63c807b59a55
2,100378585801819400296,Manie Blazer,1597431662039,5,Excellent very well done with professional car...,,,0x89c25fc9494dce47:0x6d63c807b59a55
3,114998161153019826512,Fashion Fiinds,1543773862044,5,Basing my review strictly on the service I rec...,,"{'time': 1543855317372, 'text': 'Thanks for th...",0x89c25fc9494dce47:0x6d63c807b59a55
4,117178185728422297915,Andres Rieloff,1597279097718,1,Bad! Disorganized. I'm being totally honest. I...,,,0x89c25fc9494dce47:0x6d63c807b59a55
...,...,...,...,...,...,...,...,...
2699995,115088845698717434859,Gourav Saha,1555437628373,4,,,,0x89de0b8a0905153d:0x976fc4a006084f03
2699996,111706397232791116806,jenn mosher,1462985945347,5,,,,0x89de0b8a0905153d:0x976fc4a006084f03
2699997,108369784075976765619,Michele Huck,1573877341097,5,,,,0x89de0b8a0905153d:0x976fc4a006084f03
2699998,100786787316351751498,Frank B,1603835563283,5,,,,0x89de0b8a0905153d:0x976fc4a006084f03


Elimino las columnas 'pics' y 'resp'

In [116]:
# Drop the columns that are not used downstream.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell re-runnable:
# a second execution no longer raises KeyError for columns that are already gone.
df_new_york_concat = df_new_york_concat.drop(columns=["pics", "resp"], errors="ignore")

Cambiamos el formato de la columna 'time'

In [117]:
# Convert the "time" column from integer epoch milliseconds to pandas datetimes.
# NOTE(review): the result is timezone-naive; presumably the epoch values are UTC, so the
# 'hour'/'date' columns derived from it would be UTC rather than local time — confirm.
df_new_york_concat['time'] = pd.to_datetime(df_new_york_concat['time'], unit='ms')

Creamos las columnas  'date' y 'hour'

In [118]:
# Derive two string columns from "time": 'hour' (HH:MM:SS) and 'date' (YYYY-MM-DD).
# "time" is already datetime64 at this point, so the .dt accessor is used directly; no
# re-parse with pd.to_datetime is needed (re-parsing an unconverted integer column without
# unit='ms' would silently read the millisecond values as nanoseconds).
df_new_york_concat['hour'] = df_new_york_concat['time'].dt.strftime('%H:%M:%S')
df_new_york_concat['date'] = df_new_york_concat['time'].dt.strftime('%Y-%m-%d')

Elimino la columna 'time'

In [119]:
# 'hour' and 'date' now carry the timestamp information, so "time" is removed.
# Reassignment with errors="ignore" keeps the cell safe to re-run.
df_new_york_concat = df_new_york_concat.drop(columns=["time"], errors="ignore")

In [120]:
# Report the Python types and the null share/count of every column (project helper module `herramientas`).
herramientas.verifica_tipo_y_nulos(df_new_york_concat)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,user_id,[<class 'str'>],100.0,0.0,0
1,name,[<class 'str'>],100.0,0.0,0
2,rating,[<class 'int'>],100.0,0.0,0
3,text,"[<class 'str'>, <class 'NoneType'>]",56.9,43.1,1163593
4,gmap_id,[<class 'str'>],100.0,0.0,0
5,hour,[<class 'str'>],100.0,0.0,0
6,date,[<class 'str'>],100.0,0.0,0


In [121]:
# Write the cleaned New York reviews to CSV, without the index column.
df_new_york_concat.to_csv(
    path_or_buf='new_york.csv',
    index=False,
    escapechar='\\',
)

Leemos los archivos json de la carpeta "Texas"

In [122]:
# Reader for a folder of review files (used here for Texas).
# NOTE: the function is state-agnostic (folder_path selects the state) and duplicates the
# definition given earlier in this notebook.
def read_json_files(folder_path):
    """Load every JSON-lines file in a folder into a single DataFrame.

    Each ``*.json`` file directly inside ``folder_path`` is read line by line.
    Every non-blank line is parsed as one JSON document and becomes one row.
    Lines that are not valid JSON are skipped (best-effort load).

    Args:
        folder_path: Directory that contains the ``.json`` files.

    Returns:
        pandas.DataFrame with one row per successfully parsed line; empty if
        nothing could be parsed.
    """
    records = []
    # os.listdir returns names in arbitrary order; sorting makes the row order reproducible.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, file_name)
        # Explicit UTF-8: the platform default codec can fail on non-ASCII review text (e.g. emoji).
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    continue
                try:
                    records.append(json.loads(line))
                except json.JSONDecodeError:
                    # Malformed line: drop it and keep loading.
                    continue

    return pd.DataFrame(records)

Leemos cada json y lo convertimos en un dataframe y lo agregamos a una lista de dataframe

In [123]:
# Read all Texas review files and start the list of frames to concatenate.
folder_path = "data/google/reviews-estados/review-Texas"
df_texas = read_json_files(folder_path)
df_list = [df_texas]

Unimos todos los dataframes generados en uno

In [124]:
# Combine the frames collected in df_list into one DataFrame with a fresh 0..n-1 index.
df_texas_concat = pd.concat(df_list, ignore_index=True)

In [125]:
# Display the combined Texas reviews (raw columns, before cleaning).
df_texas_concat

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,110545299078970317447,Kimberly Feger,1625431734826,5,"The pharmacist, Erin, is phenomenal. She was s...",,,0x864c3998b8d8dc83:0x57ffabe1e2322320
1,103619788097506115343,Briana Streit,1568336283913,2,I gave them 2 stars because they offer prescri...,,,0x864c3998b8d8dc83:0x57ffabe1e2322320
2,101916663109091637233,Sylvia Caudillo,1519873586872,1,If I could put minus stars I would. This has t...,,,0x864c3998b8d8dc83:0x57ffabe1e2322320
3,117870898304582507607,Ginger Kinyon,1571638384637,1,Please fix your restroom doors,,,0x864c3998b8d8dc83:0x57ffabe1e2322320
4,110397346115416712442,Angeles Arellano,1546568877087,1,This pharmacy Walmart dose not work not come a...,,,0x864c3998b8d8dc83:0x57ffabe1e2322320
...,...,...,...,...,...,...,...,...
2296819,102168135687638614085,t williams,1562883151018,5,,,,0x8640d215ff1e43e5:0xaeeb5211a25191f4
2296820,105417309184712126911,Erika Lira,1532009672494,5,,,,0x8640d215ff1e43e5:0xaeeb5211a25191f4
2296821,110785699186536003973,Jon Brent,1567968578697,5,,,,0x8640d215ff1e43e5:0xaeeb5211a25191f4
2296822,109167601957430874067,lunna cabal,1506476840285,4,,,,0x8640d215ff1e43e5:0xaeeb5211a25191f4


Elimino las columnas 'pics' y 'resp'

In [126]:
# Drop the columns that are not used downstream.
# Reassigning with errors="ignore" (instead of inplace=True) keeps this cell re-runnable:
# a second execution no longer raises KeyError for columns that are already gone.
df_texas_concat = df_texas_concat.drop(columns=["pics", "resp"], errors="ignore")

Cambiamos el formato de la columna 'time'

In [127]:
# Convert the "time" column from integer epoch milliseconds to pandas datetimes.
# NOTE(review): the result is timezone-naive; presumably the epoch values are UTC, so the
# 'hour'/'date' columns derived from it would be UTC rather than local time — confirm.
df_texas_concat['time'] = pd.to_datetime(df_texas_concat['time'], unit='ms')

In [128]:
# Derive two string columns from "time": 'hour' (HH:MM:SS) and 'date' (YYYY-MM-DD).
# "time" is already datetime64 at this point, so the .dt accessor is used directly; no
# re-parse with pd.to_datetime is needed (re-parsing an unconverted integer column without
# unit='ms' would silently read the millisecond values as nanoseconds).
df_texas_concat['hour'] = df_texas_concat['time'].dt.strftime('%H:%M:%S')
df_texas_concat['date'] = df_texas_concat['time'].dt.strftime('%Y-%m-%d')

Elimino la columna 'time'

In [129]:
# 'hour' and 'date' now carry the timestamp information, so "time" is removed.
# Reassignment with errors="ignore" keeps the cell safe to re-run.
df_texas_concat = df_texas_concat.drop(columns=["time"], errors="ignore")

In [130]:
# Report the Python types and the null share/count of every column (project helper module `herramientas`).
herramientas.verifica_tipo_y_nulos(df_texas_concat)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,user_id,[<class 'str'>],100.0,0.0,0
1,name,[<class 'str'>],100.0,0.0,0
2,rating,[<class 'int'>],100.0,0.0,0
3,text,"[<class 'str'>, <class 'NoneType'>]",59.98,40.02,919209
4,gmap_id,[<class 'str'>],100.0,0.0,0
5,hour,[<class 'str'>],100.0,0.0,0
6,date,[<class 'str'>],100.0,0.0,0


In [131]:
# Write the cleaned Texas reviews to CSV, without the index column.
df_texas_concat.to_csv(
    path_or_buf='texas.csv',
    index=False,
    escapechar='\\',
)

Abrimos la carpeta de Sitios 

In [132]:
# Reader for the site-metadata folder
def merge_json_files(folder_path):
    """Load every JSON-lines file in a folder into a single DataFrame.

    Each ``*.json`` file directly inside ``folder_path`` is read line by line.
    Every non-blank line is parsed as one JSON document and becomes one row.
    A line that is not valid JSON is reported on stdout and skipped.

    Args:
        folder_path: Directory that contains the ``.json`` files.

    Returns:
        pandas.DataFrame with one row per successfully parsed line; empty if
        nothing could be parsed.
    """
    merged_data = []  # parsed JSON objects from all files

    # os.listdir returns names in arbitrary order; sorting makes the row order reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        filepath = os.path.join(folder_path, filename)
        # Explicit UTF-8: the platform default codec can fail on non-ASCII text.
        with open(filepath, encoding='utf-8') as file:
            for line in file:
                line = line.strip()
                if not line:
                    # A blank line is not a record; skip it instead of reporting a decode error.
                    continue
                try:
                    merged_data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Error al decodificar JSON en el archivo {filename}: {str(e)}")

    return pd.DataFrame(merged_data)

# Load the site metadata from its folder into df_sitios
folder_path = 'data/google/metadata-sitios'
df_sitios= merge_json_files(folder_path)

In [133]:
# Display the raw site metadata.
df_sitios

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,Porter Pharmacy,"Porter Pharmacy, 129 N Second St, Cochran, GA ...",0x88f16e41928ff687:0x883dad4fd048e8f8,,32.388300,-83.357100,[Pharmacy],4.9,16,,"[[Friday, 8AM–6PM], [Saturday, 8AM–12PM], [Sun...","{'Service options': ['In-store shopping', 'Sam...",Open ⋅ Closes 6PM,"[0x88f16e41929435cf:0x5b2532a2885e9ef6, 0x88f1...",https://www.google.com/maps/place//data=!4m2!3...
1,City Textile,"City Textile, 3001 E Pico Blvd, Los Angeles, C...",0x80c2c98c0e3c16fd:0x29ec8a728764fdf9,,34.018891,-118.215290,[Textile exporter],4.5,6,,,,Open now,"[0x80c2c624136ea88b:0xb0315367ed448771, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
2,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,,34.058092,-118.292130,[Korean restaurant],4.4,18,,"[[Thursday, 6:30AM–6PM], [Friday, 6:30AM–6PM],...","{'Service options': ['Takeout', 'Dine-in', 'De...",Open ⋅ Closes 6PM,"[0x80c2c78249aba68f:0x35bf16ce61be751d, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
3,Nova Fabrics,"Nova Fabrics, 2200 E 11th St, Los Angeles, CA ...",0x80c2c89923b27a41:0x32041559418d447,,34.023669,-118.232930,[Fabric store],3.3,6,,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...","{'Service options': ['In-store shopping'], 'Pa...",Open ⋅ Closes 5PM,"[0x80c2c8811477253f:0x23a8a492df1918f7, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
4,Nobel Textile Co,"Nobel Textile Co, 719 E 9th St, Los Angeles, C...",0x80c2c632f933b073:0xc31785961fe826a6,,34.036694,-118.249421,[Fabric store],4.3,7,,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...",{'Service options': ['In-store pickup']},Open ⋅ Closes 5PM,"[0x80c2c62c496083d1:0xdefa11317fe870a1, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099999,Farmers Branch Shopping Center,"Farmers Branch Shopping Center, 12895 Josey Ln...",0x864c27a6047a10e1:0xcd332f4713cf5a6a,,32.923175,-96.882246,[Shopping mall],4.0,424,,,"{'Service options': ['In-store shopping'], 'Ac...",,"[0x864c20e52627ece5:0xc7284cab5500e6a5, 0x864c...",https://www.google.com/maps/place//data=!4m2!3...
1100000,Domino's Pizza,"Domino's Pizza, 9432 Natural Bridge Rd, Berkel...",0x87df34327dfbb649:0x7dc858dab8b30533,Delivery/carryout chain offering a wide range ...,38.725495,-90.339572,"[Pizza delivery, Delivery Restaurant, Takeout ...",3.1,268,$,"[[Friday, 10:30AM–1AM], [Saturday, 10:30AM–1AM...","{'Service options': ['Curbside pickup', 'No-co...",Open ⋅ Closes 1AM,"[0x87df35c8677a573f:0x3e7434f76833545d, 0x87df...",https://www.google.com/maps/place//data=!4m2!3...
1100001,Bullseye Shooting Range,"Bullseye Shooting Range, 1455 N Terrace Dr, Wi...",0x87bae320f4d7a9d1:0x24b5e3218e1d1549,,37.709828,-97.284202,"[Shooting range, Gun shop]",4.5,194,,"[[Friday, 10AM–7PM], [Saturday, 10AM–6PM], [Su...",{'Accessibility': ['Wheelchair-accessible car ...,Closed ⋅ Opens 10AM Sat,"[0x87bae1127f210379:0x8df4dfd92ed2469, 0x87bae...",https://www.google.com/maps/place//data=!4m2!3...
1100002,Cha Spa,"Cha Spa, 2817 Main St, Santa Monica, CA 90405",0x80c2bad224faa469:0x1d564aa010920e46,,33.999850,-118.481183,"[Day spa, Facial spa, Massage therapist, Nail ...",3.7,24,,"[[Friday, 10AM–10PM], [Saturday, 10AM–10PM], [...",{'Accessibility': ['Wheelchair accessible entr...,Open ⋅ Closes 10PM,"[0x80c2bad235832921:0xb12e61cb436a044c, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...


Verifico la existencia de nulos y tipos de datos

In [134]:
# Report the Python types and the null share/count of every column (project helper module `herramientas`).
herramientas.verifica_tipo_y_nulos(df_sitios)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,name,"[<class 'str'>, <class 'NoneType'>]",100.0,0.0,15
1,address,"[<class 'str'>, <class 'NoneType'>]",97.3,2.7,29672
2,gmap_id,[<class 'str'>],100.0,0.0,0
3,description,"[<class 'NoneType'>, <class 'str'>]",7.99,92.01,1012128
4,latitude,[<class 'float'>],100.0,0.0,0
5,longitude,[<class 'float'>],100.0,0.0,0
6,category,"[<class 'list'>, <class 'NoneType'>]",99.41,0.59,6446
7,avg_rating,[<class 'float'>],100.0,0.0,0
8,num_of_reviews,[<class 'int'>],100.0,0.0,0
9,price,"[<class 'NoneType'>, <class 'str'>]",8.63,91.37,1005075


- ***Filtramos el metadata de sitios, con el estado de California***

In [135]:
# Keep only the site-metadata rows whose 'gmap_id' appears in the California reviews.
# .assign overwrites the existing 'state' column (which held opening-status text such as
# "Open ⋅ Closes 6PM") with the US state name.
df_sitios_california = (
    df_sitios
    .loc[df_sitios['gmap_id'].isin(df_california_concat['gmap_id'])]
    .assign(state='California')
)

# Show the filtered frame
df_sitios_california

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
2,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,,34.058092,-118.292130,[Korean restaurant],4.4,18,,"[[Thursday, 6:30AM–6PM], [Friday, 6:30AM–6PM],...","{'Service options': ['Takeout', 'Dine-in', 'De...",California,"[0x80c2c78249aba68f:0x35bf16ce61be751d, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
4,Nobel Textile Co,"Nobel Textile Co, 719 E 9th St, Los Angeles, C...",0x80c2c632f933b073:0xc31785961fe826a6,,34.036694,-118.249421,[Fabric store],4.3,7,,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...",{'Service options': ['In-store pickup']},California,"[0x80c2c62c496083d1:0xdefa11317fe870a1, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
5,Matrix International Textiles,"Matrix International Textiles, 1363 S Bonnie B...",0x80c2cf163db6bc89:0x219484e2edbcfa41,,34.015505,-118.181839,[Fabric store],3.5,6,,"[[Thursday, 8:30AM–5:30PM], [Friday, 8:30AM–5:...",{'Accessibility': ['Wheelchair accessible entr...,California,"[0x80c2cf042a5d9561:0xd0024ad6f81f1335, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
6,Vons Chicken,"Vons Chicken, 12740 La Mirada Blvd, La Mirada,...",0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,,33.916402,-118.010855,[Restaurant],4.5,18,,"[[Thursday, 11AM–9:30PM], [Friday, 11AM–9:30PM...","{'Service options': ['Outdoor seating', 'Curbs...",California,,https://www.google.com/maps/place//data=!4m2!3...
13,Black Tie Ski Rental Delivery of Mammoth,"Black Tie Ski Rental Delivery of Mammoth, 501 ...",0x80960c29f2e3bf29:0x4b291f0d275a5699,,37.638754,-118.966055,"[Ski rental service, Snowboard rental service]",5.0,34,,"[[Thursday, 8AM–5PM], [Friday, 8AM–5PM], [Satu...",{'Accessibility': ['Wheelchair accessible entr...,California,"[0x80960dcd6ba76731:0x9a6875ced2f9228e, 0x8096...",https://www.google.com/maps/place//data=!4m2!3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099934,Luna's Cafe,"Luna's Cafe, 917 S Sycamore St, Santa Ana, CA ...",0x80dcd91a30eeb005:0xda3db9bb93ef4937,,33.734955,-117.868623,[Cafe],4.5,38,,"[[Friday, 9AM–6PM], [Saturday, 9AM–6PM], [Sund...","{'Service options': ['Takeout', 'Dine-in', 'De...",California,"[0x80dcd8dc37f120d5:0xf0ba6c061211747e, 0x80dc...",https://www.google.com/maps/place//data=!4m2!3...
1099937,Rite Aid,"Rite Aid, 111 N Main St, Santa Ana, CA 92701",0x80dcd9061788585b:0x7cd0f908a7ac6ebc,,33.745985,-117.867039,"[Drug store, Beauty supply store, Convenience ...",2.7,45,$$,"[[Friday, 7AM–10PM], [Saturday, 7AM–10PM], [Su...","{'Service options': ['Online care', 'Delivery'...",California,"[0x80dcd90614ba5369:0xca770ba6d6747ad1, 0x80dc...",https://www.google.com/maps/place//data=!4m2!3...
1099939,Rite Aid,"Rite Aid, 1610 San Miguel Dr, Newport Beach, C...",0x80dce09192e6cb4b:0xbf8840399a120ac,,33.611160,-117.864227,"[Drug store, Beauty supply store, Convenience ...",2.8,15,$$,"[[Friday, 7AM–10PM], [Saturday, 7AM–10PM], [Su...","{'Service options': ['Online care', 'Delivery'...",California,"[0x80dce0f1292ab233:0xed63f9e051803b03, 0x80dc...",https://www.google.com/maps/place//data=!4m2!3...
1099942,Foot Locker,"Foot Locker, 3451 S Dogwood Rd Space 1464, El ...",0x80d7668dc9ac1f59:0x56084e12bcd29e45,"Retail chain with brand-name athletic shoes, c...",32.762143,-115.530845,"[Shoe store, Clothing store, Fashion accessori...",4.5,47,$$,"[[Friday, 11AM–8PM], [Saturday, 11AM–8PM], [Su...","{'Service options': ['In-store pickup', 'In-st...",California,"[0x80d7668dc6a3d5e5:0xd0ed7fdcca64668b, 0x80d7...",https://www.google.com/maps/place//data=!4m2!3...


Verificamos la existencia de valores nulos y tipo de datos

In [136]:
# Report the Python types and the null share/count of every column (project helper module `herramientas`).
herramientas.verifica_tipo_y_nulos(df_sitios_california)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,name,"[<class 'str'>, <class 'NoneType'>]",100.0,0.0,1
1,address,"[<class 'str'>, <class 'NoneType'>]",99.43,0.57,174
2,gmap_id,[<class 'str'>],100.0,0.0,0
3,description,"[<class 'NoneType'>, <class 'str'>]",14.82,85.18,26187
4,latitude,[<class 'float'>],100.0,0.0,0
5,longitude,[<class 'float'>],100.0,0.0,0
6,category,"[<class 'list'>, <class 'NoneType'>]",99.88,0.12,36
7,avg_rating,[<class 'float'>],100.0,0.0,0
8,num_of_reviews,[<class 'int'>],100.0,0.0,0
9,price,"[<class 'NoneType'>, <class 'str'>]",14.24,85.76,26366


Eliminamos los nulos de la columna 'category'

In [137]:
# Remove the rows whose 'category' is null (36 rows according to the preceding check).
df_sitios_california = df_sitios_california.dropna(subset=['category'])

Verificamos los cambios realizados anteriormente

In [138]:
# Re-run the type/null report to confirm that 'category' no longer contains nulls.
herramientas.verifica_tipo_y_nulos(df_sitios_california)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,name,"[<class 'str'>, <class 'NoneType'>]",100.0,0.0,1
1,address,"[<class 'str'>, <class 'NoneType'>]",99.43,0.57,174
2,gmap_id,[<class 'str'>],100.0,0.0,0
3,description,"[<class 'NoneType'>, <class 'str'>]",14.81,85.19,26159
4,latitude,[<class 'float'>],100.0,0.0,0
5,longitude,[<class 'float'>],100.0,0.0,0
6,category,[<class 'list'>],100.0,0.0,0
7,avg_rating,[<class 'float'>],100.0,0.0,0
8,num_of_reviews,[<class 'int'>],100.0,0.0,0
9,price,"[<class 'NoneType'>, <class 'str'>]",14.21,85.79,26343


- ***Filtramos el metadata de sitios, con el estado de Texas***

In [139]:
# Keep only the site-metadata rows whose 'gmap_id' appears in the Texas reviews.
# .assign overwrites the existing 'state' column (which held opening-status text such as
# "Open ⋅ Closes 6PM") with the US state name.
df_sitios_texas = (
    df_sitios
    .loc[df_sitios['gmap_id'].isin(df_texas_concat['gmap_id'])]
    .assign(state='Texas')
)

# Show the filtered frame
df_sitios_texas

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
35,Walmart Pharmacy,"Walmart Pharmacy, 12220 FM 423, Frisco, TX 75033",0x864c3998b8d8dc83:0x57ffabe1e2322320,,33.179867,-96.883691,"[Pharmacy, Drug store, Medical supply store, V...",3.3,24,$,"[[Thursday, 9AM–9PM], [Friday, 9AM–9PM], [Satu...","{'Service options': ['Curbside pickup', 'In-st...",Texas,"[0x864c3999b29e291f:0x2d364c05e88eec13, 0x864c...",https://www.google.com/maps/place//data=!4m2!3...
39,Cricket Wireless Authorized Retailer,"Cricket Wireless Authorized Retailer, 2785 E E...",0x864c399793df5ee1:0x89754310c1a68fe,Wireless provider offering prepaid mobile phon...,33.178885,-96.889600,[Cell phone store],4.5,73,,"[[Thursday, 10AM–7PM], [Friday, 10AM–7PM], [Sa...","{'Service options': ['In-store shopping', 'Del...",Texas,"[0x864c3999f2047cab:0x65d5180e6920725a, 0x864c...",https://www.google.com/maps/place//data=!4m2!3...
99,Hunter's RV Park: Office,"Hunter's RV Park: Office, 605 S Main St, Highl...",0x863f589f655fa251:0xb329697cb9f39b66,,29.808913,-95.056543,[Campground],4.4,28,,,{'Accessibility': ['Wheelchair accessible entr...,Texas,"[0x863f58af62e1e3b1:0x78314375b1ceb16, 0x8640a...",https://www.google.com/maps/place//data=!4m2!3...
123,Golden Castle,"Golden Castle, 1906 E 12th St, Austin, TX 78702",0x8644b59b8fe872e5:0x5e638876caa84cc3,,30.273985,-97.719563,[Restaurant],4.5,8,,"[[Thursday, 5PM–12AM], [Friday, 5PM–12AM], [Sa...","{'Service options': ['Delivery', 'Takeout', 'D...",Texas,,https://www.google.com/maps/place//data=!4m2!3...
221,Women's Clinic of South Texas,"Women's Clinic of South Texas, 3001 N 23rd St,...",0x8665a6de4fed9c07:0xd8e859cdc30d7281,,26.233672,-98.241197,[Medical clinic],2.8,28,,"[[Thursday, 7:30AM–6:30PM], [Friday, 7:30AM–5:...",{'Accessibility': ['Wheelchair accessible entr...,Texas,"[0x8665a13e873178fb:0xeb3aa6a6cef5b458, 0x8665...",https://www.google.com/maps/place//data=!4m2!3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099886,Maserati of Austin,"Maserati of Austin, 12925 Pond Springs Rd, Aus...",0x8644cd26613306b9:0x4ad8e79d7ac54ccb,,30.439371,-97.771432,"[Maserati dealer, Car dealer, Car finance and ...",4.0,88,,"[[Friday, 9AM–6PM], [Saturday, 9AM–5PM], [Sund...","{'Service options': ['In-store shopping'], 'Ac...",Texas,"[0x8644cd29114b1191:0x415e0d2875419846, 0x8644...",https://www.google.com/maps/place//data=!4m2!3...
1099943,J & M Exhaust,"J & M Exhaust, 7411 Boulevard 26, North Richla...",0x864e785519c45361:0xa48c3a2ca3bec550,,32.827175,-97.224205,[Muffler shop],4.0,78,,"[[Friday, 9AM–5PM], [Saturday, 9AM–5PM], [Sund...","{'Service options': ['In-store shopping', 'Del...",Texas,"[0x864e769d516df8c9:0xb211a7192115962d, 0x864e...",https://www.google.com/maps/place//data=!4m2!3...
1099945,A-1 Muffler & Welding,"A-1 Muffler & Welding, 2130 Jacksboro Hwy, For...",0x864e747284b57d61:0xf80ed757e2bf54d4,,32.782700,-97.374804,"[Muffler shop, Auto repair shop, Metal fabrica...",4.8,46,,"[[Friday, 8AM–5PM], [Saturday, 9AM–3PM], [Sund...","{'Service options': ['In-store pickup', 'In-st...",Texas,"[0x864e769d516df8c9:0xb211a7192115962d, 0x864e...",https://www.google.com/maps/place//data=!4m2!3...
1099999,Farmers Branch Shopping Center,"Farmers Branch Shopping Center, 12895 Josey Ln...",0x864c27a6047a10e1:0xcd332f4713cf5a6a,,32.923175,-96.882246,[Shopping mall],4.0,424,,,"{'Service options': ['In-store shopping'], 'Ac...",Texas,"[0x864c20e52627ece5:0xc7284cab5500e6a5, 0x864c...",https://www.google.com/maps/place//data=!4m2!3...


Verificamos la existencia de valores nulos y tipo de datos

In [140]:
# Per-column summary for the Texas sites frame: the Python types found in each
# column plus the percentage and count of null values (project helper).
herramientas.verifica_tipo_y_nulos(df_sitios_texas)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,name,[<class 'str'>],100.0,0.0,0
1,address,"[<class 'str'>, <class 'NoneType'>]",98.74,1.26,371
2,gmap_id,[<class 'str'>],100.0,0.0,0
3,description,"[<class 'NoneType'>, <class 'str'>]",11.58,88.42,26029
4,latitude,[<class 'float'>],100.0,0.0,0
5,longitude,[<class 'float'>],100.0,0.0,0
6,category,"[<class 'list'>, <class 'NoneType'>]",99.89,0.11,31
7,avg_rating,[<class 'float'>],100.0,0.0,0
8,num_of_reviews,[<class 'int'>],100.0,0.0,0
9,price,"[<class 'str'>, <class 'NoneType'>]",10.97,89.03,26207


Eliminamos los nulos de la columna 'category'

In [141]:
# Keep only the Texas places that have a non-null 'category'
df_sitios_texas = df_sitios_texas.loc[
    df_sitios_texas['category'].notna()
]

Verificamos los cambios realizados anteriormente

In [142]:
# Re-run the type/null summary to confirm that 'category' no longer contains
# nulls in the Texas frame.
herramientas.verifica_tipo_y_nulos(df_sitios_texas)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,name,[<class 'str'>],100.0,0.0,0
1,address,"[<class 'str'>, <class 'NoneType'>]",98.74,1.26,371
2,gmap_id,[<class 'str'>],100.0,0.0,0
3,description,"[<class 'NoneType'>, <class 'str'>]",11.58,88.42,26001
4,latitude,[<class 'float'>],100.0,0.0,0
5,longitude,[<class 'float'>],100.0,0.0,0
6,category,[<class 'list'>],100.0,0.0,0
7,avg_rating,[<class 'float'>],100.0,0.0,0
8,num_of_reviews,[<class 'int'>],100.0,0.0,0
9,price,"[<class 'str'>, <class 'NoneType'>]",10.96,89.04,26182


- ***Filtramos los metadatos de sitios para el estado de New York***

In [143]:
# Select the rows of df_sitios whose 'gmap_id' also appears in the New York
# reviews and label every selected place with the state name.
# NOTE(review): the displayed frame shows 'state' between 'MISC' and
# 'relative_results', so the column seems to exist already in df_sitios and is
# overwritten here rather than appended — confirm.
df_sitios_new_york = (
    df_sitios
    .loc[df_sitios['gmap_id'].isin(df_new_york_concat['gmap_id'])]
    .assign(state='New York')
)

# Display the filtered frame
df_sitios_new_york

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
59,T-Mobile,"T-Mobile, 3923 103rd St, Queens, NY 11368",0x89c25fc9494dce47:0x6d63c807b59a55,,40.750146,-73.862536,"[Cell phone store, Electronic parts supplier, ...",3.5,95,$$,"[[Thursday, 10AM–8PM], [Friday, 10AM–8PM], [Sa...","{'Service options': ['In-store shopping', 'Del...",New York,"[0x89c25fc7a91c609f:0xb103d6a261373fd, 0x89c25...",https://www.google.com/maps/place//data=!4m2!3...
195,Gillespie Chevrolet Parts,"Gillespie Chevrolet Parts, 128 Cayuga St, Unio...",0x89d0ba60bb7d710f:0x877e749ac7f9304a,,42.840487,-76.692377,[Auto parts store],4.6,18,,"[[Thursday, 7:30AM–8PM], [Friday, 7:30AM–5PM],...","{'Service options': ['In-store shopping'], 'Ac...",New York,"[0x89d0ab78eb57aca9:0x3a78b7d92d2e3950, 0x89d0...",https://www.google.com/maps/place//data=!4m2!3...
328,Arcana Juventa Anti-Aging Spa,"Arcana Juventa Anti-Aging Spa, 14 Brighton 11t...",0x89c244688b7609cd:0x205e2d8d173dd35e,,40.582130,-73.956333,"[Spa, Waterproofing company]",4.1,15,,"[[Thursday, 10AM–8PM], [Friday, 10AM–8PM], [Sa...",,New York,"[0x89c2446bdf00743d:0x1a016c5e3c603c88, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...
334,Horizon Health Care Staffing,"Horizon Health Care Staffing, 101 Ellis St # M...",0x89c3b5606048fd35:0x372ca4b707b21213,,40.517670,-74.242485,[Employment agency],3.9,8,,,"{'Health & safety': ['Appointment required'], ...",New York,"[0x89c24e8123a04bdf:0x647442f3bc28496, 0x89c24...",https://www.google.com/maps/place//data=!4m2!3...
432,NYC Parking Manhattan Avenue. Garage Corporation.,NYC Parking Manhattan Avenue. Garage Corporati...,0x89c2f6146dd4b907:0x5a6b44a7f6b8a76c,,40.807305,-73.954636,[Parking garage],3.7,8,,"[[Thursday, Open 24 hours], [Friday, Open 24 h...",{'Accessibility': ['Wheelchair accessible entr...,New York,"[0x89c2f613135dfba3:0xe185cc5d2b02df0f, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099746,Creek-N-Wood RV Park and Campground,"Creek-N-Wood RV Park and Campground, 2530 Whee...",0x89d13ad30c7fcaff:0x3c01448f5da558dd,,42.906084,-77.380880,[Campground],4.5,58,,,{'Accessibility': ['Wheelchair accessible entr...,New York,"[0x89d13d78ddf709e5:0x80dc7df9525a6103, 0x873b...",https://www.google.com/maps/place//data=!4m2!3...
1099747,Absolute Automotive of Ny,"Absolute Automotive of Ny, 2310 Walworth-Mario...",0x89d6d41470c5dbf5:0x66461848781795d3,,43.138831,-77.268383,[Auto repair shop],4.9,45,,"[[Friday, 8AM–6PM], [Saturday, Closed], [Sunda...",{'Accessibility': ['Wheelchair accessible entr...,New York,"[0x89d6d413ac3b37c9:0xe8f4ed0862cd4c0a, 0x89d6...",https://www.google.com/maps/place//data=!4m2!3...
1099751,Michael Prouty Memorial Park,"Michael Prouty Memorial Park, 2402-2474 NY-65,...",0x89d1388d4acc026f:0x30a872bfd0a4679d,,42.908991,-77.539035,[Park],4.1,16,,"[[Friday, 8AM–6PM], [Saturday, 8AM–6PM], [Sund...",{'Accessibility': ['Wheelchair accessible entr...,New York,"[0x89d13ba56ebef6a1:0x196f02728d220a25, 0x89d1...",https://www.google.com/maps/place//data=!4m2!3...
1099848,Cafe Booqoo,"Cafe Booqoo, 478 Smith St, Brooklyn, NY 11231",0x89c25af6e18e56a5:0x9b38fce3dbd88098,,40.674726,-73.997918,"[Creole restaurant, Breakfast restaurant, Cafe...",4.5,68,,"[[Friday, 11AM–8PM], [Saturday, 9AM–4PM], [Sun...","{'Service options': ['Takeout', 'Delivery'], '...",New York,"[0x89c25a4e53443397:0xecc6f06f74f11410, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...


Verificamos la existencia de valores nulos y tipo de datos

In [144]:
# Per-column summary for the New York sites frame: the Python types found in
# each column plus the percentage and count of null values (project helper).
herramientas.verifica_tipo_y_nulos(df_sitios_new_york)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,name,[<class 'str'>],100.0,0.0,0
1,address,"[<class 'str'>, <class 'NoneType'>]",99.61,0.39,100
2,gmap_id,[<class 'str'>],100.0,0.0,0
3,description,"[<class 'NoneType'>, <class 'str'>]",15.47,84.53,21824
4,latitude,[<class 'float'>],100.0,0.0,0
5,longitude,[<class 'float'>],100.0,0.0,0
6,category,"[<class 'list'>, <class 'NoneType'>]",99.94,0.06,16
7,avg_rating,[<class 'float'>],100.0,0.0,0
8,num_of_reviews,[<class 'int'>],100.0,0.0,0
9,price,"[<class 'str'>, <class 'NoneType'>]",18.58,81.42,21021


Eliminamos los nulos de la columna 'category'

In [145]:
# Keep only the New York places that have a non-null 'category'
df_sitios_new_york = df_sitios_new_york.loc[
    df_sitios_new_york['category'].notna()
]

Verificamos los cambios realizados anteriormente

In [146]:
# Re-run the type/null summary to confirm that 'category' no longer contains
# nulls in the New York frame.
herramientas.verifica_tipo_y_nulos(df_sitios_new_york)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,name,[<class 'str'>],100.0,0.0,0
1,address,"[<class 'str'>, <class 'NoneType'>]",99.61,0.39,100
2,gmap_id,[<class 'str'>],100.0,0.0,0
3,description,"[<class 'NoneType'>, <class 'str'>]",15.46,84.54,21812
4,latitude,[<class 'float'>],100.0,0.0,0
5,longitude,[<class 'float'>],100.0,0.0,0
6,category,[<class 'list'>],100.0,0.0,0
7,avg_rating,[<class 'float'>],100.0,0.0,0
8,num_of_reviews,[<class 'int'>],100.0,0.0,0
9,price,"[<class 'str'>, <class 'NoneType'>]",18.58,81.42,21009


- ***Filtramos los metadatos de sitios para el estado de Florida***

In [147]:
# Select the rows of df_sitios whose 'gmap_id' also appears in the Florida
# reviews and label every selected place with the state name.
# NOTE(review): the displayed frame shows 'state' between 'MISC' and
# 'relative_results', so the column seems to exist already in df_sitios and is
# overwritten here rather than appended — confirm.
df_sitios_florida = (
    df_sitios
    .loc[df_sitios['gmap_id'].isin(df_florida_concat['gmap_id'])]
    .assign(state='Florida')
)

# Display the filtered frame
df_sitios_florida

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
105,"Brian Shaheen, MD","Brian Shaheen, MD, 2421 Thomas Dr, Panama City...",0x8893863ea87bd5dd:0x9383ebf973e74abb,,30.159982,-85.752277,"[Family practice physician, General practitioner]",4.2,18,,"[[Thursday, 8AM–5PM], [Friday, 8AM–5PM], [Satu...","{'Service options': ['Online care'], 'Accessib...",Florida,"[0x88938ebfbd53f9c5:0xf6e52004f37523c8, 0x8893...",https://www.google.com/maps/place//data=!4m2!3...
117,Mail Station,"Mail Station, 8466 Lockwood Ridge Rd, Sarasota...",0x88c33f1c8e1f99a3:0x99b22c5bd258b3a7,,27.389683,-82.508387,"[Mailing service, Courier service, Fax service...",3.4,28,,"[[Thursday, 8AM–5PM], [Friday, 8AM–5PM], [Satu...",{'Accessibility': ['Wheelchair accessible entr...,Florida,"[0x88c3475a64b98c03:0xe8de19a19dae1b2c, 0x88c3...",https://www.google.com/maps/place//data=!4m2!3...
118,APC Pediatrics,"APC Pediatrics, 5255 Office Park Blvd 110 & 11...",0x88c33c2935058dd3:0xb888c702ecf5aef6,,27.448302,-82.510850,"[Pediatrician, Children's hospital, Doctor]",4.2,34,,"[[Thursday, 8AM–5PM], [Friday, 8AM–5PM], [Satu...","{'Service options': ['Online care'], 'Health &...",Florida,"[0x88c33e113c201abb:0x49ef8b29737f8773, 0x88c3...",https://www.google.com/maps/place//data=!4m2!3...
119,Infinia Accounting,"Infinia Accounting, 555 W. Granada Blvd Ste Bl...",0x88d908b8c02dc3bb:0xea94b1fb47b68ef6,,29.280410,-81.071481,"[Tax preparation service, Accountant, Payroll ...",5.0,13,,"[[Thursday, 10AM–5PM], [Friday, 10AM–4PM], [Sa...",{'Accessibility': ['Wheelchair accessible entr...,Florida,"[0x88e6dc9fd184eed9:0x81252448cc728aa8, 0x88e6...",https://www.google.com/maps/place//data=!4m2!3...
120,Baker Distributing Company,"Baker Distributing Company, 7810 25th Ct E #11...",0x88c33fb303dc8e69:0x3485e731f1a704f8,,27.401269,-82.532346,"[Air conditioning system supplier, Air filter ...",4.3,8,,"[[Thursday, 7:30AM–5PM], [Friday, 7:30AM–5PM],...",{'Accessibility': ['Wheelchair accessible entr...,Florida,"[0x88c33e46cf5434ad:0x8772f525f32cc2d2, 0x88c3...",https://www.google.com/maps/place//data=!4m2!3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099837,Smoke Shop,"Smoke Shop, 5275 Red Bug Lake Rd #125, Winter ...",0x88e76eeb288b3547:0x1a15780ed1dd82e3,,28.647043,-81.276647,"[Tobacco shop, Cigar shop]",4.4,78,,"[[Friday, 11AM–9:30PM], [Saturday, 11AM–9:30PM...","{'Service options': ['In-store shopping', 'Del...",Florida,"[0x88e76e3c25fc6b5d:0x43d6684d268ada2e, 0x88e7...",https://www.google.com/maps/place//data=!4m2!3...
1099838,Venci's Barber Shop,"Venci's Barber Shop, 1325 W Broadway St, Ovied...",0x88e76961865270e9:0xbe67cb393b44261f,,28.665168,-81.224895,[Barber shop],4.8,64,,"[[Friday, 8:30AM–3PM], [Saturday, 8AM–2PM], [S...",{'Accessibility': ['Wheelchair accessible entr...,Florida,"[0x88e769f68d97c221:0xdc735caf0e5c146f, 0x88e7...",https://www.google.com/maps/place//data=!4m2!3...
1099839,Oviedo Little League,"Oviedo Little League, 275 King St, Oviedo, FL ...",0x88e769038bdd946d:0x915dab8e50516a3d,,28.672274,-81.215889,"[Little league club, Sports complex]",4.4,48,,"[[Friday, 5–9PM], [Saturday, 8:30AM–3PM], [Sun...",{'Accessibility': ['Wheelchair accessible entr...,Florida,"[0x88e76bd4d42d834f:0xf0d38820c232deb1, 0x88e7...",https://www.google.com/maps/place//data=!4m2!3...
1099841,CosmoProf,"CosmoProf, 4257 W Lake Mary Blvd #10, Lake Mar...",0x88e772bad61b7dd3:0x3f42bbf4a63267b0,,28.754376,-81.351337,[Beauty supply store],4.4,37,,"[[Friday, 8AM–6PM], [Saturday, 8AM–4PM], [Sund...","{'Service options': ['Curbside pickup', 'Deliv...",Florida,"[0x88e772b90fe0a9a1:0x3428036853668172, 0x88e7...",https://www.google.com/maps/place//data=!4m2!3...


Verificamos la existencia de valores nulos y tipo de datos

In [148]:
# Per-column summary for the Florida sites frame: the Python types found in
# each column plus the percentage and count of null values (project helper).
# NOTE(review): unlike the other states, no per-state removal of null
# 'category' rows follows this check in the notebook.
herramientas.verifica_tipo_y_nulos(df_sitios_florida)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,name,[<class 'str'>],100.0,0.0,0
1,address,"[<class 'str'>, <class 'NoneType'>]",98.13,1.87,417
2,gmap_id,[<class 'str'>],100.0,0.0,0
3,description,"[<class 'NoneType'>, <class 'str'>]",11.03,88.97,19790
4,latitude,[<class 'float'>],100.0,0.0,0
5,longitude,[<class 'float'>],100.0,0.0,0
6,category,"[<class 'list'>, <class 'NoneType'>]",99.95,0.05,12
7,avg_rating,[<class 'float'>],100.0,0.0,0
8,num_of_reviews,[<class 'int'>],100.0,0.0,0
9,price,"[<class 'NoneType'>, <class 'str'>]",10.38,89.62,19935


- ***Filtramos los metadatos de sitios para el estado de Illinois***

In [149]:
# Select the rows of df_sitios whose 'gmap_id' also appears in the Illinois
# reviews and label every selected place with the state name.
# NOTE(review): the displayed frame shows 'state' between 'MISC' and
# 'relative_results', so the column seems to exist already in df_sitios and is
# overwritten here rather than appended — confirm.
df_sitios_illinois = (
    df_sitios
    .loc[df_sitios['gmap_id'].isin(df_illinois_concat['gmap_id'])]
    .assign(state='Illinois')
)

# Display the filtered frame
df_sitios_illinois

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
124,The Toy House,"The Toy House, 10 E Main St #105, East Dundee,...",0x880f0f1b32155555:0x8347e0c971acb955,,42.098159,-88.275237,[Toy store],4.9,17,,"[[Thursday, 10AM–8PM], [Friday, 10AM–8PM], [Sa...",{'Accessibility': ['Wheelchair accessible entr...,Illinois,"[0x880f72a26e93c371:0xed11dd45ac49ed11, 0x880f...",https://www.google.com/maps/place//data=!4m2!3...
125,ROYAL LIQUOR,"ROYAL LIQUOR, 26W211 Geneva Rd, Wheaton, IL 60187",0x880e5523024703c1:0xb93a8ccb6918d616,,41.887341,-88.136456,[Liquor store],3.6,8,,"[[Thursday, 10AM–10PM], [Friday, 10AM–10:30PM]...","{'Service options': ['In-store shopping', 'Del...",Illinois,"[0x880e55410d7a8187:0x9863102d1658ecf2, 0x880e...",https://www.google.com/maps/place//data=!4m2!3...
135,"Newton, IL Chizevsky Field","Newton, IL Chizevsky Field, W Decatur St, Newt...",0x8873b548b16a932f:0xaa86fda70cf3b195,,38.989536,-88.177085,[Football Field],4.4,17,,,{'Accessibility': ['Wheelchair accessible entr...,Illinois,,https://www.google.com/maps/place//data=!4m2!3...
138,Camp Walter Scott,"Camp Walter Scott, 15290 E 300th Ave, Dieteric...",0x8873eb206dd04879:0xb303a76f2c682e4,,38.959944,-88.519025,[Conference center],4.9,14,,,{'Accessibility': ['Wheelchair accessible entr...,Illinois,"[0x8873bbdf9a504c3d:0x1cf279640439c21f, 0x8873...",https://www.google.com/maps/place//data=!4m2!3...
526,Honda Service Center,"Honda Service Center, 1111 N Clark St #2, Chic...",0x880fd3ee7195c1bd:0x84d7c59622116e4b,,41.901940,-87.630926,"[Auto repair shop, Auto air conditioning servi...",3.3,35,,"[[Thursday, 6AM–10PM], [Friday, 6AM–10PM], [Sa...",{'Accessibility': ['Wheelchair accessible entr...,Illinois,"[0x880fd1dba71b87cb:0xb0a40d8eef20c31d, 0x880e...",https://www.google.com/maps/place//data=!4m2!3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099759,Dry Goods,"Dry Goods, 524 Oakbrook Center, Oak Brook, IL ...",0x880e4c42f77dc4f9:0x8056e7f0f5a037f7,,41.851432,-87.952389,"[Women's clothing store, Clothing store, Costu...",4.5,26,,"[[Friday, 11AM–8PM], [Saturday, 11AM–8PM], [Su...","{'Service options': ['In-store shopping', 'Del...",Illinois,"[0x880e4c6805240753:0xe2dcd383e27e1400, 0x880e...",https://www.google.com/maps/place//data=!4m2!3...
1099779,Williams Automotive,"Williams Automotive, 1800 Vermont St, Blue Isl...",0x880e237d7e673299:0x17c1c978afdfc140,,41.657216,-87.665749,[Auto repair shop],4.7,64,,"[[Friday, 8AM–6PM], [Saturday, Closed], [Sunda...",{'Accessibility': ['Wheelchair accessible entr...,Illinois,"[0x880e24a05b46fb4d:0xccd7e1524533bb7b, 0x880e...",https://www.google.com/maps/place//data=!4m2!3...
1099896,"Doctors of Physical Therapy, merged with Colle...","Doctors of Physical Therapy, merged with Colle...",0x880f813908127c51:0xee18719d922d9a92,,42.479510,-88.102355,"[Physical therapist, Physical fitness program,...",4.9,93,,"[[Friday, 5:30AM–4PM], [Saturday, Closed], [Su...",{'Accessibility': ['Wheelchair accessible entr...,Illinois,"[0x880f814bf357e987:0xbfb33e5d1ab83581, 0x880f...",https://www.google.com/maps/place//data=!4m2!3...
1099899,K B CITGO,"K B CITGO, 41082 IL-83, Antioch, IL 60002",0x880f8154f5614b71:0x5e62e7a848c5fea6,,42.460434,-88.092262,[Gas station],3.9,28,,"[[Friday, 5AM–10PM], [Saturday, 6AM–10PM], [Su...",,Illinois,"[0x880f8268253f80c9:0x73b9c044d10cd795, 0x880f...",https://www.google.com/maps/place//data=!4m2!3...


Verificamos la existencia de valores nulos y tipo de datos

In [150]:
# Per-column summary for the Illinois sites frame: the Python types found in
# each column plus the percentage and count of null values (project helper).
herramientas.verifica_tipo_y_nulos(df_sitios_illinois)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,name,[<class 'str'>],100.0,0.0,0
1,address,"[<class 'str'>, <class 'NoneType'>]",99.48,0.52,86
2,gmap_id,[<class 'str'>],100.0,0.0,0
3,description,"[<class 'NoneType'>, <class 'str'>]",16.46,83.54,13711
4,latitude,[<class 'float'>],100.0,0.0,0
5,longitude,[<class 'float'>],100.0,0.0,0
6,category,"[<class 'list'>, <class 'NoneType'>]",99.92,0.08,13
7,avg_rating,[<class 'float'>],100.0,0.0,0
8,num_of_reviews,[<class 'int'>],100.0,0.0,0
9,price,"[<class 'NoneType'>, <class 'str'>]",17.64,82.36,13517


Eliminamos los nulos de la columna 'category'

In [151]:
# Keep only the Illinois places that have a non-null 'category'
df_sitios_illinois = df_sitios_illinois.loc[
    df_sitios_illinois['category'].notna()
]

Verificamos los cambios realizados anteriormente

In [152]:
# Re-run the type/null summary to confirm that 'category' no longer contains
# nulls in the Illinois frame.
herramientas.verifica_tipo_y_nulos(df_sitios_illinois)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,name,[<class 'str'>],100.0,0.0,0
1,address,"[<class 'str'>, <class 'NoneType'>]",99.48,0.52,86
2,gmap_id,[<class 'str'>],100.0,0.0,0
3,description,"[<class 'NoneType'>, <class 'str'>]",16.47,83.53,13699
4,latitude,[<class 'float'>],100.0,0.0,0
5,longitude,[<class 'float'>],100.0,0.0,0
6,category,[<class 'list'>],100.0,0.0,0
7,avg_rating,[<class 'float'>],100.0,0.0,0
8,num_of_reviews,[<class 'int'>],100.0,0.0,0
9,price,"[<class 'NoneType'>, <class 'str'>]",17.66,82.34,13504


Concatenamos todos los estados

In [153]:
# Per-state site frames, listed in the order in which they are stacked
df_sitios_concat = [
    df_sitios_california,
    df_sitios_illinois,
    df_sitios_florida,
    df_sitios_texas,
    df_sitios_new_york,
]

# Stack the frames row-wise; every row keeps the index label it had in its
# per-state frame (the index is not renumbered).
df_sitios_estados = pd.concat(df_sitios_concat, axis=0, ignore_index=False)

# Display the combined frame
df_sitios_estados

Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
2,San Soo Dang,"San Soo Dang, 761 S Vermont Ave, Los Angeles, ...",0x80c2c778e3b73d33:0xbdc58662a4a97d49,,34.058092,-118.292130,[Korean restaurant],4.4,18,,"[[Thursday, 6:30AM–6PM], [Friday, 6:30AM–6PM],...","{'Service options': ['Takeout', 'Dine-in', 'De...",California,"[0x80c2c78249aba68f:0x35bf16ce61be751d, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
4,Nobel Textile Co,"Nobel Textile Co, 719 E 9th St, Los Angeles, C...",0x80c2c632f933b073:0xc31785961fe826a6,,34.036694,-118.249421,[Fabric store],4.3,7,,"[[Thursday, 9AM–5PM], [Friday, 9AM–5PM], [Satu...",{'Service options': ['In-store pickup']},California,"[0x80c2c62c496083d1:0xdefa11317fe870a1, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
5,Matrix International Textiles,"Matrix International Textiles, 1363 S Bonnie B...",0x80c2cf163db6bc89:0x219484e2edbcfa41,,34.015505,-118.181839,[Fabric store],3.5,6,,"[[Thursday, 8:30AM–5:30PM], [Friday, 8:30AM–5:...",{'Accessibility': ['Wheelchair accessible entr...,California,"[0x80c2cf042a5d9561:0xd0024ad6f81f1335, 0x80c2...",https://www.google.com/maps/place//data=!4m2!3...
6,Vons Chicken,"Vons Chicken, 12740 La Mirada Blvd, La Mirada,...",0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,,33.916402,-118.010855,[Restaurant],4.5,18,,"[[Thursday, 11AM–9:30PM], [Friday, 11AM–9:30PM...","{'Service options': ['Outdoor seating', 'Curbs...",California,,https://www.google.com/maps/place//data=!4m2!3...
13,Black Tie Ski Rental Delivery of Mammoth,"Black Tie Ski Rental Delivery of Mammoth, 501 ...",0x80960c29f2e3bf29:0x4b291f0d275a5699,,37.638754,-118.966055,"[Ski rental service, Snowboard rental service]",5.0,34,,"[[Thursday, 8AM–5PM], [Friday, 8AM–5PM], [Satu...",{'Accessibility': ['Wheelchair accessible entr...,California,"[0x80960dcd6ba76731:0x9a6875ced2f9228e, 0x8096...",https://www.google.com/maps/place//data=!4m2!3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099746,Creek-N-Wood RV Park and Campground,"Creek-N-Wood RV Park and Campground, 2530 Whee...",0x89d13ad30c7fcaff:0x3c01448f5da558dd,,42.906084,-77.380880,[Campground],4.5,58,,,{'Accessibility': ['Wheelchair accessible entr...,New York,"[0x89d13d78ddf709e5:0x80dc7df9525a6103, 0x873b...",https://www.google.com/maps/place//data=!4m2!3...
1099747,Absolute Automotive of Ny,"Absolute Automotive of Ny, 2310 Walworth-Mario...",0x89d6d41470c5dbf5:0x66461848781795d3,,43.138831,-77.268383,[Auto repair shop],4.9,45,,"[[Friday, 8AM–6PM], [Saturday, Closed], [Sunda...",{'Accessibility': ['Wheelchair accessible entr...,New York,"[0x89d6d413ac3b37c9:0xe8f4ed0862cd4c0a, 0x89d6...",https://www.google.com/maps/place//data=!4m2!3...
1099751,Michael Prouty Memorial Park,"Michael Prouty Memorial Park, 2402-2474 NY-65,...",0x89d1388d4acc026f:0x30a872bfd0a4679d,,42.908991,-77.539035,[Park],4.1,16,,"[[Friday, 8AM–6PM], [Saturday, 8AM–6PM], [Sund...",{'Accessibility': ['Wheelchair accessible entr...,New York,"[0x89d13ba56ebef6a1:0x196f02728d220a25, 0x89d1...",https://www.google.com/maps/place//data=!4m2!3...
1099848,Cafe Booqoo,"Cafe Booqoo, 478 Smith St, Brooklyn, NY 11231",0x89c25af6e18e56a5:0x9b38fce3dbd88098,,40.674726,-73.997918,"[Creole restaurant, Breakfast restaurant, Cafe...",4.5,68,,"[[Friday, 11AM–8PM], [Saturday, 9AM–4PM], [Sun...","{'Service options': ['Takeout', 'Delivery'], '...",New York,"[0x89c25a4e53443397:0xecc6f06f74f11410, 0x89c2...",https://www.google.com/maps/place//data=!4m2!3...


Eliminamos las columnas 'description' y 'price'

In [154]:
# Drop 'description' and 'price': both are more than 80% null in every
# per-state summary shown earlier in the notebook.
# errors='ignore' keeps the cell idempotent: because the result is assigned back
# to the same name, a second execution would otherwise raise KeyError once the
# columns are already gone.
df_sitios_estados = df_sitios_estados.drop(
    columns=['description', 'price'],
    errors='ignore',
)

Elimino los valores nulos de la columna 'category'

In [155]:
# Keep only the rows of the combined frame that have a non-null 'category'.
# This also covers the Florida rows, which had no per-state filtering cell.
df_sitios_estados = df_sitios_estados.loc[
    df_sitios_estados['category'].notna()
]

Elimino los valores nulos de la columna 'name'

In [156]:
# Keep only the rows of the combined frame that have a non-null 'name'
# (the California summary reported one null name).
df_sitios_estados = df_sitios_estados.loc[
    df_sitios_estados['name'].notna()
]

Verificamos valores nulos y tipo de datos

In [157]:
# Per-column type/null summary of the combined frame after dropping
# 'description' and 'price' and removing rows with null 'category' or 'name'.
herramientas.verifica_tipo_y_nulos(df_sitios_estados)

Unnamed: 0,nombre_campo,tipo_datos,no_nulos_%,nulos_%,nulos
0,name,[<class 'str'>],100.0,0.0,0
1,address,"[<class 'str'>, <class 'NoneType'>]",99.08,0.92,1147
2,gmap_id,[<class 'str'>],100.0,0.0,0
3,latitude,[<class 'float'>],100.0,0.0,0
4,longitude,[<class 'float'>],100.0,0.0,0
5,category,[<class 'list'>],100.0,0.0,0
6,avg_rating,[<class 'float'>],100.0,0.0,0
7,num_of_reviews,[<class 'int'>],100.0,0.0,0
8,hours,"[<class 'list'>, <class 'NoneType'>]",86.69,13.31,16579
9,MISC,"[<class 'dict'>, <class 'NoneType'>]",89.19,10.81,13469


Verifico el tipo de dato del dataframe

In [None]:
# Print pandas' own summary of the combined frame (column dtypes and non-null
# counts). NOTE(review): this cell has no execution count, so it was not run in
# the saved notebook.
df_sitios_estados.info()

Exportamos el dataframe 

In [158]:
# Write the cleaned, combined sites table to a CSV file in the working
# directory. The row index is not written, and a backslash is used as the
# escape character.
# NOTE(review): 'category', 'hours' and 'MISC' hold Python lists/dicts, so they
# are presumably written as their string representation — confirm that the
# consumer of this file parses them back as needed.
df_sitios_estados.to_csv(
    'df_sitios_estados.csv',
    index=False,
    escapechar='\\',
)