In [1]:
!pip3 install jsonschema




In [2]:
import json

In [3]:
# Read the places file. The encoding is given explicitly because the data
# contains non-ASCII text and open() would otherwise use the platform locale.
with open("../data/places.json", encoding="utf-8") as places_file:
    places_dict = json.load(places_file)

In [4]:
# Gather the "properties" dict of every feature and preview the first two.
features = places_dict["features"]
properties = [feature["properties"] for feature in features]
properties[:2]

[{'identifier': 'bienvenida_2025-19',
  'name': 'Cristales de colores. Educación',
  'campus': 'SJ',
  'information': '',
  'categories': ['other'],
  'faculties': [],
  'floors': [1]},
 {'identifier': 'bienvenida_2025-11',
  'name': 'Casillero 666',
  'campus': 'SJ',
  'information': '',
  'categories': ['other'],
  'faculties': [],
  'floors': [1]}]

In [5]:
from jsonschema import validate, ValidationError


In [7]:
def _typed(type_name):
    """Return a fresh schema fragment for a scalar of the given JSON type."""
    return {"type": type_name}


def _array_of(type_name):
    """Return a fresh schema fragment for an array whose items have the given JSON type."""
    return {"type": "array", "items": _typed(type_name)}


# JSON Schema for one feature's "properties" object: every field except
# "floors" is required and keys not listed here are rejected.
schema = {
    "type": "object",
    "properties": {
        "identifier": _typed("string"),
        "name": _typed("string"),
        "information": _typed("string"),
        "categories": _array_of("string"),
        "campus": _typed("string"),
        "faculties": _array_of("string"),
        "floors": _array_of("number"),
    },
    "required": [
        "identifier",
        "name",
        "information",
        "categories",
        "campus",
        "faculties",
    ],
    "additionalProperties": False,
}

In [8]:
def process_item(instance):
    """
    Clean one feature's ``properties`` dict according to these rules.

    - Missing or empty ``identifier``, ``name`` or ``campus``: the item is dropped.
    - Missing or ``None`` ``information``: replaced by ``""``.
    - Missing or ``None`` ``categories``: replaced by ``[]``.
    - ``faculties``: a string is wrapped as ``[string]``, a list is kept as is,
      anything else (missing, ``None``, other types) becomes ``[]``.
    - ``floors`` is optional and is copied only when the key is present.
    - Any key not listed above is discarded.

    Args:
        instance: The raw properties object. Anything that is not a dict is rejected.

    Returns:
        A new dict holding only the known keys, or ``None`` when the item
        must be dropped.
    """
    if not isinstance(instance, dict):
        return None

    # A falsy value (missing key, None or "") in a mandatory field drops the item.
    for mandatory_key in ("identifier", "name", "campus"):
        if not instance.get(mandatory_key):
            return None

    # Build the result ONLY from the keys the schema allows.
    processed_item = {
        "identifier": instance["identifier"],
        "name": instance["name"],
        "campus": instance["campus"],
    }

    # dict.get(key, default) only falls back when the key is absent, so an
    # explicit None (JSON null) is normalised here as well.
    information = instance.get("information")
    processed_item["information"] = "" if information is None else information

    categories = instance.get("categories")
    processed_item["categories"] = [] if categories is None else categories

    # faculties: wrap a lone string, keep a list, reset anything else.
    faculties = instance.get("faculties", [])
    if isinstance(faculties, str):
        processed_item["faculties"] = [faculties]
    elif isinstance(faculties, list):
        processed_item["faculties"] = faculties
    else:
        processed_item["faculties"] = []

    # floors is fully optional: copy it only when the key exists.
    if "floors" in instance:
        processed_item["floors"] = instance["floors"]

    return processed_item

# Processing: clean every properties dict, then validate whatever survives.
processed_properties = []
murio = []
eliminados = 0

for raw_properties in properties:
    cleaned = process_item(raw_properties)

    # process_item returns None for items missing a mandatory field.
    if cleaned is None:
        eliminados += 1
        continue

    try:
        validate(instance=cleaned, schema=schema)
    except ValidationError as err:
        # Keep the rejected item together with the validator's message.
        murio.append({"item": cleaned, "error": str(err.message)})
    else:
        processed_properties.append(cleaned)

valid_total = len(processed_properties)
print(f"Valores iniciales: {len(properties)}")
print(f"Items eliminados (falta id/name/campus): {eliminados}")
print(f"Total de items válidos: {valid_total}")
print(f"Total de items inválidos: {len(murio)}")
print(f"\nItems procesados válidos: {valid_total}")

# Show a few examples of valid items.
if processed_properties:
    print("\nPrimeros 3 items válidos:")
    for position, valid_item in enumerate(processed_properties[:3], start=1):
        print(f"  {position}: {valid_item}")

# Show the items that failed validation.
if murio:
    print("\nItems que fallaron validación:")
    for position, failure in enumerate(murio, start=1):
        print(f"  {position}: Error: {failure['error']}")
        print(f"      Item: {failure['item']}")

Valores iniciales: 1263
Items eliminados (falta id/name/campus): 0
Total de items válidos: 1263
Total de items inválidos: 0

Items procesados válidos: 1263

Primeros 3 items válidos:
  1: {'identifier': 'bienvenida_2025-19', 'name': 'Cristales de colores. Educación', 'campus': 'SJ', 'information': '', 'categories': ['other'], 'faculties': [], 'floors': [1]}
  2: {'identifier': 'bienvenida_2025-11', 'name': 'Casillero 666', 'campus': 'SJ', 'information': '', 'categories': ['other'], 'faculties': [], 'floors': [1]}
  3: {'identifier': 'bienvenida_2025-10', 'name': 'Los peces de sociales', 'campus': 'SJ', 'information': '', 'categories': ['other'], 'faculties': [], 'floors': [1]}


In [9]:
# Replace each feature's properties with the cleaned version. process_item
# returns None for items that must be deleted, so those features are dropped
# instead of being kept with `properties` set to None.
kept_features = []
for f in features:
    cleaned_properties = process_item(f["properties"])
    if cleaned_properties is None:
        continue
    f["properties"] = cleaned_properties
    kept_features.append(f)
features = kept_features

In [10]:
# Put the feature list back on the loaded dict so the dump that follows writes it.
places_dict["features"] = features

In [11]:
# Overwrite the places file with the processed data. ensure_ascii=False emits
# non-ASCII characters as-is, so the file is opened as UTF-8 explicitly rather
# than with the platform's locale encoding.
with open("../data/places.json", "w", encoding="utf-8") as places_file:
    json.dump(places_dict, places_file, indent=2, ensure_ascii=False)


In [72]:
# Distinct campus codes across all features.
features = places_dict["features"]
campus = {feature["properties"]["campus"] for feature in features}
campus

{'CC', 'LC', 'OR', 'SJ', 'VR'}

In [73]:
from functools import reduce

# Union of every category used by any feature.
categories = {
    category
    for feature in features
    for category in feature["properties"]["categories"]
}

categories

{'auditorium',
 'bath',
 'building',
 'campus',
 'classroom',
 'computers',
 'faculty',
 'financial',
 'food_lunch',
 'laboratory',
 'library',
 'other',
 'park_bicycle',
 'parking',
 'photocopy',
 'shop',
 'sports_place',
 'studyroom',
 'trash',
 'water',
 'yard'}

In [74]:
# Display the first feature to inspect its full structure.
features[0]

{'type': 'Feature',
 'properties': {'identifier': 'bienvenida_2025-19',
  'name': 'Cristales de colores. Educación',
  'campus': 'SJ',
  'information': '',
  'categories': ['other'],
  'faculties': [],
  'floors': [1]},
 'geometry': {'type': 'Point',
  'coordinates': [-70.61404824737933, -33.49801815724207]}}

In [75]:
# Number of distinct identifiers across all features.
features = places_dict["features"]
n_ids = len({feature["properties"]["identifier"] for feature in features})
n_ids

1263

In [76]:
from collections import Counter

# Option 1: tally how many times each identifier occurs.
all_ids = [feature["properties"]["identifier"] for feature in features]
id_counts = Counter(all_ids)

# Identifiers that occur at least twice, mapped to their occurrence count.
duplicated_ids = {key: times for key, times in id_counts.items() if times >= 2}

print(f"Total de features: {len(features)}")
print(f"IDs únicos: {len(id_counts)}")
print(f"IDs duplicados: {duplicated_ids}")

# Option 2: print the details of every feature that shares a duplicated identifier.
if duplicated_ids:
    print("\nDetalles de los items con IDs duplicados:")
    for dup_id, times_seen in duplicated_ids.items():
        print(f"\nID duplicado: '{dup_id}' (aparece {times_seen} veces)")
        items_with_dup_id = list(
            filter(lambda feat: feat["properties"]["identifier"] == dup_id, features)
        )

        for position, match in enumerate(items_with_dup_id, start=1):
            props = match["properties"]
            print(f"  {position}. Name: {props['name']}")
            print(f"     Categories: {props['categories']}")
            print(f"     Campus: {props['campus']}")
            print(f"     Faculties: {props['faculties']}")

# Option 3: just the list of duplicated identifiers.
duplicate_ids_only = list(duplicated_ids)
print(f"\nLista de IDs duplicados: {duplicate_ids_only}")

# Option 4: find duplicates with a "seen" set (alternative approach).
seen_ids = set()
duplicate_ids_alt = set()

for feature in features:
    id_val = feature["properties"]["identifier"]
    if id_val not in seen_ids:
        seen_ids.add(id_val)
        continue
    # Second or later sighting of this identifier.
    duplicate_ids_alt.add(id_val)

print(f"\nIDs duplicados (método alternativo): {list(duplicate_ids_alt)}")

Total de features: 1263
IDs únicos: 1263
IDs duplicados: {}

Lista de IDs duplicados: []

IDs duplicados (método alternativo): []
