# Data Connector Notebook

This notebook reads each of the raw data sources in the repository and stores them in the Bronze layer in Parquet format.

## Beginning

In [1]:
# Import required libraries
import pandas as pd
import geopandas as gpd
import os
from dotenv import load_dotenv

# Load environment variables (reads from .env by default)
load_dotenv()

# Base directory that every raw and Bronze path in this notebook is joined onto
# (adjust the value in .env if the notebook lives in a different location)
BASE_PATH = os.getenv('BASE_PATH')

# Fail fast when BASE_PATH is unset or is not a directory: a path pointing at a
# regular file would pass a plain existence check and only break at the first read
if not BASE_PATH or not os.path.isdir(BASE_PATH):
    raise ValueError(f"Invalid BASE_PATH: {BASE_PATH}. Check your .env file and directory structure")

# 2. CSV FILES

## CIVIL SERVICE

In [2]:
# Read the raw civil-service CSV (semicolon-separated, Latin-1 encoded)
civil_service_csv = os.path.join(BASE_PATH, "data/raw/civil_service/control-politico.csv")
civil_service = pd.read_csv(civil_service_csv, sep=';', encoding='latin-1')

In [3]:
# Preview the first five rows of the civil-service table
civil_service.head(n=5)

Unnamed: 0,Id Proposición,Aprobación,Bancada,Comisión origen,Título,Tema,Subtema,Proposición,Año
0,dic-16,8/02/2016,Partido Conservador Colombiano,Plan,RIO BOGOTÁ,Asuntos Ambientales,AMB - Río Bogotá,12,2016
1,nov-16,8/02/2016,Partido Libres,Plan,PLAN DE ORDENMIENTO TERRITORIAL,Ordenamiento Territorial,POT - POT,11,2016
2,oct-16,8/02/2016,Partido de la U,Plan,IMPLEMENTACIÓN DEL ACUERDO 516/2012 - ZONAS AM...,Seguridad,SEG - Puntos Críticos,10,2016
3,sep-16,8/02/2016,Centro Democrático,Plan,EMPRESA ACUEDUCTO,Empresas Públicas,EPU - Empresa de Acueducto,9,2016
4,ago-16,8/02/2016,Partido Alianza Verde,Plan,RIO BOGOTÁ,Asuntos Ambientales,AMB - Río Bogotá,8,2016


In [4]:
# Persist the civil-service table to the Bronze layer as Parquet
civil_service_bronze = os.path.join(BASE_PATH, "data/1_Bronze/civil_service.parquet")
civil_service.to_parquet(civil_service_bronze)

## Transport

In [5]:
# Read the raw transport CSV (semicolon-separated, Latin-1 encoded).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the recorded run of this cell emitted a warning at this call,
# presumably pandas' DtypeWarning for mixed-type columns; confirm on re-run.
transport = pd.read_csv(
    os.path.join(BASE_PATH, "data/raw/transport/consolidado-de-salidas-sistema-troncal-por-franja-horaria-noviembre-2024.csv"),
    encoding='latin-1',
    sep=';',
    low_memory=False
)

  transport = pd.read_csv(


In [6]:
# Preview the first five rows of the transport table
transport.head(n=5)

Unnamed: 0,Línea,Estación,Acceso de Estación,MES,INTERVALO,DÍA 01,DÍA 02,DÍA 03,DÍA 04,DÍA 05,...,DÍA 25,DÍA 26,DÍA 27,DÍA 28,DÍA 29,DÍA 30,DÍA 31,Total general,Unnamed: 37,Unnamed: 38
0,(11) Zona K Calle 26,(06000) Portal Eldorado,(01) PLAT2 ALIM-DESAL FONTIBÓN/FONTIBÓN CENTRO...,NOVIEMBRE,00:00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,X,0.0,,
1,(11) Zona K Calle 26,(06000) Portal Eldorado,(01) PLAT2 ALIM-DESAL FONTIBÓN/FONTIBÓN CENTRO...,NOVIEMBRE,00:15,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,X,0.0,,
2,(11) Zona K Calle 26,(06000) Portal Eldorado,(01) PLAT2 ALIM-DESAL FONTIBÓN/FONTIBÓN CENTRO...,NOVIEMBRE,00:30,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,X,0.0,,
3,(11) Zona K Calle 26,(06000) Portal Eldorado,(01) PLAT2 ALIM-DESAL FONTIBÓN/FONTIBÓN CENTRO...,NOVIEMBRE,00:45,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,X,0.0,,
4,(11) Zona K Calle 26,(06000) Portal Eldorado,(01) PLAT2 ALIM-DESAL FONTIBÓN/FONTIBÓN CENTRO...,NOVIEMBRE,01:00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,X,0.0,,


In [7]:
# Persist the transport table to the Bronze layer as Parquet
transport_bronze = os.path.join(BASE_PATH, "data/1_Bronze/transport.parquet")
transport.to_parquet(transport_bronze)

## Economy and Finance

### Judiciales

In [8]:
# Read the raw judicial-processes CSV (semicolon-separated, Latin-1 encoded)
judiciales_csv = os.path.join(BASE_PATH, "data/raw/economy-finance/judiciales-informe-procesos-1er-trimestre-2019.csv")
judiciales = pd.read_csv(judiciales_csv, sep=';', encoding='latin-1')

In [9]:
# Preview the first five rows of the judicial-processes table
judiciales.head(n=5)

Unnamed: 0,PROCESO,DEMANDANTE,"Fortaleza de los planteamientos de la demanda, su presentación y desarrollo",Debilidad de las excepciones propuestas al contestar la demanda,Presencia de Riesgos Procesales,Suficiencia del material Probatorio en contra de la Entidad,Debilidad de las pruebas con las que se pueda considerar la prosperidad de las excepciones propuesta,Nivel de Jurisprudencia relacionada o antecedentes similares,INST ACTUAL,FALLO 1a,VR. PRETENSION ORIGINAL
0,2001-06318,19323985 PARRA JORGE ORLANDO (1),A,M,A,A,A,A,2,F,5447669.0
1,2003-00080,860520306-1 INTERAMERICANA DE ELECTRONICA - IN...,M,B,M,A,A,A,3,D,10739360000.0
2,2005-00220,17171120 HERNANDO FRACISCO OLAYA ROMAN (1),A,M,M,A,M,A,1,,4000000.0
3,2009-00169,830132153 INTCOMEX COLOMBIA LTDA. (1),A,M,M,M,A,A,2,F,250364000.0
4,2009-00209,800173557-4 HYUNDAI COLOMBIA AUTOMOTRIZ S.A....,M,M,B,A,B,A,2,D,372348000.0


In [10]:
# Persist the judicial-processes table to the Bronze layer as Parquet
judiciales_bronze = os.path.join(BASE_PATH, "data/1_Bronze/judiciales.parquet")
judiciales.to_parquet(judiciales_bronze)

### CGN

In [11]:
# Read the raw CGN CSV (semicolon-separated, Latin-1 encoded)
cgn_csv = os.path.join(BASE_PATH, "data/raw/economy-finance/cgn001.csv")
cgn = pd.read_csv(cgn_csv, sep=';', encoding='latin-1')

In [12]:
# Preview the first five rows of the CGN table
cgn.head(n=5)

Unnamed: 0,Codigo contable,Nombre cuenta,Saldo inicial,Movimiento debito,Movimiento credito,Saldo final,Corriente,No corriente
0,100000,ACTIVOS,218881000000000.0,50428980000000.0,48872860000000.0,220437100000000.0,12512530000000.0,207924600000000.0
1,110000,EFECTIVO Y EQUIVALENTES AL EFECTIVO,5334106000000.0,22359230000000.0,23601710000000.0,4091626000000.0,4091626000000.0,0.0
2,110500,CAJA,491693700.0,152519500000.0,152504200000.0,507026800.0,507026800.0,0.0
3,110501,Caja principal,0.0,152429100000.0,152429100000.0,0.0,0.0,0.0
4,110502,Caja menor,491693700.0,90427060.0,75093970.0,507026800.0,507026800.0,0.0


In [13]:
# Persist the CGN table to the Bronze layer as Parquet
cgn_bronze = os.path.join(BASE_PATH, "data/1_Bronze/cgn.parquet")
cgn.to_parquet(cgn_bronze)

## Security and Defense

In [14]:
# Read the raw emergency-line calls CSV (semicolon-separated, Latin-1 encoded).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the recorded run of this cell emitted a warning at this call,
# presumably pandas' DtypeWarning for mixed-type columns; confirm on re-run.
security_llamadas = pd.read_csv(
    os.path.join(BASE_PATH, "data/raw/security-defense/llamadastramitadas-c4-bogota_numerounicodeseguridadyemergencias-nuse_linea-123-a-31dic2024.csv"),
    encoding='latin-1',
    sep=';',
    low_memory=False
)

  security_llamadas = pd.read_csv(


In [15]:
# Preview the first five rows of the emergency-line calls table
security_llamadas.head(n=5)

Unnamed: 0,ID,ANIO,MES,TIPO_INCIDENTE,TIPO_DETALLE,COD_LOCALIDAD,LOCALIDAD,COD_UPZ,UPZ,CANT_INCIDENTES
0,20241299UPZ999990,2024,12,990,RESTOS HUMANOS,99,SIN LOCALIZACION,UPZ999,SIN LOCALIZACION,2
1,20241299UPZ999978,2024,12,978,HALLAZGO DE EXPLOSIVOS,99,SIN LOCALIZACION,UPZ999,SIN LOCALIZACION,2
2,20241299UPZ999977,2024,12,977,VEHÍCULO RECUPERADO,99,SIN LOCALIZACION,UPZ999,SIN LOCALIZACION,7
3,20241299UPZ999976,2024,12,976,EXTRAVIADO O DESAPARECIDO,99,SIN LOCALIZACION,UPZ999,SIN LOCALIZACION,34
4,20241299UPZ999973,2024,12,973,DAÑOS EN REDES DE SERVICIOS,99,SIN LOCALIZACION,UPZ999,SIN LOCALIZACION,34


In [16]:
# COD_LOCALIDAD has errors in the raw data: values that cannot be parsed as
# numbers become <NA>, and the column is stored as nullable integers
cod_localidad_numeric = pd.to_numeric(security_llamadas['COD_LOCALIDAD'], errors='coerce')
security_llamadas['COD_LOCALIDAD'] = cod_localidad_numeric.astype('Int64')

# Persist the emergency-line calls table to the Bronze layer as Parquet
security_llamadas_bronze = os.path.join(BASE_PATH, "data/1_Bronze/security_llamadas.parquet")
security_llamadas.to_parquet(security_llamadas_bronze)

# 3. Geospatial Data

## CRNLoc

In [17]:
# CRNLoc layer (GeoJSON) from the security-defense raw folder
crnloc_geojson = os.path.join(BASE_PATH, "data/raw/security-defense/CRNLoc.geojson")
geo_crnloc = gpd.read_file(crnloc_geojson)

In [18]:
# Preview the first five rows of the CRNLoc layer
geo_crnloc.head(n=5)

Unnamed: 0,OBJECTID,CMRANK,CMRNCONT,CMIULOCAL,CMNOMLOCAL,CMRNART,CMRNNUM,CMRNDESCOMP,CMRNMES,CMRNTOTAL,SHAPE.AREA,SHAPE.LEN,geometry
0,22494,4,4151,10,Engativá,Artículo 140,Numeral 13,"Consumir, portar, distribuir, ofrecer o comerc...",Ene-Dic,430816,0.002922,0.291937,"POLYGON ((-74.119 4.73641, -74.11932 4.73658, ..."
1,22495,5,1207,10,Engativá,Artículo 155,Articulo 155,Traslado por protección,Ene-Dic,430816,0.002922,0.291937,"POLYGON ((-74.119 4.73641, -74.11932 4.73658, ..."
2,22496,6,762,10,Engativá,Artículo 140,Numeral 14,"Consumir, portar, distribuir, ofrecer o comerc...",Ene-Dic,430816,0.002922,0.291937,"POLYGON ((-74.119 4.73641, -74.11932 4.73658, ..."
3,22497,7,533,10,Engativá,Artículo 27,Numeral 1,"Reñir, incitar o incurrir en confrontaciones v...",Ene-Dic,430816,0.002922,0.291937,"POLYGON ((-74.119 4.73641, -74.11932 4.73658, ..."
4,22498,8,473,10,Engativá,Artículo 140,Numeral 11,Realizar necesidades fisiologicas en el espaci...,Ene-Dic,430816,0.002922,0.291937,"POLYGON ((-74.119 4.73641, -74.11932 4.73658, ..."


In [19]:
# Persist the CRNLoc layer to the Bronze layer as Parquet.
# The file name uses the geo_ prefix shared by the other geospatial Bronze
# outputs; it was previously misspelled "gro_crnloc.parquet".
# NOTE(review): update any downstream reader that still points at the old name.
geo_crnloc.to_parquet(os.path.join(BASE_PATH, "data/1_Bronze/geo_crnloc.parquet"))

## CRNSCAT

In [20]:
# CRNSCAT layer (GeoJSON) from the security-defense raw folder
crnscat_geojson = os.path.join(BASE_PATH, "data/raw/security-defense/CRNSCAT.geojson")
geo_crnscat = gpd.read_file(crnscat_geojson)

In [21]:
# Preview the first five rows of the CRNSCAT layer
geo_crnscat.head(n=5)

Unnamed: 0,OBJECTID,CMRANK,CMRNCONT,CMIUSCAT,CMNOMSCAT,CMRNART,CMRNNUM,CMRNDESCOMP,CMRNMES,CMRNTOTAL,SHAPE.AREA,SHAPE.LEN,geometry
0,436462,8,2,1427,Puerto Rico,Artículo 92,Numeral 16,Desarrollar la actividad economica sin cumplir...,Ene-Dic,444219,2.2e-05,0.029004,"POLYGON ((-74.10863 4.56389, -74.10869 4.56391..."
1,436463,9,1,1427,Puerto Rico,Artículo 92,Numeral 4,Quebrantar los horarios establecidos por el Al...,Ene-Dic,444219,2.2e-05,0.029004,"POLYGON ((-74.10863 4.56389, -74.10869 4.56391..."
2,436464,10,1,1427,Puerto Rico,Artículo 93,Numeral 8,No permitir el ingreso de las autoridades de P...,Ene-Dic,444219,2.2e-05,0.029004,"POLYGON ((-74.10863 4.56389, -74.10869 4.56391..."
3,436465,1,1001,9124,Puente Largo,Artículo 146,Numeral 12,Ingresar y salir de las estaciones o portales ...,Ene-Dic,444219,4.8e-05,0.032086,"POLYGON ((-74.06627 4.70183, -74.0664 4.70197,..."
4,436466,2,775,9124,Puente Largo,Artículo 146,Numeral 7,"Evadir el pago de la tarifa, validacion, tique...",Ene-Dic,444219,4.8e-05,0.032086,"POLYGON ((-74.06627 4.70183, -74.0664 4.70197,..."


In [22]:
# Persist the CRNSCAT layer to the Bronze layer as Parquet
geo_crnscat_bronze = os.path.join(BASE_PATH, "data/1_Bronze/geo_crnscat.parquet")
geo_crnscat.to_parquet(geo_crnscat_bronze)

## CRNUPZ

In [23]:
# CRNUPZ layer (GeoJSON) from the security-defense raw folder
crnupz_geojson = os.path.join(BASE_PATH, "data/raw/security-defense/CRNUPZ.geojson")
geo_crnupz = gpd.read_file(crnupz_geojson)

In [24]:
# Preview the first five rows of the CRNUPZ layer
geo_crnupz.head(n=5)

Unnamed: 0,OBJECTID,CMRANK,CMRNCONT,CMIUUPLA,CMNOMUPLA,CMRNART,CMRNNUM,CMRNDESCOMP,CMRNMES,CMRNTOTAL,SHAPE.AREA,SHAPE.LEN,geometry
0,100513,3,18,UPZ52,La Flora,Artículo 95,Numeral 1,"Comprar, alquilar o usar equipo terminal movil...",Ene-Dic,436213,0.000153,0.099,"MULTIPOLYGON (((-74.08737 4.51697, -74.08738 4..."
1,100514,4,11,UPZ52,La Flora,Artículo 155,Articulo 155,Traslado por protección,Ene-Dic,436213,0.000153,0.099,"MULTIPOLYGON (((-74.08737 4.51697, -74.08738 4..."
2,100512,2,25,UPZ52,La Flora,Artículo 140,Numeral 13,"Consumir, portar, distribuir, ofrecer o comerc...",Ene-Dic,436213,0.000153,0.099,"MULTIPOLYGON (((-74.08737 4.51697, -74.08738 4..."
3,100515,5,9,UPZ52,La Flora,Artículo 27,Numeral 1,"Reñir, incitar o incurrir en confrontaciones v...",Ene-Dic,436213,0.000153,0.099,"MULTIPOLYGON (((-74.08737 4.51697, -74.08738 4..."
4,100516,6,8,UPZ52,La Flora,Artículo 35,Numeral 5,Ofrecer cualquier tipo de resistencia a la apl...,Ene-Dic,436213,0.000153,0.099,"MULTIPOLYGON (((-74.08737 4.51697, -74.08738 4..."


In [25]:
# Persist the CRNUPZ layer to the Bronze layer as Parquet
geo_crnupz_bronze = os.path.join(BASE_PATH, "data/1_Bronze/geo_crnupz.parquet")
geo_crnupz.to_parquet(geo_crnupz_bronze)

## DAILoc

In [26]:
# DAILoc layer (GeoJSON) from the security-defense raw folder
dailoc_geojson = os.path.join(BASE_PATH, "data/raw/security-defense/DAILoc.geojson")
geo_dailoc = gpd.read_file(dailoc_geojson)

In [27]:
# Preview the first five rows of the DAILoc layer
geo_dailoc.head(n=5)

Unnamed: 0,CMIULOCAL,CMNOMLOCAL,CMMES,CMH18CONT,CMH19CONT,CMH20CONT,CMHVAR,CMHTOTAL,CMLP18CONT,CMLP19CONT,...,CMHA24CONT,CMHB24CONT,CMHCE24CON,CMHM24CONT,CMHC24CONT,CMDS24CONT,CMVI24CONT,SHAPE_AREA,SHAPE_LEN,geometry
0,9,Fontibón,Ene-Dic,19.0,27.0,22.0,-39.62,1084.0,1083.0,808.0,...,29.0,108.0,463.0,48.0,78.0,30.0,230.0,0.00271,0.321915,"MULTIPOLYGON (((-74.15857 4.66274, -74.15848 4..."
1,2,Chapinero,Ene-Dic,10.0,15.0,9.0,-46.67,1084.0,799.0,627.0,...,9.0,39.0,464.0,7.0,93.0,16.0,139.0,0.003095,0.333966,"MULTIPOLYGON (((-74.01116 4.66459, -74.01154 4..."
2,20,Sumapaz,Ene-Dic,0.0,0.0,1.0,50.0,1084.0,22.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063549,1.914949,"MULTIPOLYGON (((-74.20584 4.26966, -74.20584 4..."
3,18,Rafael Uribe Uribe,Ene-Dic,70.0,61.0,86.0,-51.25,1084.0,1601.0,1483.0,...,45.0,16.0,205.0,45.0,43.0,43.0,337.0,0.001126,0.174513,"MULTIPOLYGON (((-74.12803 4.59254, -74.12777 4..."
4,17,Candelaria,Ene-Dic,3.0,2.0,3.0,60.0,1084.0,242.0,266.0,...,1.0,2.0,54.0,1.0,11.0,5.0,12.0,0.000168,0.067158,"MULTIPOLYGON (((-74.06621 4.60317, -74.0662 4...."


In [28]:
# Persist the DAILoc layer to the Bronze layer as Parquet
geo_dailoc_bronze = os.path.join(BASE_PATH, "data/1_Bronze/geo_dailoc.parquet")
geo_dailoc.to_parquet(geo_dailoc_bronze)

And with that, we're done with our data connection.

# (END)