In [2]:
import pandas as pd
import geopandas as gpd
from unidecode import unidecode
import plotly.express as px
import plotly.graph_objects as go
# Let pandas show up to 50 columns before truncating wide frames in output
# (this is a cap of 50, not "all columns"; use None for no limit).
pd.set_option('display.max_columns', 50)

In [3]:
# Load the tabular point dataset from parquet into a pandas DataFrame.
# NOTE(review): presumably satellite fire detections for 2000-2024, going by
# the file name and columns -- confirm the data source.
df = pd.read_parquet('../Data/col_2000-2024.parquet')
# Load the polygon shapefile into a GeoDataFrame (one row per polygon feature).
# NOTE(review): presumably the RUNAP protected-areas registry -- confirm.
nat_parks = gpd.read_file('../Data/national_parks_shapefile/runap2Polygon.shp')

In [4]:
# Inspect column names, dtypes and non-null counts of the shapefile attributes.
nat_parks.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1673 entries, 0 to 1672
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   nombre      1673 non-null   object  
 1   categoria   1673 non-null   object  
 2   territoria  65 non-null     object  
 3   resolucion  1673 non-null   object  
 4   hectareas_  1673 non-null   float64 
 5   escala      0 non-null      object  
 6   organizaci  1673 non-null   object  
 7   fecha_act   1440 non-null   object  
 8   url         1673 non-null   object  
 9   wkid        1673 non-null   object  
 10  centroid_x  1673 non-null   float64 
 11  centroid_y  1673 non-null   float64 
 12  fecha_regi  1440 non-null   object  
 13  validado    1673 non-null   object  
 14  hectareas0  1673 non-null   float64 
 15  perimetro_  0 non-null      object  
 16  administra  1673 non-null   object  
 17  id_pnn      1673 non-null   int64   
 18  record_id   1673 non-null   int64   
 19

In [5]:
# Ordered (fragment, replacement) pairs applied after transliteration and
# lower-casing. Order is significant: each pair runs on the result of the
# previous one.
# NOTE(review): these look like repairs for mis-decoded accented characters
# (e.g. 'a3' standing in for an accented 'o'); confirm against the raw
# shapefile values.
# Caveat: this is plain substring replacement, so a legitimate word containing
# one of the fragments (e.g. 'botanica' contains 'anica') is rewritten too.
_NAME_FIXES = (
    ('a3', 'o'),
    ('(c)', 'e'),
    ('+-', 'n'),
    ('antionquia', 'antioquia'),
    ('sinao', 'sinu'),
    ('raos', 'rios'),
    ('anica', 'unica'),
    ('!', ''),
    ('vaa', 'via'),
)


def clean_names(name):
    """Normalise a free-text name for grouping and comparison.

    The value is stripped of surrounding whitespace, transliterated to ASCII
    with ``unidecode``, lower-cased, and then passed through the ordered
    substring replacements in ``_NAME_FIXES``.

    Parameters
    ----------
    name : str or any
        The raw value. Non-string values (``None``, ``NaN``, numbers) are
        returned unchanged so the function can be applied to columns that
        contain missing entries.

    Returns
    -------
    str or any
        The cleaned string, or the original object when it is not a string.
    """
    # Missing values in object columns arrive as None/float NaN; calling
    # .strip() on them would raise AttributeError, so pass them through.
    if not isinstance(name, str):
        return name

    cleaned = unidecode(name.strip()).lower()
    for fragment, replacement in _NAME_FIXES:
        cleaned = cleaned.replace(fragment, replacement)
    return cleaned

In [6]:
# Run clean_names over each free-text column, overwriting the raw values
# in the GeoDataFrame.
for text_column in ('categoria', 'nombre', 'organizaci'):
    nat_parks[text_column] = nat_parks[text_column].apply(clean_names)

In [7]:
# Preview the first two rows to spot-check the cleaned text columns.
nat_parks.head(2)

Unnamed: 0,nombre,categoria,territoria,resolucion,hectareas_,escala,organizaci,fecha_act,url,wkid,centroid_x,centroid_y,fecha_regi,validado,hectareas0,perimetro_,administra,id_pnn,record_id,app_id,geometry
0,villaluz,reserva natural de la sociedad civil,,142,18.0,,parques nacionales naturales de colombia,2011-07-26,https://runap.parquesnacionales.gov.co/area-pr...,3116,-73.83604,4.558803,2011-07-26,Si,19.452341,,PNNC,334,153,1,"POLYGON ((-73.83242 4.55865, -73.83254 4.55860..."
1,la marcada,distritos de conservacion de suelos,,10,1874.0,,corporacion autonoma regional de risaralda,2016-12-21,https://runap.parquesnacionales.gov.co/area-pr...,3115,-75.615349,4.829014,2016-12-21,Si,1872.636332,,CARDER,743,708,1,"POLYGON ((-75.63042 4.85554, -75.63044 4.85549..."


In [8]:
# Number of rows per cleaned category label, most frequent first.
nat_parks['categoria'].value_counts()

categoria
reserva natural de la sociedad civil          1238
distritos regionales de manejo integrado       122
reservas forestales protectoras regionales      98
parques naturales regionales                    60
reservas forestales protectoras nacionales      56
parque nacional natural                         44
distritos de conservacion de suelos             21
areas de recreacion                             12
santuario de fauna y flora                       9
distritos nacionales de manejo integrado         5
reserva natural                                  3
santuario de flora                               2
area natural unica                               1
santuario de fauna                               1
via parque                                       1
Name: count, dtype: int64

In [9]:
# Number of rows per cleaned organisation name, most frequent first.
nat_parks['organizaci'].value_counts()

organizaci
parques nacionales naturales de colombia                                                                    1299
ministerio de ambiente y desarrollo sostenible                                                                61
corporacion autonoma regional del tolima                                                                      32
corporacion autonoma regional de cundinamarca                                                                 31
corporacion autonoma regional del guavio                                                                      25
corporacion autonoma regional del valle del cauca                                                             22
corporacion autonoma regional de las cuencas de los rios rionegro y nare                                      20
corporacion autonoma regional del centro de antioquia                                                         19
corporacion autonoma regional de risaralda                                           

In [10]:
# (rows, columns) of the parquet dataset.
df.shape

(953822, 15)

In [11]:
# First five records of the parquet dataset.
df.head()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type
0,11.0912,-72.6625,306.5,4.5,2.0,2000-11-01,14:58:00,Terra,MODIS,26,6.2,277.3,38.5,D,2.0
1,11.091,-72.6702,307.0,4.5,2.0,2000-11-01,14:58:00,Terra,MODIS,31,6.2,277.5,41.9,D,2.0
2,5.3136,-68.8234,316.9,1.1,1.1,2000-11-02,03:24:00,Terra,MODIS,94,6.2,278.9,19.6,N,0.0
3,5.339,-68.8125,301.9,1.1,1.1,2000-11-02,03:24:00,Terra,MODIS,40,6.2,272.7,8.3,N,0.0
4,5.3376,-68.8227,301.0,1.1,1.1,2000-11-02,03:24:00,Terra,MODIS,29,6.2,274.3,7.3,N,0.0


In [12]:
# Last five records; compare with head() to see how the columns are populated
# at both ends of the dataset.
df.tail()

Unnamed: 0,latitude,longitude,brightness,scan,track,acq_date,acq_time,satellite,instrument,confidence,version,bright_t31,frp,daynight,type
953817,4.95166,-68.70023,341.56,1.19,1.09,2024-02-29,,Terra,MODIS,91,6.1NRT,299.75,45.56,D,
953818,4.95018,-68.6897,325.16,1.19,1.09,2024-02-29,,Terra,MODIS,41,6.1NRT,298.76,15.59,D,
953819,3.87275,-72.28797,319.28,2.07,1.4,2024-02-29,,Terra,MODIS,44,6.1NRT,298.62,19.47,D,
953820,3.87031,-72.2699,320.47,2.06,1.4,2024-02-29,,Terra,MODIS,48,6.1NRT,297.52,23.38,D,
953821,3.86782,-72.2763,322.82,2.06,1.4,2024-02-29,,Terra,MODIS,43,6.1NRT,297.91,27.93,D,


In [13]:
# Column dtypes and non-null counts, to find columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 953822 entries, 0 to 953821
Data columns (total 15 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   latitude    953822 non-null  float64       
 1   longitude   953822 non-null  float64       
 2   brightness  953822 non-null  float64       
 3   scan        953822 non-null  float64       
 4   track       953822 non-null  float64       
 5   acq_date    953822 non-null  datetime64[ns]
 6   acq_time    609588 non-null  object        
 7   satellite   953822 non-null  category      
 8   instrument  953822 non-null  category      
 9   confidence  953822 non-null  int64         
 10  version     953822 non-null  object        
 11  bright_t31  953822 non-null  float64       
 12  frp         953822 non-null  float64       
 13  daynight    953822 non-null  category      
 14  type        922993 non-null  float64       
dtypes: category(3), datetime64[ns](1), float64(8), int6