In [1]:
import io
import os
import glob
import numpy as np
import pandas as pd
import geopandas as gpd

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Root folder of the local data archive. Other cells read station files from
# <dir_base>/raw and write results to <dir_base>/clear and <dir_base>/INFO.csv.
dir_base = os.path.join(
    '/',
    *['media', 'arturo', 'Arturo', 'Data', 'Mexico', 'data'],
)

In [3]:
# Every sub-directory of <dir_base>/raw is taken as one state folder;
# plain files sitting next to them are ignored.
list_dir = sorted(
    entry
    for entry in os.listdir(os.path.join(dir_base, 'raw'))
    if os.path.isdir(os.path.join(dir_base, 'raw', entry))
)
len_dir = len(list_dir)
print(f'Number of directories: {len_dir}')

Number of directories: 32


In [4]:
# Build the station catalogue and export one cleaned CSV per station.
#
# For every state folder under <dir_base>/raw, each '*.txt' station file is
# read once: its header lines supply the station metadata (collected in the
# parallel lists below) and its data section is written to
# <dir_base>/clear/<state folder>/<same file name>.
dir_raw = os.path.join(dir_base, 'raw')
value_cols = ['PRECIP', 'EVAP', 'TMAX', 'TMIN']

id_, nombre_, file_, estado_, municipio_, situacion_, latitud_, longitud_, altitud_ = [],[],[],[],[],[],[],[],[]
for state_dir in list_dir:

    list_files = sorted(glob.glob(os.path.join(dir_raw, state_dir, '*.txt')))
    print(f'{state_dir} has {len(list_files)} stations')

    # Create the output folder once per state (only when there is something
    # to write) instead of re-checking it for every station file.
    dir_clear = os.path.join(dir_base, 'clear', state_dir)
    if list_files:
        os.makedirs(dir_clear, exist_ok=True)

    for station_path in list_files:

        with open(station_path, 'r', encoding='utf-8') as f:
            lineas = f.readlines()

        # Metadata is taken from the 'KEY : value' lines found in the slice
        # 10:19 of the file; lines without a colon are skipped.
        info_estacion = {}
        for linea in lineas[10:19]:
            if ':' in linea:
                clave, valor = linea.strip().split(':', 1)
                info_estacion[clave.strip()] = valor.strip()

        id_.append(int(info_estacion['ESTACIÓN']))
        nombre_.append(info_estacion['NOMBRE'])
        # Path relative to the raw folder, derived from dir_base rather than
        # from a second hard-coded copy of the absolute path.
        file_.append(os.path.relpath(station_path, dir_raw))
        estado_.append(info_estacion['ESTADO'])
        municipio_.append(info_estacion['MUNICIPIO'])
        situacion_.append(info_estacion['SITUACIÓN'])
        # Strip the unit suffixes before converting to float.
        latitud_.append(float(info_estacion['LATITUD'].replace(' °','')))
        longitud_.append(float(info_estacion['LONGITUD'].replace(' °','')))
        altitud_.append(float(info_estacion['ALTITUD'].replace(' msnm','')))

        # The data section starts at line index 23 and is whitespace-delimited.
        datos_texto = ''.join(lineas[23:])
        df = pd.read_csv(
            io.StringIO(datos_texto),
            sep=r'\s+',
            names=['FECHA'] + value_cols,
        )

        # The first two parsed rows are discarded (presumably the column-name
        # and units rows of the raw file -- confirm against a sample file).
        # iloc is used so a short or empty data section does not raise.
        df = df.iloc[2:].reset_index(drop=True)

        # Parse the dates only after the non-date rows are gone. read_csv's
        # `date_format` expects a format string, not a callable, so the dates
        # are converted explicitly; unparseable values become NaT.
        # Assumes the files use YYYY-MM-DD dates, as the original format
        # string states -- verify on a sample file.
        df['FECHA'] = pd.to_datetime(df['FECHA'], format='%Y-%m-%d', errors='coerce')

        # 'NULO' marks a missing observation in the raw files.
        df[value_cols] = df[value_cols].replace('NULO', np.nan)

        data_out = os.path.join(dir_clear, os.path.basename(station_path))
        df.to_csv(data_out,header=True,index=False)

# Assemble the catalogue once, after every state has been processed.
DATA_INFO = pd.DataFrame({
                        'Id':id_,
                        'Nombre':nombre_,
                        'Estado':estado_,
                        'File':file_,
                        'Municipio':municipio_,
                        'Situacion':situacion_,
                        'Latitud':latitud_,
                        'Longitud':longitud_,
                        'Altitud':altitud_
                        })

Aguas Calientes has 59 stations
Baja California has 66 stations
Baja California Sur has 104 stations
Campeche has 41 stations
Chiapas has 117 stations
Chihuahua has 34 stations
Ciudad de Mexico has 12 stations
Coahuila de Zaragoza has 40 stations
Colima has 57 stations
Durango has 57 stations
Estado de Mexico has 72 stations
Guanajuato has 68 stations
Guerrero has 69 stations
Hidalgo has 51 stations
Jalisco has 80 stations
Michoacan has 30 stations
Morelos has 31 stations
Nayarit has 16 stations
Nuevo Leon has 38 stations
Oaxaca has 28 stations
Puebla has 28 stations
Queretaro de Arteaga has 16 stations
Quintana Roo has 28 stations
San Luis Potosi has 91 stations
Sinaloa has 15 stations
Sonora has 52 stations
Tabasco has 27 stations
Tamaulipas has 25 stations
Tlaxcala has 24 stations
Veracruz has 72 stations
Yucatan has 52 stations
Zacatecas has 68 stations


In [5]:
# Read the GeoJSON file two directories above the working directory
# (presumably the Mexican state boundaries, judging by the file name).
# NOTE(review): STATES is not referenced by any other cell visible in this
# notebook -- confirm it is still needed.
STATES = gpd.read_file(os.path.join('..','..','geometry','MX_states.geojson'))

In [6]:
# Upper-case state name -> ISO 3166-2 subdivision code. The keys are the
# spellings looked up against DATA_INFO['Estado'], accents included.
state_to_iso = {
    'AGUASCALIENTES': 'MX-AGU',
    'BAJA CALIFORNIA': 'MX-BCN',
    'BAJA CALIFORNIA SUR': 'MX-BCS',
    'CAMPECHE': 'MX-CAM',
    'CHIAPAS': 'MX-CHP',
    'CHIHUAHUA': 'MX-CHH',
    'CIUDAD DE MÉXICO': 'MX-CMX',
    'COAHUILA DE ZARAGOZA': 'MX-COA',
    'COLIMA': 'MX-COL',
    'DURANGO': 'MX-DUR',
    'GUANAJUATO': 'MX-GUA',
    'GUERRERO': 'MX-GRO',
    'HIDALGO': 'MX-HID',
    'JALISCO': 'MX-JAL',
    'MICHOACÁN DE OCAMPO': 'MX-MIC',
    'MORELOS': 'MX-MOR',
    'MÉXICO': 'MX-MEX',
    'NAYARIT': 'MX-NAY',
    'NUEVO LEÓN': 'MX-NLE',
    'OAXACA': 'MX-OAX',
    'PUEBLA': 'MX-PUE',
    'QUERÉTARO': 'MX-QUE',
    'QUINTANA ROO': 'MX-ROO',
    'SAN LUIS POTOSÍ': 'MX-SLP',
    'SINALOA': 'MX-SIN',
    'SONORA': 'MX-SON',
    'TABASCO': 'MX-TAB',
    'TAMAULIPAS': 'MX-TAM',
    'TLAXCALA': 'MX-TLA',
    'VERACRUZ DE IGNACIO DE LA LLAVE': 'MX-VER',
    'YUCATÁN': 'MX-YUC',
    'ZACATECAS': 'MX-ZAC',
}

# The same table as an ordered list of (name, code) pairs.
STATES_CODES = list(state_to_iso.items())

In [7]:
# Attach the ISO 3166-2 code of each station's state.
DATA_INFO['ISO3166-2'] = DATA_INFO['Estado'].map(state_to_iso)

# Series.map yields NaN for any name missing from state_to_iso, so list those
# names instead of letting blank codes reach the exported catalogue unnoticed.
# (print is used because warnings are silenced in the import cell.)
unmapped_states = sorted(
    DATA_INFO.loc[DATA_INFO['ISO3166-2'].isna(), 'Estado'].dropna().unique()
)
if unmapped_states:
    print(f'Estado values without an ISO 3166-2 code: {unmapped_states}')

In [8]:
# Show the station catalogue; the notebook's rich display truncates the
# middle rows of a long frame, as seen in the output below.
DATA_INFO

Unnamed: 0,Id,Nombre,Estado,File,Municipio,Situacion,Latitud,Longitud,Altitud,ISO3166-2
0,1001,AGUASCALIENTES (OBS),AGUASCALIENTES,Aguas Calientes/dia01001.txt,AGUASCALIENTES,OPERANDO,21.850278,-102.290833,1890.8,MX-AGU
1,1004,CAÑADA HONDA,AGUASCALIENTES,Aguas Calientes/dia01004.txt,AGUASCALIENTES,OPERANDO,22.000833,-102.198889,1925.0,MX-AGU
2,1005,PRESA EL NIAGARA,AGUASCALIENTES,Aguas Calientes/dia01005.txt,AGUASCALIENTES,OPERANDO,21.780556,-102.371667,1844.0,MX-AGU
3,1008,PUERTO DE LA CONCEPCION,AGUASCALIENTES,Aguas Calientes/dia01008.txt,TEPEZALÁ,OPERANDO,22.202778,-102.135000,2322.8,MX-AGU
4,1010,LA TINAJA,AGUASCALIENTES,Aguas Calientes/dia01010.txt,SAN JOSÉ DE GRACIA,OPERANDO,22.164444,-102.554167,2525.7,MX-AGU
...,...,...,...,...,...,...,...,...,...,...
1563,32139,HUITZILA,ZACATECAS,Zacatecas/dia32139.txt,TEÚL DE GONZÁLEZ ORTEGA,OPERANDO,21.223889,-103.605833,1680.0,MX-ZAC
1564,32140,MEZQUITAL DEL ORO,ZACATECAS,Zacatecas/dia32140.txt,MEZQUITAL DEL ORO,OPERANDO,21.212500,-103.363889,1208.0,MX-ZAC
1565,32142,TIERRA Y LIBERTAD,ZACATECAS,Zacatecas/dia32142.txt,VILLA DE COS,OPERANDO,23.450278,-102.389167,2030.0,MX-ZAC
1566,32143,GARCIA DE LA CADENA,ZACATECAS,Zacatecas/dia32143.txt,TRINIDAD GARCÍA DE LA CADENA,OPERANDO,21.206944,-103.458889,1712.0,MX-ZAC


In [9]:
# Write the station catalogue to <dir_base>/INFO.csv, with a header row and
# without the positional index. (dir_out is the full file path.)
dir_out = os.path.join(dir_base, 'INFO.csv')
DATA_INFO.to_csv(path_or_buf=dir_out, index=False, header=True)