## Leer, procesar y filtrar datos de 'Puestos de trabajo registrados por los patrones en el Instituto Mexicano del Seguro Social y asegurados sin un empleo asociado' para la CDMX

En este script se procesan los datos del IMSS para conservar únicamente los que corresponden a la CDMX. Posteriormente, los datos filtrados se escriben en una carpeta en formato .csv para su análisis futuro. La ejecución completa de este código tarda, en promedio, unos 50 minutos.

In [14]:
import os
import sys
import inspect
import numpy as np
import pandas as pd

# Resolve the directory one level above the current working directory, then
# its parent and its grandparent.
parentdir = os.path.abspath(os.pardir)
grandparentdir = os.path.dirname(parentdir)
ggpdir = os.path.dirname(grandparentdir)

# Put all three at the front of the import search path in a single step. The
# resulting order (ggpdir, grandparentdir, parentdir) is the one that three
# successive insert(0, ...) calls in parent-to-ancestor order produce.
sys.path[:0] = [ggpdir, grandparentdir, parentdir]

In [15]:
# Collect the names of the IMSS source files, sorted alphabetically.
# Only entries named like 'asg-*.csv' are kept, so stray directory entries
# (e.g. .DS_Store or .ipynb_checkpoints) never reach pd.read_csv or the
# fixed-width file-name slicing used later in this notebook.
mypath = parentdir + '/row_data/IMSS_all/'
asg_files = sorted(
    f for f in os.listdir(mypath)
    if f.startswith('asg-') and f.endswith('.csv')
)
asg_files

['asg-1998-01-31.csv',
 'asg-1998-02-28.csv',
 'asg-1998-03-31.csv',
 'asg-1998-04-30.csv',
 'asg-1998-05-31.csv',
 'asg-1998-06-30.csv',
 'asg-1998-07-31.csv',
 'asg-1998-08-31.csv',
 'asg-1998-09-30.csv',
 'asg-1998-10-31.csv',
 'asg-1998-11-30.csv',
 'asg-1998-12-31.csv',
 'asg-1999-01-31.csv',
 'asg-1999-02-28.csv',
 'asg-1999-03-31.csv',
 'asg-1999-04-30.csv',
 'asg-1999-05-31.csv',
 'asg-1999-06-30.csv',
 'asg-1999-07-31.csv',
 'asg-1999-08-31.csv',
 'asg-1999-09-30.csv',
 'asg-1999-10-31.csv',
 'asg-1999-11-30.csv',
 'asg-1999-12-31.csv',
 'asg-2000-01-31.csv',
 'asg-2000-02-29.csv',
 'asg-2000-03-31.csv',
 'asg-2000-04-30.csv',
 'asg-2000-05-31.csv',
 'asg-2000-06-30.csv',
 'asg-2000-07-31.csv',
 'asg-2000-08-31.csv',
 'asg-2000-09-30.csv',
 'asg-2000-10-31.csv',
 'asg-2000-11-30.csv',
 'asg-2000-12-31.csv',
 'asg-2001-01-31.csv',
 'asg-2001-02-28.csv',
 'asg-2001-03-31.csv',
 'asg-2001-04-30.csv',
 'asg-2001-05-31.csv',
 'asg-2001-06-30.csv',
 'asg-2001-07-31.csv',
 'asg-2001-

In [3]:
# Month-level labels: drop the last 7 characters of every file name
# (the '-DD.csv' tail of names such as 'asg-1998-01-31.csv').
asg_files_months = list(map(lambda fname: fname[:-7], asg_files))
asg_files_months

['asg-1998-01',
 'asg-1998-02',
 'asg-1998-03',
 'asg-1998-04',
 'asg-1998-05',
 'asg-1998-06',
 'asg-1998-07',
 'asg-1998-08',
 'asg-1998-09',
 'asg-1998-10',
 'asg-1998-11',
 'asg-1998-12',
 'asg-1999-01',
 'asg-1999-02',
 'asg-1999-03',
 'asg-1999-04',
 'asg-1999-05',
 'asg-1999-06',
 'asg-1999-07',
 'asg-1999-08',
 'asg-1999-09',
 'asg-1999-10',
 'asg-1999-11',
 'asg-1999-12',
 'asg-2000-01',
 'asg-2000-02',
 'asg-2000-03',
 'asg-2000-04',
 'asg-2000-05',
 'asg-2000-06',
 'asg-2000-07',
 'asg-2000-08',
 'asg-2000-09',
 'asg-2000-10',
 'asg-2000-11',
 'asg-2000-12',
 'asg-2001-01',
 'asg-2001-02',
 'asg-2001-03',
 'asg-2001-04',
 'asg-2001-05',
 'asg-2001-06',
 'asg-2001-07',
 'asg-2001-08',
 'asg-2001-09',
 'asg-2001-10',
 'asg-2001-11',
 'asg-2001-12',
 'asg-2002-01',
 'asg-2002-02',
 'asg-2002-03',
 'asg-2002-04',
 'asg-2002-05',
 'asg-2002-06',
 'asg-2002-08',
 'asg-2002-09',
 'asg-2002-10',
 'asg-2002-11',
 'asg-2002-12',
 'asg-2003-01',
 'asg-2003-02',
 'asg-2003-03',
 'asg-20

In [4]:
# Working read call for a single monthly file: the separator is '|', the text
# is decoded as ISO-8859-1 and pandas' low_memory mode is switched off.

df_asg_2023_01_31 = pd.read_csv(
    mypath + 'asg-2023-01-31.csv',
    sep='|',
    encoding="ISO-8859-1",
    low_memory=False,
)
df_asg_2023_01_31.head()

Unnamed: 0,cve_delegacion,cve_subdelegacion,cve_entidad,cve_municipio,sector_economico_1,sector_economico_2,sector_economico_4,tamaño_patron,sexo,rango_edad,...,ta_sal,teu_sal,tec_sal,tpu_sal,tpc_sal,masa_sal_ta,masa_sal_teu,masa_sal_tec,masa_sal_tpu,masa_sal_tpc
0,1,1,1,A01,,,,,1,E1,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
1,1,1,1,A01,,,,,1,E10,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
2,1,1,1,A01,,,,,1,E11,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
3,1,1,1,A01,,,,,1,E12,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
4,1,1,1,A01,,,,,1,E13,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Filter the January 2023 data down to CDMX and split it into north and south.

# Rows with cve_entidad == 9 are the CDMX rows.
df_asg_2023_01_31_CDMX = df_asg_2023_01_31.loc[df_asg_2023_01_31['cve_entidad'] == 9]

# Within CDMX, cve_delegacion 39 is kept as "norte" and 40 as "sur".
# The north frame keeps the row labels of the source frame.
df_asg_2023_01_31_CDMX_norte = df_asg_2023_01_31_CDMX[
    df_asg_2023_01_31_CDMX['cve_delegacion'] == 39
]

# reset_index(drop=True) renumbers the south rows from 0 and discards the old
# labels directly, instead of first turning them into an 'index' column that
# then has to be dropped (which would remove a real column named 'index').
df_asg_2023_01_31_CDMX_sur = df_asg_2023_01_31_CDMX[
    df_asg_2023_01_31_CDMX['cve_delegacion'] == 40
].reset_index(drop=True)
df_asg_2023_01_31_CDMX_sur.head()

Unnamed: 0,cve_delegacion,cve_subdelegacion,cve_entidad,cve_municipio,sector_economico_1,sector_economico_2,sector_economico_4,tamaño_patron,sexo,rango_edad,...,ta_sal,teu_sal,tec_sal,tpu_sal,tpc_sal,masa_sal_ta,masa_sal_teu,masa_sal_tec,masa_sal_tpu,masa_sal_tpc
0,40,1,9,,,,,,1,E1,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
1,40,1,9,,,,,,1,E10,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
2,40,1,9,,,,,,1,E11,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
3,40,1,9,,,,,,1,E12,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
4,40,1,9,,,,,,1,E13,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Repeat the read-and-filter step for the selected source files and write the
# CDMX-only rows to disk. Prints the elapsed seconds per file and in total.
import time

# perf_counter is a monotonic clock, so the measured durations are not
# affected by system clock adjustments during a long run.
t00 = time.perf_counter()
write_dir = parentdir + '/IMSS_CDMX/'
# DataFrame.to_csv does not create missing directories; make sure the output
# folder exists before the first write.
os.makedirs(write_dir, exist_ok=True)

# NOTE(review): this slice restricts the run to the single file at position 54.
# Widen it (or iterate over asg_files directly) to process the whole folder.
for asg_data in asg_files[54:55]:
    t01 = time.perf_counter()

    print('Procesando:', asg_data)
    df_asg = pd.read_csv(mypath + asg_data, encoding="ISO-8859-1", sep='|', low_memory=False)

    # Keep only the rows with cve_entidad == 9 (CDMX) and renumber them from 0.
    # No split by cve_delegacion is done here; all CDMX rows go into one file.
    df_asg_CDMX = df_asg.loc[df_asg['cve_entidad'] == 9].reset_index(drop=True)

    # Output name: the source name without its last 7 characters, plus
    # '_CDMX.csv'. The row index is written as the first column.
    df_asg_CDMX.to_csv(write_dir + asg_data[:-7] + '_CDMX.csv')

    t11 = time.perf_counter()
    total11 = t11 - t01
    print(total11)

t10 = time.perf_counter()
total00 = t10 - t00
print(total00)

Procesando: asg-2002-07-31.csv
12.537874221801758
12.538993835449219


In [23]:
# Display the first 96 file names.
asg_files[:96]

['asg-1998-01-31.csv',
 'asg-1998-02-28.csv',
 'asg-1998-03-31.csv',
 'asg-1998-04-30.csv',
 'asg-1998-05-31.csv',
 'asg-1998-06-30.csv',
 'asg-1998-07-31.csv',
 'asg-1998-08-31.csv',
 'asg-1998-09-30.csv',
 'asg-1998-10-31.csv',
 'asg-1998-11-30.csv',
 'asg-1998-12-31.csv',
 'asg-1999-01-31.csv',
 'asg-1999-02-28.csv',
 'asg-1999-03-31.csv',
 'asg-1999-04-30.csv',
 'asg-1999-05-31.csv',
 'asg-1999-06-30.csv',
 'asg-1999-07-31.csv',
 'asg-1999-08-31.csv',
 'asg-1999-09-30.csv',
 'asg-1999-10-31.csv',
 'asg-1999-11-30.csv',
 'asg-1999-12-31.csv',
 'asg-2000-01-31.csv',
 'asg-2000-02-29.csv',
 'asg-2000-03-31.csv',
 'asg-2000-04-30.csv',
 'asg-2000-05-31.csv',
 'asg-2000-06-30.csv',
 'asg-2000-07-31.csv',
 'asg-2000-08-31.csv',
 'asg-2000-09-30.csv',
 'asg-2000-10-31.csv',
 'asg-2000-11-30.csv',
 'asg-2000-12-31.csv',
 'asg-2001-01-31.csv',
 'asg-2001-02-28.csv',
 'asg-2001-03-31.csv',
 'asg-2001-04-30.csv',
 'asg-2001-05-31.csv',
 'asg-2001-06-30.csv',
 'asg-2001-07-31.csv',
 'asg-2001-

In [25]:
# Display the one-element slice starting at position 54, i.e. the same slice
# the processing loop iterates over.
asg_files[54:55]

['asg-2002-07-31.csv']