# Mexican federal budget pre-processing pipeline

## Instructions

To you run the notebook:

1. choose a unique `ITERATION_LABEL` for each pipeline run
2. specify and describe your input files (`INPUT_FILES`)
3. make sure your column mapping (`COLUMN_ALIASES`) is correct
3. run the whole notebook by clicking on __Kernel > Restart & Run All__

## Imports

In [1]:
from sys import stdout
from pandas import read_csv, concat, DataFrame, ExcelWriter, ExcelFile, Series
from numpy import nan, isnan
from os.path import join, isdir
from os import mkdir
from json import dumps, loads
from pprint import pprint
from csv import DictReader
from shutil import copyfile

## Settings

Choose a unique iteration label for each pipeline run.

In [2]:
ITERATION_LABEL = 'iteration-20-post-launch-bug-fixes-test-run2'

Put your input files inside the `pipeline.in` folder and describe them here.

In [3]:
INPUT_FILES = {
    2008: {'name': 'Cuenta_Publica_2008.csv', 'encoding': 'windows-1252'},
    2009: {'name': 'Cuenta_Publica_2009.csv', 'encoding': 'windows-1252'},
    2010: {'name': 'Cuenta_Publica_2010.csv', 'encoding': 'windows-1252'},
    2011: {'name': 'Cuenta_Publica_2011.csv', 'encoding': 'windows-1252'},
    2012: {'name': 'Cuenta_Publica_2012.csv', 'encoding': 'windows-1252'},
    2013: {'name': 'CP_2013.csv', 'encoding': 'windows-1252'},
    2014: {'name': 'CP_2014.csv', 'encoding': 'windows-1252'},
    2015: {'name': 'CP_2015.csv', 'encoding': 'windows-1252'},
    2016: {'name': 'PEF_AC01_2t_2016.csv', 'encoding': 'windows-1252'} 
}


Define your mappings in `columns.aliases.json`.

The following hierarchical categories will have IDs prefixed with the parent categories:

In [4]:
HIERARCHIES = {
    'functional': [
        'GPO_FUNCIONAL', 
        'ID_FUNCION', 
        'ID_SUBFUNCION', 
        'ID_AI'
    ],
    'administrative': [
        'ID_RAMO',
        'ID_UR',
        'ID_PP',
        'ID_MODALIDAD'
    ]
}

The following columns are unsused and removed at the end of the pipeline:

In [5]:
REMOVE_OUTPUT_COLUMNS = [
    'Reasignacion',
    'Objeto del Gasto',
    'Descripción de Reasignacion',
    'Descripción de Objeto del Gasto'
]

In [6]:
REMOVE_INPUT_COLUMNS = {}

That's it. Now just run the notebook from beginning to end.

## Configuration

In [7]:
BASENAME = 'mexican_federal_budget'
INPUT_FOLDER = 'pipeline.in.v2'
OUTPUT_FOLDER = 'pipeline.out'
ITERATION_FOLDER = join(OUTPUT_FOLDER, ITERATION_LABEL)
MERGED_FILE = join(ITERATION_FOLDER, BASENAME + '.merged.csv')
CATALOGS_FILE = 'objeto_del_gasto.catalog.xlsx'
COLUMN_ALIASES_FILE = 'columns.aliases.json'
DATAPACKAGE_FILE = 'datapackage.default.json'
DESCRIPTIONS_FILE = 'columns.descriptions.csv'

In [8]:
if isdir(ITERATION_FOLDER):
    raise ValueError('Please enter a unique iteration label')
    
mkdir(ITERATION_FOLDER)

## Encoding inspection

Detect the file encodings of the input files using the `cChardet` utility library. __Warning:__ it's not always accurate. This is meant only as an indication only. In the end, encodings will be taken from `INPUT_FILES`.

In [9]:
def detect_encodings():
    """Detect CSV file encoding with the cChardet library"""

    try:
        import cchardet as chardet
    except ImportError:
        cChardet = 'https://github.com/PyYoshi/cChardet'
        print('Encoding inspection skipped: install %s', cChardet)
        return

    results = {}
    results_file = join(OUTPUT_FOLDER, ITERATION_LABEL, 'encodings.detected.json')
    
    for year, file in sorted(INPUT_FILES.items()):
        datafile = join(INPUT_FOLDER, file['name'])
        
        with open(datafile, 'rb') as f:
            text = f.read()
            
        result = chardet.detect(text)
        results.update({year: result})
        print(year, 'Inspected', file['name'], result)
    
    with open(results_file, 'w+') as json:
        json.write(dumps(results, indent=4))
        print('\nSaved encoding detection report to', results_file)
        
# detect_encodings()

## Load files

In [10]:
def read_columns(file, encoding):
    """Return clean CSV file headers"""
    
    with open(file, encoding=encoding) as csv:
        header = csv.readline()
        return header.replace('\n', '').split(',')

In [11]:
def force_strings(columns):
    """Return string enforcement for each column of a CSV file"""
    
    for column in columns:
        yield column, str

In [12]:
def load_csv_files():
    """Load raw data (CSV) files"""
    
    batch = {}
    
    for year, file in sorted(INPUT_FILES.items()):
        filepath = join(INPUT_FOLDER, file['name'])
        column_names = read_columns(filepath, file['encoding'])
        column_types = dict(force_strings(column_names))
        
        batch[year] = read_csv(filepath, encoding=file['encoding'], dtype=column_types)
        print('Loaded', file['name'], 'with encoding', file['encoding'])
    
    print()
    stdout.flush()

    for year in sorted(INPUT_FILES.keys()):
        if year in REMOVE_INPUT_COLUMNS:
            for column in REMOVE_INPUT_COLUMNS[year]:
                try:
                    del batch[year][column]
                    print(year, 'deleted', column)
                except KeyError:
                    print(year, column, 'not found in', file['name'])

        stdout.flush()

    return batch

In [13]:
with open(COLUMN_ALIASES_FILE) as json:
    COLUMN_ALIASES = loads(json.read())
    
with open(DATAPACKAGE_FILE) as json:
    DATAPACKAGE = loads(json.read())

def update_datapackage():
    with open(DESCRIPTIONS_FILE) as stream:
        rows = DictReader(stream)
        reference = {row.pop('Backend Header'): row for row in rows}
   
    fields = DATAPACKAGE['resources'][0]['schema']['fields']
    field_names = []
    
    for field in fields:
        field_name = field['name']
        field['title'] = reference[field_name]['Concept']
        field['description'] = reference[field_name]['Description']
        field_names.append(field_name)
        print('updated datapackage title and description for %s' % field_name)
        
    not_matched = set(field_names) ^ set(reference.keys())
    if not_matched:
        raise ValueError('Fields in {} do not match fields in {}: {}'.format(
            DATAPACKAGE_FILE, 
            DESCRIPTIONS_FILE,
            not_matched))
        
    not_matched = set(field_names) ^ set(COLUMN_ALIASES.keys())
    if not_matched:
        raise ValueError('Fields in {} do not match fields in {}: {}'.format(
            DATAPACKAGE_FILE, 
            DESCRIPTIONS_FILE,
            not_matched))
        
    with open(DATAPACKAGE_FILE, 'w+') as stream:
        stream.write(dumps(DATAPACKAGE, indent=2, ensure_ascii=True))
        
    copyfile(DATAPACKAGE_FILE, join(ITERATION_FOLDER, DATAPACKAGE_FILE.replace('.default', '')))
        
    print('checked columns consistency for {}, {} and {}: okay!'.format(
            DATAPACKAGE_FILE, COLUMN_ALIASES_FILE, DESCRIPTIONS_FILE))
    print('copied', DATAPACKAGE_FILE)

## Clean the data

In [14]:
def strip_cell_padding(batch):
    for year in sorted(batch.keys()):
        for column in batch[year].columns:
            batch[year].rename(columns={column: column.strip()}, inplace=True)
            batch[year][column] = batch[year][column].apply(lambda x: x.strip() if x is not nan else x)
        print(year, 'stripped cell paddings')
        stdout.flush()

In [15]:
def delete_empty_columns(batch):
    for year in batch.keys():
        for column in batch[year].columns:
            if 'Unnamed:' in column:
                try:
                    del batch[year][column]
                    print(year, column, 'deleted')
                    stdout.flush()
                except KeyError:
                    pass  

In [16]:
def count_missing_values(batch):
    collector = {}
    table = []

    for column in get_union_of_columns(batch):
        row = {'Column': column}
        collector.update({column: []})
        
        for year in batch.keys():
            if column in batch[year].columns:
                is_empty = batch[year][column].isnull()
                empty_lines = batch[year].where(is_empty).dropna(how='all')
                collector[column].extend(empty_lines.to_dict(orient='records'))
                nb_empty_cells = len(empty_lines)
            else:
                nb_empty_cells = nan
                
            row.update({year: nb_empty_cells})
            if nb_empty_cells not in (nan, 0):
                print(year, 'found', nb_empty_cells, 'missing values in', column)

        table.append(row)
        
    ordered_columns = ['Column']
    ordered_columns.extend(sorted(batch.keys()))
    empty_values_overview_table = DataFrame(table).reindex_axis(ordered_columns, axis=1)
    
    return empty_values_overview_table, collector

In [17]:
def count_duplicates(batch):
    for year, df in sorted(batch.items()):
        nb_duplicate_lines = df.duplicated().apply(lambda x: 1 if x is True else 0).sum()
        print(year, 'found', nb_duplicate_lines, 'duplicate lines')

## Alias column names

In [18]:
def get_union_of_columns(batch):
    union = set()
    for year in batch.keys():
        union = union | set(batch[year].columns)
    return union

In [19]:
from yaml import load

def load_aliases(file):
    with open(file) as yaml:
        aliases = load(yaml.read())
        return aliases

In [20]:
def map_columns_to_aliases(batch, list_of_aliases):
    for year in sorted(batch.keys()):
        for column in sorted(batch[year].columns):
            if not column in list_of_aliases:
                for reference, aliases in list_of_aliases.items():
                    if aliases:
                        if column in aliases:
                            batch[year].rename(columns={column: reference}, inplace=True)
                            print(year, column, 'replaced with', reference)
                            stdout.flush()
                            break  
                else:
                    print(year, 'NO ALIAS REGISTERED FOR', column)
                    stdout.flush()

In [21]:
def build_overview(batch):
    table = []
    
    for column in get_union_of_columns(batch):
        row = {'Column': column}
        for year in batch.keys():
            row.update({year: column in batch[year].columns})
        table.append(row)
        
    ordered_columns = ['Column']
    ordered_columns.extend(sorted(batch.keys()))
    
    overview = DataFrame(table).reindex_axis(ordered_columns, axis=1)
    print('Column mapping overview: done')
    return overview

## Check expenditure sums

There's a little cleaning to do on the amount columns (zeros represented by a dash). Assume thousands are seperated by a comma.

In [22]:
EXPENDITURE_COLUMNS = [
    'MONTO_EJERCIDO', 
    'MONTO_DEVENGADO', 
    'MONTO_APROBADO', 
    'MONTO_PAGADO', 
    'MONTO_MODIFICADO', 
    'MONTO_ADEFAS', 
    'MONTO_EJERCICIO'
]
count = 0

def clean_expenditure_columns(batch):
    check_sums = []

    for column in EXPENDITURE_COLUMNS:
        row = {'Column': column}
        
        for year in sorted(batch.keys()):
            try:
                series = batch[year][column]
                
                # I'm assuming a single '-' represents zero
                series = series.apply(lambda x: '0' if x == '-' else x)
                try:
                    series = series.apply(lambda x: x.replace(',', '') if x is not nan else x)    
                except AttributeError:
                    if count < 10:
                        print(year, column)
                batch[year][column] = series.astype(float)
                check_sum = batch[year][column].sum()
                
                print(year, 'cleaned and summed', column, '=', check_sum, 'pesos')
                
            except KeyError:
                check_sum = nan
                
            row.update({year: check_sum})
        
        check_sums.append(row)

    ordered_columns = ['Column']
    ordered_columns.extend(sorted(batch.keys()))
    return DataFrame(check_sums).reindex_axis(ordered_columns, axis=1)    

## Objeto del Gasto Column split

In [23]:
from os.path import join

def generate_catalog(file):
    
    catalog_ = {}
    catalog_file = ExcelFile(file)
    INDEX_COLUMN = 0
    
    for sheet in catalog_file.sheet_names:
        if sheet != 'Concatenated':
            name = sheet.lower().replace(' ', '_')
            output = join('objeto_del_gasto.catalog', name + '.csv')

            df = catalog_file.parse(sheet).dropna()
            index = df.columns[INDEX_COLUMN]

            df[index] =  df[index].astype(str)
            df.set_index(index, inplace=True)
            df = df.groupby(df.index).first()
            df.sort_index(inplace=True)
            
            message = 'Loaded catalog {sheet} into "{name}" ({nb} lines)'
            parameters = dict(sheet=sheet, name=name, nb=len(df))

            print(message.format(**parameters))
            catalog_[name] = df['DESCRIPCION']
    
    print()
    return catalog_

__Note!__ Years are hard coded in the script below.

In [24]:
def split_objeto_del_gasto(batch):
    catalog = generate_catalog(CATALOGS_FILE)
    missing_in_catalog = []
    
    def has_digits(n, N):
        return not isinstance(n, float) and len(n) >= N 
            
    def lookup(n, table, year):
        try:
            return catalog[table].loc[n]
        except KeyError:
            missing_in_catalog.append({'year': year, 'table': table, 'ID': n})
            return nan
        except TypeError:
            # n is nan
            return nan
    
    for year in sorted(batch.keys()):
        if year == 2016:
            print('Skipping', year, 'because the raw CSV already has the required columns')
        
        else:
            objeto = batch[year]['ID_CONCEPTO'].astype(str)

            batch[year]['ID_CAPITULO'] = objeto.apply(lambda x: x[0] + '000' if x not in (nan, 'nan') else nan)
            batch[year]['ID_CONCEPTO'] = objeto.apply(lambda x: x[:2] + '00' if x not in (nan, 'nan') else nan)
            batch[year]['DESC_CAPITULO'] = batch[year]['ID_CAPITULO'].map(lambda x: lookup(x, 'capitulo', year))  
            batch[year]['DESC_CONCEPTO'] = batch[year]['ID_CONCEPTO'].map(lambda x: lookup(x, 'concepto', year))  
            
            nb_generica_digits = 4 if year in (2008, 2009, 2010) else 3
            
            # Skip the LAST year of the dataset (currently 2016) it has split columns already
            batch[year]['ID_PARTIDA_GENERICA'] = objeto.apply(lambda x: x[:nb_generica_digits] if has_digits(x, 4) else nan)
            batch[year]['DESC_PARTIDA_GENERICA'] = batch[year]['ID_PARTIDA_GENERICA'].map(lambda x: lookup(x, 'partida_generica', year))  
            
            if year not in (2008, 2009, 2010):
                batch[year]['ID_PARTIDA_ESPECIFICA'] = objeto.apply(lambda x: x if has_digits(x, 5) else nan)
                batch[year]['DESC_PARTIDA_ESPECIFICA'] = batch[year]['ID_PARTIDA_ESPECIFICA'].map(lambda x: lookup(x, 'partida_específica', year) if has_digits(x, 5) else nan)  
            else:
                batch[year]['ID_PARTIDA_ESPECIFICA'] = nan
                batch[year]['DESC_PARTIDA_ESPECIFICA'] = nan

            print(year, 'broke down "Objeto del Gasto" column')
        
    return DataFrame(missing_in_catalog).drop_duplicates(['ID', 'table'])

## Prefix IDs 
Disambiguating sub-categories may require prefixing their IDs with their parents' IDs.

In [25]:
def prefix_ids(batch):
    for year in batch.keys():       
        for hierarchy, levels in HIERARCHIES.items():
            prefix = batch[year]['CICLO'].apply(lambda x: '')
            for n, level in enumerate(levels):
                dash = '.' if n > 0 else ''
                prefix = prefix + dash + batch[year][level]  
                batch[year][level] = prefix
                
                print(year, 'prefixed', hierarchy, 'level', n, level)
                stdout.flush()

## Remove unused columns

In [26]:
def remove_unused_columns(batch):
    for year, budget in batch.items():
        for column in REMOVE_OUTPUT_COLUMNS:
            try:
                del budget[column]
                print(year, 'deleted', column)
            except KeyError:
                print(column, ': no such column to delete')

## Add descriptions and titles to datapackage.json

##  Pipeline

In [27]:
def do_pipeline():

    def echo_section(section):
        print('\n', section, '\n')

    echo_section('Update datapackage.json')
    update_datapackage()

    echo_section('Loading files')
    datasets = load_csv_files()
    
    echo_section('Delete empty columns')
    delete_empty_columns(datasets)

    echo_section('Stripping padding from cells')
    strip_cell_padding(datasets)
    
    echo_section('Counting duplicate lines (NOT de-duplicating)')
    count_duplicates(datasets)
    
    echo_section('Mapping column to aliases')
    map_columns_to_aliases(datasets, COLUMN_ALIASES)

    echo_section('Counting missing values')
    missing_values_report, bad_records = count_missing_values(datasets)
    
    echo_section('Building column mapping overview')
    column_mapping_report = build_overview(datasets)
    
    echo_section('Cleaning expenditure columns')
    sums_report = clean_expenditure_columns(datasets)
    
    echo_section('Breaking down Objeto del Gasto column')
    missing_catalog_ids = split_objeto_del_gasto(datasets)
        
    echo_section('Prefixing IDs of certain category hierarchies')
    prefix_ids(datasets)

    echo_section('Removing unused columns')
    remove_unused_columns(datasets)

    echo_section('Saving pipeline configuration')

    reports_file = join(ITERATION_FOLDER, BASENAME + '.reports.xlsx')
    writer = ExcelWriter(reports_file)    
    missing_values_report.to_excel(writer, 'missing values', encoding='utf-8', index=False)
    column_mapping_report.to_excel(writer, 'column mapping', encoding='utf-8', index=False)
    sums_report.to_excel(writer, 'check sums', encoding='utf-8', index=False)
    missing_catalog_ids.to_excel(writer, 'missing_catalog_IDs', encoding='utf-8', index=False)    
    print('Saved 4 reports to', reports_file)    

    aliases_file = join(ITERATION_FOLDER, BASENAME + '.aliases.json')
    inputs_file = join(ITERATION_FOLDER, BASENAME + '.inputs.json')
    levels_file = join(ITERATION_FOLDER, BASENAME + '.levels.json')
    bad_records_file = join(ITERATION_FOLDER, BASENAME + '.missing.json')

    with open(bad_records_file, 'w+') as json:
        json.write(dumps(bad_records, indent=4))
        
    with open(aliases_file, 'w+') as json:
        json.write(dumps(COLUMN_ALIASES, indent=4))
        
    with open(levels_file, 'w+') as json:
        json.write(dumps(HIERARCHIES, indent=4))
        
    with open(inputs_file, 'w+') as json:
        json.write(dumps(INPUT_FILES, indent=4))
    
    print('Saved input configuration to', inputs_file)    
    print('Saved column aliases to', aliases_file) 
    print('Saved bad records (those with empty cells) to', bad_records_file)    
    print('Saved hierarchy levels used for prefixing to', levels_file) 
    
    echo_section('Pipeline run "%s" done' % ITERATION_LABEL)

    return datasets, missing_catalog_ids, column_mapping_report, missing_values_report, sums_report

## Run the pipeline

In [28]:
budgets, missing_ids, column_mapping, missing_values, sums = do_pipeline()


 Update datapackage.json 

updated datapackage title and description for CICLO
updated datapackage title and description for DESC_AI
updated datapackage title and description for DESC_CAPITULO
updated datapackage title and description for DESC_CONCEPTO
updated datapackage title and description for ENTIDAD_FEDERATIVA
updated datapackage title and description for DESC_FF
updated datapackage title and description for DESC_FUNCION
updated datapackage title and description for DESC_GPO_FUNCIONAL
updated datapackage title and description for DESC_MODALIDAD
updated datapackage title and description for DESC_PARTIDA_ESPECIFICA
updated datapackage title and description for DESC_PARTIDA_GENERICA
updated datapackage title and description for DESC_PP
updated datapackage title and description for DESC_RAMO
updated datapackage title and description for DESC_SUBFUNCION
updated datapackage title and description for DESC_TIPOGASTO
updated datapackage title and description for DESC_UR
updated datapacka

In [29]:
from gc import collect
collect()

3033

In [30]:
for year, budget in budgets.items():
    filepath = MERGED_FILE.replace('merged', str(year))
    budget.to_csv(filepath, encoding='utf-8', index=False)
    print('Saved', filepath)
    stdout.flush()

Saved pipeline.out/iteration-20-post-launch-bug-fixes-test-run2/mexican_federal_budget.2016.csv
Saved pipeline.out/iteration-20-post-launch-bug-fixes-test-run2/mexican_federal_budget.2008.csv
Saved pipeline.out/iteration-20-post-launch-bug-fixes-test-run2/mexican_federal_budget.2009.csv
Saved pipeline.out/iteration-20-post-launch-bug-fixes-test-run2/mexican_federal_budget.2010.csv
Saved pipeline.out/iteration-20-post-launch-bug-fixes-test-run2/mexican_federal_budget.2011.csv
Saved pipeline.out/iteration-20-post-launch-bug-fixes-test-run2/mexican_federal_budget.2012.csv
Saved pipeline.out/iteration-20-post-launch-bug-fixes-test-run2/mexican_federal_budget.2013.csv
Saved pipeline.out/iteration-20-post-launch-bug-fixes-test-run2/mexican_federal_budget.2014.csv
Saved pipeline.out/iteration-20-post-launch-bug-fixes-test-run2/mexican_federal_budget.2015.csv


In [31]:
merged = concat(list(budgets.values()))
merged.to_csv(MERGED_FILE, encoding='utf-8', index=False)
print('Saved merged dataset to', MERGED_FILE)    

Saved merged dataset to pipeline.out/iteration-20-post-launch-bug-fixes-test-run2/mexican_federal_budget.merged.csv


## Quality control

In [32]:
sorted(list(budget.columns))

['CICLO',
 'DESC_AI',
 'DESC_CAPITULO',
 'DESC_CONCEPTO',
 'DESC_FF',
 'DESC_FUNCION',
 'DESC_GPO_FUNCIONAL',
 'DESC_MODALIDAD',
 'DESC_PARTIDA_ESPECIFICA',
 'DESC_PARTIDA_GENERICA',
 'DESC_PP',
 'DESC_RAMO',
 'DESC_SUBFUNCION',
 'DESC_TIPOGASTO',
 'DESC_UR',
 'ENTIDAD_FEDERATIVA',
 'GPO_FUNCIONAL',
 'ID_AI',
 'ID_CAPITULO',
 'ID_CLAVE_CARTERA',
 'ID_CONCEPTO',
 'ID_ENTIDAD_FEDERATIVA',
 'ID_FF',
 'ID_FUNCION',
 'ID_MODALIDAD',
 'ID_PARTIDA_ESPECIFICA',
 'ID_PARTIDA_GENERICA',
 'ID_PP',
 'ID_RAMO',
 'ID_SUBFUNCION',
 'ID_TIPOGASTO',
 'ID_UR',
 'MONTO_ADEFAS',
 'MONTO_APROBADO',
 'MONTO_DEVENGADO',
 'MONTO_EJERCICIO',
 'MONTO_MODIFICADO',
 'MONTO_PAGADO']

In [33]:
merged.sample(n=20)

Unnamed: 0,CICLO,DESC_AI,DESC_CAPITULO,DESC_CONCEPTO,DESC_FF,DESC_FUNCION,DESC_GPO_FUNCIONAL,DESC_MODALIDAD,DESC_PARTIDA_ESPECIFICA,DESC_PARTIDA_GENERICA,...,ID_SUBFUNCION,ID_TIPOGASTO,ID_UR,MONTO_ADEFAS,MONTO_APROBADO,MONTO_DEVENGADO,MONTO_EJERCICIO,MONTO_EJERCIDO,MONTO_MODIFICADO,MONTO_PAGADO
90027,2010,Manejo eficiente y sustentable del agua y prev...,Servicios generales,"Servicios financieros, bancarios y comerciales",Recursos fiscales,Agua Potable y Alcantarillado,Desarrollo Social,Otros Subsidios,,Otros impuestos y derechos,...,2.4.1,1,16.B32,,0.0,,,1603.0,,
198079,2015,Impartición de Justicia en materia agraria,Materiales y suministros,Materiales y articulos de construccion y de re...,Recursos fiscales,Justicia,Gobierno,Prestación de Servicios Públicos,Productos minerales no metálicos,Productos minerales no metálicos,...,1.2.1,1,31.200,0.0,1680.0,0.0,0.0,,0.0,0.0
47174,2008,Regulación eficiente de las comunicaciones y l...,Materiales y suministros,Alimentos y utensilios,Recursos fiscales,Comunicaciones y Transportes,Desarrollo Económico,Regulación y supervisión,,Productos alimenticios,...,3.1.6,1,9.647,,0.0,,,0.0,,
302094,2015,Cobertura de la atención médica preventiva,Servicios personales,Remuneraciones al personal de carácter transit...,Recursos fiscales,Salud,Desarrollo Social,Prestación de Servicios Públicos,Remuneraciones al personal eventual,Sueldos base al personal eventual,...,2.3.2,1,51.GYN,0.0,133624.0,0.0,0.0,,0.0,0.0
144799,2014,Ordenamiento y regularización de la propiedad ...,Servicios personales,Remuneraciones adicionales y especiales,Recursos fiscales,"Agropecuaria, Silvicultura, Pesca y Caza",Desarrollo Económico,"Planeación, seguimiento y evaluación de políti...",Primas de vacaciones y dominical,"Primas de vacaciones, dominical y gratificació...",...,3.2.1,1,15.145,0.0,69010.0,62613.96,62613.96,,62613.96,62613.96
17015,2010,Función pública y buen gobierno,Servicios generales,"Servicios financieros, bancarios y comerciales",Recursos fiscales,Administración Pública,Gobierno,Apoyo a la función pública y al mejoramiento d...,,Otros impuestos y derechos,...,1.8.3,1,8.136,,14850.0,,,14653.0,,
336067,2014,Cobertura de la atención médica preventiva,Servicios generales,Servicios basicos,Ingresos Propios,Salud,Desarrollo Social,Prestación de Servicios Públicos,Servicio de energía eléctrica,Energía eléctrica,...,2.3.2,1,GYN.GYN,0.0,421626.0,1072399.0,1072399.0,,1072399.0,1072399.0
160256,2015,Manejo eficiente y sustentable del agua y prev...,Servicios generales,"Servicios de instalacion, reparacion, mantenim...",Recursos fiscales,Protección Ambiental,Desarrollo Social,Regulación y supervisión,Mantenimiento y conservación de mobiliario y e...,"Instalación, reparación y mantenimiento de mob...",...,2.1.2,1,16.B00,0.0,38183.0,17980.0,17980.0,,17980.0,17980.0
228439,2015,Democracia preservada y fortalecida mediante l...,Materiales y suministros,"Herramientas, refacciones y accesorios menores",Recursos fiscales,Coordinación de la Política de Gobierno,Gobierno,Específicos,Refacciones y accesorios menores de equipo de ...,Refacciones y accesorios menores de equipo de ...,...,1.3.6,1,22.300,0.0,0.0,1452.9,1452.9,,1452.9,1452.9
55133,2014,"Carreteras eficientes, seguras y suficientes",Materiales y suministros,"Combustibles, lubricantes y aditivos",Recursos fiscales,Transporte,Desarrollo Económico,Proyectos de Inversión,"Combustibles, lubricantes y aditivos para vehí...","Combustibles, lubricantes y aditivos",...,3.5.1,3,9.630,0.0,1093500.0,2313972.38,2313972.38,,2313972.38,2313972.38


In [34]:
objeto_breakdown = [
    'CICLO', 
    'ID_CAPITULO', 
    'ID_CONCEPTO', 
    'ID_PARTIDA_ESPECIFICA', 
    'ID_PARTIDA_GENERICA'
]
merged[objeto_breakdown].sample(n=20)

Unnamed: 0,CICLO,ID_CAPITULO,ID_CONCEPTO,ID_PARTIDA_ESPECIFICA,ID_PARTIDA_GENERICA
167521,2014,3000,3300,33603.0,336
190539,2013,2000,2100,21601.0,216
333412,2015,1000,1400,14301.0,143
15201,2015,3000,3100,31801.0,318
242102,2014,3000,3500,35501.0,355
85818,2012,3000,3100,31801.0,318
22347,2015,3000,3500,35101.0,351
24269,2008,1000,1500,,1511
238499,2015,1000,1400,14101.0,141
52692,2015,2000,2200,22104.0,221


In [35]:
print('Total: missing', len(missing_ids), 'catalog IDs to breakdown the "Objeto del Gasto" column')
print('Tables:', dict(missing_ids.groupby('table').count()['ID']))
print('Years:', dict(missing_ids.groupby('year').count()['ID']))
try:
    missing_ids.sample(n=20)
except ValueError:
    pass

Total: missing 76 catalog IDs to breakdown the "Objeto del Gasto" column
Tables: {'partida_específica': 24, 'partida_generica': 45, 'concepto': 7}
Years: {2008: 48, 2009: 4, 2012: 22, 2013: 1, 2015: 1}


In [36]:
missing_ids.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76 entries, 0 to 18706
Data columns (total 3 columns):
ID       76 non-null object
table    76 non-null object
year     76 non-null int64
dtypes: int64(1), object(2)
memory usage: 2.4+ KB


In [37]:
column_mapping

Unnamed: 0,Column,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,DESC_FUNCION,True,True,True,True,True,True,True,True,True
1,MONTO_APROBADO,True,True,True,True,True,True,True,True,True
2,DESC_CAPITULO,False,False,False,False,False,False,False,False,True
3,DESC_AI,True,True,True,True,True,True,True,True,True
4,ID_RAMO,True,True,True,True,True,True,True,True,True
5,MONTO_MODIFICADO,False,False,False,False,False,False,True,True,False
6,ID_CLAVE_CARTERA,False,False,False,False,True,True,True,True,True
7,MONTO_EJERCICIO,False,False,False,False,False,True,True,True,False
8,DESC_FF,True,True,True,True,True,True,True,True,True
9,ENTIDAD_FEDERATIVA,False,False,False,False,True,True,True,True,True


In [38]:
missing_values

Unnamed: 0,Column,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,DESC_FUNCION,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,MONTO_APROBADO,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,DESC_CAPITULO,,,,,,,,,0.0
3,DESC_AI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ID_RAMO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,MONTO_MODIFICADO,,,,,,,0.0,0.0,
6,ID_CLAVE_CARTERA,,,,,0.0,0.0,0.0,0.0,0.0
7,MONTO_EJERCICIO,,,,,,0.0,0.0,0.0,
8,DESC_FF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,ENTIDAD_FEDERATIVA,,,,,172.0,0.0,0.0,0.0,0.0


In [39]:
sums

Unnamed: 0,Column,2008,2009,2010,2011,2012,2013,2014,2015,2016
0,MONTO_EJERCIDO,2576692000000.0,2296086000000.0,2474100000000.0,2717372000000.0,2896331000000.0,,,,2160939000000.0
1,MONTO_DEVENGADO,,,,,,,5076810000000.0,5508987000000.0,
2,MONTO_APROBADO,1992356000000.0,2289715000000.0,2376915000000.0,2560196000000.0,2754868000000.0,4322619000000.0,4905401000000.0,5138442000000.0,5297126000000.0
3,MONTO_PAGADO,,,,,,,4972712000000.0,5366652000000.0,2137679000000.0
4,MONTO_MODIFICADO,,,,,,,5013990000000.0,5398608000000.0,
5,MONTO_ADEFAS,,,,,,,36941610000.0,31122650000.0,
6,MONTO_EJERCICIO,,,,,,4617618000000.0,5010877000000.0,5399018000000.0,


In [40]:
merged.sample(n=20)

Unnamed: 0,CICLO,DESC_AI,DESC_CAPITULO,DESC_CONCEPTO,DESC_FF,DESC_FUNCION,DESC_GPO_FUNCIONAL,DESC_MODALIDAD,DESC_PARTIDA_ESPECIFICA,DESC_PARTIDA_GENERICA,...,ID_SUBFUNCION,ID_TIPOGASTO,ID_UR,MONTO_ADEFAS,MONTO_APROBADO,MONTO_DEVENGADO,MONTO_EJERCICIO,MONTO_EJERCIDO,MONTO_MODIFICADO,MONTO_PAGADO
23751,2016,Investigación del delito federal,Servicios generales,Servicios básicos,Recursos fiscales,Justicia,Gobierno,Prestación de Servicios Públicos,,,...,1.2.2,1,17.324,,185600.0,,,109847.6,,109847.6
8727,2008,Política de ingresos equitativa y promotora de...,Materiales y suministros,"Vestuario, blancos, prendas de protección y ar...",Recursos fiscales,Hacienda,Gobierno,"Planeación, formulación, implementación, segui...",,"Vestuario, uniformes y blancos",...,1.3.1,1,6.300,,4702.0,,,26993.0,,
28394,2013,Impulso a la reconversión productiva en materi...,Servicios personales,Seguridad social,Recursos fiscales,"Agropecuaria, Silvicultura, Pesca y Caza",Desarrollo Económico,"Planeación, seguimiento y evaluación de políti...",Cuotas para el seguro colectivo de retiro,Aportaciones para seguros,...,3.2.1,1,8.138,,95153.0,,95034.65,,,
119718,2010,Fondo de Aportaciones para la Educación Tecnol...,Participaciones y aportaciones,,Recursos fiscales,Educación,Desarrollo Social,Gasto Federalizado,,Aportaciones Federales a las entidades federat...,...,2.0.2,1,33.416,,178074.0,,,208797.0,,
275526,2015,Apoyo a la función pública y buen gobierno,Servicios personales,Seguridad social,Recursos fiscales,Coordinación de la Política de Gobierno,Gobierno,Apoyo a la función pública y al mejoramiento d...,Aportaciones al seguro de cesantía en edad ava...,Aportaciones de seguridad social,...,1.3.4,1,51.GYN,0.0,44546.0,0.0,0.0,,0.0,0.0
122755,2014,Servicios de apoyo administrativo,Servicios personales,Remuneraciones adicionales y especiales,Recursos fiscales,Salud,Desarrollo Social,Apoyo al proceso presupuestario y para mejorar...,Compensaciones por servicios eventuales,Compensaciones,...,2.3.4,1,12.NBV,0.0,339368.0,339368.0,339368.0,,339368.0,339368.0
176570,2015,Investigación del delito federal,Servicios personales,Seguridad social,Recursos fiscales,Justicia,Gobierno,Prestación de Servicios Públicos,Aportaciones al FOVISSSTE,Aportaciones a fondos de vivienda,...,1.2.2,1,17.343,0.0,363069.0,311639.53,311639.53,,311639.53,311639.53
150069,2014,Manejo eficiente y sustentable del agua y prev...,Servicios personales,Seguridad social,Recursos fiscales,"Agropecuaria, Silvicultura, Pesca y Caza",Desarrollo Económico,Regulación y supervisión,Depósitos para el ahorro solidario,Aportaciones al sistema para el retiro,...,3.2.5,1,16.B00,0.0,170706.0,170706.0,170706.0,,170706.0,170706.0
112106,2013,Servicios de apoyo administrativo,Materiales y suministros,"Herramientas, refacciones y accesorios menores",Recursos fiscales,Salud,Desarrollo Social,Apoyo al proceso presupuestario y para mejorar...,Refacciones y accesorios para equipo de cómput...,Refacciones y accesorios menores de equipo de ...,...,2.3.4,1,12.611,,12000.0,,3132.0,,,
118404,2011,Apoyo a pequeñas comunidades rurales,Servicios generales,"Servicios profesionales, cientificos, tecnicos...",Recursos fiscales,"Urbanización, Vivienda y Desarrollo Regional",Desarrollo Social,Sujetos a Reglas de Operación,Subcontratación de servicios con terceros,"Servicios profesionales, científicos y técnico...",...,2.3.3,7,20.143,,0.0,,,0.0,,


In [41]:
breakdown = [
    'CICLO', 
    'ID_CAPITULO', 
    'ID_CONCEPTO', 
    'ID_PARTIDA_GENERICA',        
    'ID_PARTIDA_ESPECIFICA', 
    'DESC_CAPITULO',
    'DESC_CONCEPTO', 
    'DESC_PARTIDA_GENERICA',
    'DESC_PARTIDA_ESPECIFICA'
]

merged[breakdown].sample(n=200)

Unnamed: 0,CICLO,ID_CAPITULO,ID_CONCEPTO,ID_PARTIDA_GENERICA,ID_PARTIDA_ESPECIFICA,DESC_CAPITULO,DESC_CONCEPTO,DESC_PARTIDA_GENERICA,DESC_PARTIDA_ESPECIFICA
235920,2013,2000,2200,221,22104,Materiales y suministros,Alimentos y utensilios,Productos alimenticios para personas,Productos alimenticios para el personal en las...
166236,2015,3000,3700,375,37501,Servicios generales,Servicios de traslado y viáticos,Viáticos en el país,Viáticos nacionales para labores en campo y de...
60437,2015,3000,3200,325,32505,Servicios generales,Servicios de arrendamiento,Arrendamiento de equipo de transporte,"Arrendamiento de vehículos terrestres, aéreos,..."
22466,2013,2000,2600,261,26103,Materiales y suministros,"Combustibles, lubricantes y aditivos","Combustibles, lubricantes y aditivos","Combustibles, lubricantes y aditivos para vehí..."
41044,2012,3000,3100,313,31301,Servicios generales,Servicios basicos,Agua,Servicio de agua
44210,2012,3000,3100,314,31401,Servicios generales,Servicios basicos,Telefonía tradicional,Servicio telefónico convencional
93598,2016,2000,2200,,,Materiales y suministros,Alimentos y utensilios,,
121252,2010,8000,8400,8403,,Participaciones y aportaciones,,Aportaciones Federales a las entidades federat...,
34175,2013,3000,3200,327,32701,Servicios generales,Servicios de arrendamiento,Arrendamiento de activos intangibles,"Patentes, derechos de autor, regalías y otros"
206076,2013,1000,1600,161,16101,Servicios personales,Previsiones,"Previsiones de carácter laboral, económica y d...",Incrementos a las percepciones


In [42]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1909649 entries, 0 to 341255
Data columns (total 39 columns):
CICLO                      object
DESC_AI                    object
DESC_CAPITULO              object
DESC_CONCEPTO              object
DESC_FF                    object
DESC_FUNCION               object
DESC_GPO_FUNCIONAL         object
DESC_MODALIDAD             object
DESC_PARTIDA_ESPECIFICA    object
DESC_PARTIDA_GENERICA      object
DESC_PP                    object
DESC_RAMO                  object
DESC_SUBFUNCION            object
DESC_TIPOGASTO             object
DESC_UR                    object
ENTIDAD_FEDERATIVA         object
GPO_FUNCIONAL              object
ID_AI                      object
ID_CAPITULO                object
ID_CLAVE_CARTERA           object
ID_CONCEPTO                object
ID_ENTIDAD_FEDERATIVA      object
ID_FF                      object
ID_FUNCION                 object
ID_MODALIDAD               object
ID_PARTIDA_ESPECIFICA      object
ID_P

In [43]:
len(merged)

1909649

In [44]:
merged.columns

Index(['CICLO', 'DESC_AI', 'DESC_CAPITULO', 'DESC_CONCEPTO', 'DESC_FF',
       'DESC_FUNCION', 'DESC_GPO_FUNCIONAL', 'DESC_MODALIDAD',
       'DESC_PARTIDA_ESPECIFICA', 'DESC_PARTIDA_GENERICA', 'DESC_PP',
       'DESC_RAMO', 'DESC_SUBFUNCION', 'DESC_TIPOGASTO', 'DESC_UR',
       'ENTIDAD_FEDERATIVA', 'GPO_FUNCIONAL', 'ID_AI', 'ID_CAPITULO',
       'ID_CLAVE_CARTERA', 'ID_CONCEPTO', 'ID_ENTIDAD_FEDERATIVA', 'ID_FF',
       'ID_FUNCION', 'ID_MODALIDAD', 'ID_PARTIDA_ESPECIFICA',
       'ID_PARTIDA_GENERICA', 'ID_PP', 'ID_RAMO', 'ID_SUBFUNCION',
       'ID_TIPOGASTO', 'ID_UR', 'MONTO_ADEFAS', 'MONTO_APROBADO',
       'MONTO_DEVENGADO', 'MONTO_EJERCICIO', 'MONTO_EJERCIDO',
       'MONTO_MODIFICADO', 'MONTO_PAGADO'],
      dtype='object')

In [45]:
merged['CICLO'].value_counts()

2015    341256
2014    340699
2013    308453
2012    272142
2009    146382
2011    142549
2008    130950
2010    129014
2016     98204
Name: CICLO, dtype: int64

In [46]:
pwd

'/home/loic/repos/mexico'

In [48]:
merged.sample(n=100000).to_csv('pipeline.out/iteration-20-post-launch-bug-fixes-test-run2/mexican_federal_budget_sample.csv', encoding='utf-8', index=False)