# Generate `Objeto de Gasto` catalog

## File ingestion

In [25]:
from pandas import ExcelFile, read_excel, DataFrame
from slugify import slugify

In [22]:
cd /home/loic/repos/mexico

/home/loic/repos/mexico


In [24]:
catalog_file = ExcelFile('objeto_del_gasto.split.xlsx')

In [80]:
catalog = {}

for sheet in catalog_file.sheet_names:
    name = slugify(sheet, separator='_')
    catalog[name] = catalog_file.parse(sheet).dropna()
    message = 'Loaded sheet {sheet} into "{name}" ({nb} lines)'
    parameters = dict(sheet=sheet, name=name, nb=len(catalog[name]))
    print(message.format(**parameters))
    print('Columns =', list(catalog[name].columns))

Loaded sheet Concatenated into "concatenated" (460 lines)
Columns = ['CAPITULO', 'CONCEPTO', 'PARTIDA_GENERICA', 'PARTIDA_ESPECIFICA', 'DESCRIPCION']
Loaded sheet CAPITULO into "capitulo" (9 lines)
Columns = ['CAPITULO', 'DESCRIPCION']
Loaded sheet CONCEPTO into "concepto" (88 lines)
Columns = ['CONCEPTO', 'DESCRIPCION']
Loaded sheet PARTIDA GENERICA into "partida_generica" (351 lines)
Columns = ['PARTIDA_GENERICA', 'DESCRIPCION']
Loaded sheet PARTIDA ESPECÍFICA into "partida_especifica" (460 lines)
Columns = ['PARTIDA_ESPECIFICA', 'DESCRIPCION']


## Quality assurance

The "Concatenated" and the "PARTIDA ESPECÍFICA" sheets must be the same.

In [61]:
catalog['partida_especifica']['PARTIDA_ESPECIFICA'] =  catalog['partida_especifica']['PARTIDA_ESPECIFICA'].astype(int)
especifica_1 = catalog['partida_especifica'].set_index('PARTIDA_ESPECIFICA').sort_index()
print(especifica_1.info())
especifica_1.head(n=5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 460 entries, 11101 to 99101
Data columns (total 1 columns):
DESCRIPCION    460 non-null object
dtypes: object(1)
memory usage: 7.2+ KB
None


Unnamed: 0_level_0,DESCRIPCION
PARTIDA_ESPECIFICA,Unnamed: 1_level_1
11101,(Derogada)
11201,Haberes
11301,Sueldos base
11401,Retribuciones por adscripción en el extranjero
12101,Honorarios


In [65]:
catalog['concatenated']['PARTIDA_ESPECIFICA'] =  catalog['concatenated']['PARTIDA_ESPECIFICA'].astype(int)
especifica_2 = catalog['concatenated'][['PARTIDA_ESPECIFICA', 'DESCRIPCION']].set_index('PARTIDA_ESPECIFICA').sort_index()
print(especifica_2.info())
especifica_2.head(n=5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 460 entries, 11101 to 99101
Data columns (total 1 columns):
DESCRIPCION    460 non-null object
dtypes: object(1)
memory usage: 7.2+ KB
None


Unnamed: 0_level_0,DESCRIPCION
PARTIDA_ESPECIFICA,Unnamed: 1_level_1
11101,(Derogada)
11201,Haberes
11301,Sueldos base
11401,Retribuciones por adscripción en el extranjero
12101,Honorarios


In [67]:
comparaison = especifica_1 == especifica_2

In [69]:
comparaison.all()

DESCRIPCION    True
dtype: bool

Okay, everything seems to be coherent, after I get rid of the hidden rows in the `Concatenated` sheet. 

## Lookup table

Now I need to generate lookup tables for `CAPITULO`, `CONCEPTO`, `PARTIDA_GENERICA` and `PARTIDA_ESPECIFICA`, or rather a lookup function which gives me the description from the ID. It's as simple as using the dataframe indexing, like so:

In [76]:
especifica_1.loc[11101]

DESCRIPCION    (Derogada)
Name: 11101, dtype: object

In [98]:
from os.path import join

def generate_catalog(file):
    
    new_columns = {}
    catalog_file = ExcelFile(file)
    INDEX_COLUMN = 0
    
    for sheet in catalog_file.sheet_names:
        if sheet != 'Concatenated':
            name = slugify(sheet, separator='_')
            output = join('..', 'objeto_del_gasto.catalog', name + '.csv')

            df = catalog_file.parse(sheet).dropna()
            index = df.columns[INDEX_COLUMN]

            df[index] =  df[index].astype(int)
            df.set_index(index, inplace=True)
            df.sort_index(inplace=True)
            
            new_columns[name] = df
            df.to_csv(output)
            
            message = 'Loaded sheet {sheet} into "{name}" ({nb} lines)'
            parameters = dict(sheet=sheet, name=name, nb=len(catalog[name]))

            print(message.format(**parameters))
            print('Columns =', list(catalog[name].columns))
            print('Saved to', output)
            
    return new_columns

In [99]:
catalog = generate_catalog('objeto_del_gasto.catalog.xlsx')

Loaded sheet CAPITULO into "capitulo" (9 lines)
Columns = ['DESCRIPCION']
Saved to ../objeto_del_gasto.catalog/capitulo.csv
Loaded sheet CONCEPTO into "concepto" (88 lines)
Columns = ['DESCRIPCION']
Saved to ../objeto_del_gasto.catalog/concepto.csv
Loaded sheet PARTIDA GENERICA into "partida_generica" (351 lines)
Columns = ['DESCRIPCION']
Saved to ../objeto_del_gasto.catalog/partida_generica.csv
Loaded sheet PARTIDA ESPECÍFICA into "partida_especifica" (460 lines)
Columns = ['DESCRIPCION']
Saved to ../objeto_del_gasto.catalog/partida_especifica.csv


In [94]:
catalog['capitulo'].loc[1000]

DESCRIPCION    Servicios personales
Name: 1000, dtype: object

In [115]:
catalog['partida_especifica'].loc[21101]

DESCRIPCION    Materiales y útiles de oficina
Name: 21101, dtype: object

## Read pre-processed catalog files

In [111]:
from pandas import read_csv
from os import listdir
from os.path import join

def load_catalogs(folder):
    
    catalogs = {}
    files = listdir(folder)
    
    for file in files:
        name = file.split('.')[0]
        print('Loading', name)
        filepath = join(folder, file)
        
        catalogs[name] = read_csv(filepath)
        index_column = catalogs[name].columns[0]
        catalogs[name].set_index(index_column, inplace=True)
    
        print(catalogs[name].info(), '\n')
    
    return catalogs

catalogs = load_catalogs('objeto_del_gasto.catalog')

Loading partida_generica
<class 'pandas.core.frame.DataFrame'>
Int64Index: 351 entries, 111 to 991
Data columns (total 1 columns):
DESCRIPCION    351 non-null object
dtypes: object(1)
memory usage: 5.5+ KB
None 

Loading capitulo
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 1000 to 9000
Data columns (total 1 columns):
DESCRIPCION    9 non-null object
dtypes: object(1)
memory usage: 144.0+ bytes
None 

Loading concepto
<class 'pandas.core.frame.DataFrame'>
Int64Index: 88 entries, 1100 to 9900
Data columns (total 1 columns):
DESCRIPCION    88 non-null object
dtypes: object(1)
memory usage: 1.4+ KB
None 

Loading partida_especifica
<class 'pandas.core.frame.DataFrame'>
Int64Index: 460 entries, 11101 to 99101
Data columns (total 1 columns):
DESCRIPCION    460 non-null object
dtypes: object(1)
memory usage: 7.2+ KB
None 



In [114]:
catalogs['capitulo'].loc[2000]

DESCRIPCION    Materiales y suministros
Name: 2000, dtype: object