In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("ENV_2017.csv", encoding='latin-1',low_memory=False)

In [3]:
df.dtypes

prov_insc     object
cant_insc     object
parr_insc     object
anio_insc     object
mes_insc      object
dia_insc      object
fecha_insc    object
sexo          object
anio_nac       int64
mes_nac       object
dia_nac        int64
fecha_nac     object
talla         object
peso          object
sem_gest      object
tipo_part     object
apgar1        object
apgar5        object
p_emb         object
lugar_ocur    object
prov_nac      object
cant_nac      object
parr_nac      object
area_nac      object
asis_por      object
nac_mad       object
cod_pais      object
anio_mad      object
mes_mad       object
dia_mad       object
fecha_mad     object
edad_mad      object
con_pren      object
num_emb        int64
num_par        int64
hij_viv        int64
hij_vivm      object
hij_nacm      object
etnia         object
est_civil     object
niv_inst      object
sabe_leer     object
prov_res      object
cant_res      object
parr_res      object
area_res      object
residente     object
dtype: object

In [4]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308252 entries, 0 to 308251
Data columns (total 47 columns):
prov_insc     308252 non-null object
cant_insc     308252 non-null object
parr_insc     308252 non-null object
anio_insc     308252 non-null object
mes_insc      308252 non-null object
dia_insc      308252 non-null object
fecha_insc    308252 non-null object
sexo          308252 non-null object
anio_nac      308252 non-null int64
mes_nac       308252 non-null object
dia_nac       308252 non-null int64
fecha_nac     308252 non-null object
talla         308252 non-null object
peso          308252 non-null object
sem_gest      308252 non-null object
tipo_part     308252 non-null object
apgar1        308252 non-null object
apgar5        308252 non-null object
p_emb         308252 non-null object
lugar_ocur    308252 non-null object
prov_nac      308252 non-null object
cant_nac      308252 non-null object
parr_nac      308252 non-null object
area_nac      308252 non-null object
asi

### uso de memoria por tipo de datos

In [3]:
for dtype in ['float','int','object']:
    selected_dtype = df.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024 ** 2
    print("Uso promedio de memoria para columnas tipo {} : {:03.2f} MB".format(dtype,mean_usage_mb))

Uso promedio de memoria para columnas tipo float : 0.00 MB
Uso promedio de memoria para columnas tipo int : 1.96 MB
Uso promedio de memoria para columnas tipo object : 25.34 MB


In [4]:
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: 
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # bytes a megabytes
    return "{:03.2f} MB".format(usage_mb)

gl_int = df.select_dtypes(include=['int'])
converted_int = gl_int.apply(pd.to_numeric,downcast='unsigned')

print('---'*10+' valores enteros '+'---'*10)
print(mem_usage(gl_int))
print(mem_usage(converted_int))

compare_ints = pd.concat([gl_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['antes','despues']
compare_ints.apply(pd.Series.value_counts)

#reduccion de valores enteros

------------------------------ valores enteros ------------------------------
11.00 MB
1.00 MB


Unnamed: 0,antes,despues
uint8,,4.0
uint16,,1.0
int64,5.0,


In [5]:
df_nuevo = df.copy()

df_nuevo[converted_int.columns] = converted_int

print('Dataframe antes de la reduccion: '+mem_usage(df))
print('Dataframe despues de la reduccion: '+mem_usage(df_nuevo))

Dataframe antes de la reduccion: 1101.00 MB
Dataframe despues de la reduccion: 1091.00 MB


## Uso de Categoricals

In [8]:
df_nuevo = df.select_dtypes(include=['object']).copy()
df_nuevo.describe()

Unnamed: 0,prov_insc,cant_insc,parr_insc,anio_insc,mes_insc,dia_insc,fecha_insc,sexo,mes_nac,fecha_nac,...,hij_nacm,etnia,est_civil,niv_inst,sabe_leer,prov_res,cant_res,parr_res,area_res,residente
count,308252,308252,308252,308252,308252,308252.0,308252.0,308252,308252,308252,...,308252,308252,308252,308252,308252,308252,308252,308252,308252,308252
unique,25,163,196,3,14,33.0,444.0,2,12,3853,...,10,9,8,10,3,26,221,1141,2,2
top,Guayas,Guayaquil,Tarqui,2017,Marzo,,,Hombre,Mayo,2017-08-18,...,0,Mestiza,Soltera,Secundaria,Si,Guayas,Guayaquil,Tarqui,Urbana,No residente
freq,75912,53335,26187,270067,27582,15433.0,15433.0,157960,27469,966,...,297541,267566,123141,87752,303080,78232,50177,20804,237144,209897


In [6]:
#ciclo para iterar sobre cada columna de objeto, 
#verifica si el número de valores únicos es inferior al 50% 
#de ser así,lo convierte al tipo de categoría.
df_nuevo = pd.DataFrame()

for col in df.columns:
    num_unique_values = len(df[col].unique())
    num_total_values = len(df[col])
    if num_unique_values / num_total_values < 0.5:
        df_nuevo.loc[:,col] = df[col].astype('category')
    else:
        df_nuevo.loc[:,col] = gl_obj[col]
        
print(mem_usage(df))
print(mem_usage(df_nuevo))

compare_obj = pd.concat([df.dtypes,df_nuevo.dtypes],axis=1)
compare_obj.columns = ['antes','despues']
compare_obj.apply(pd.Series.value_counts)

1091.00 MB
19.00 MB


Unnamed: 0,before,after
uint8,4.0,
uint16,1.0,
object,42.0,
category,,2.0
category,,2.0
category,,1.0
category,,1.0
category,,1.0
category,,1.0
category,,1.0


In [8]:
z.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 308252 entries, 0 to 308251
Data columns (total 47 columns):
prov_insc     308252 non-null category
cant_insc     308252 non-null category
parr_insc     308252 non-null category
anio_insc     308252 non-null category
mes_insc      308252 non-null category
dia_insc      308252 non-null category
fecha_insc    308252 non-null category
sexo          308252 non-null category
anio_nac      308252 non-null category
mes_nac       308252 non-null category
dia_nac       308252 non-null category
fecha_nac     308252 non-null category
talla         308252 non-null category
peso          308252 non-null category
sem_gest      308252 non-null category
tipo_part     308252 non-null category
apgar1        308252 non-null category
apgar5        308252 non-null category
p_emb         308252 non-null category
lugar_ocur    308252 non-null category
prov_nac      308252 non-null category
cant_nac      308252 non-null category
parr_nac      308252 non-null c