In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Notebook-wide display defaults: figure size, NumPy print precision and
# the pandas limits on how many columns/rows/characters are shown.
plt.rcParams["figure.figsize"] = (10, 6)
np.set_printoptions(precision=4)
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_colwidth", 80)

## Load data

In [2]:
# Read the comma-separated source file with `pandas.read_csv`.
# Malformed rows are skipped (`on_bad_lines="skip"`) instead of raising.

data = pd.read_csv(
    "saber11_2019_2022.csv",
    sep=",",
    on_bad_lines="skip",
)

## Dropping unneeded variables

In [3]:
# Columns discarded before any cleaning takes place.
dropped_columns = [
    'ESTU_TIPODOCUMENTO', 'ESTU_CONSECUTIVO',
    'COLE_CALENDARIO', 'COLE_CARACTER',
    'COLE_COD_DEPTO_UBICACION', 'COLE_COD_MCPIO_UBICACION',
    'COLE_DEPTO_UBICACION', 'COLE_GENERO',
    'COLE_JORNADA', 'COLE_MCPIO_UBICACION',
    'COLE_NOMBRE_ESTABLECIMIENTO', 'COLE_NOMBRE_SEDE',
    'COLE_SEDE_PRINCIPAL', 'ESTU_COD_DEPTO_PRESENTACION',
    'ESTU_COD_MCPIO_PRESENTACION', 'ESTU_DEPTO_PRESENTACION',
    'ESTU_DEPTO_RESIDE', 'ESTU_ESTUDIANTE',
    'ESTU_MCPIO_PRESENTACION', 'ESTU_MCPIO_RESIDE',
    'ESTU_NACIONALIDAD', 'ESTU_PAIS_RESIDE',
    'ESTU_PRIVADO_LIBERTAD', 'FAMI_CUARTOSHOGAR',
    'FAMI_ESTRATOVIVIENDA', 'FAMI_PERSONASHOGAR',
    'DESEMP_INGLES', 'COLE_COD_DANE_ESTABLECIMIENTO',
    'COLE_CODIGO_ICFES', 'ESTU_COD_RESIDE_DEPTO',
    'ESTU_COD_RESIDE_MCPIO',
]

data = data.drop(columns=dropped_columns)

## Removing missing values

In [4]:
# The criterion is to remove an observation if any of its columns is missing.
# Null values are stored as a single blank (' '), so turn them into `np.nan` first:

data = data.replace(to_replace=' ', value=np.nan)

In [5]:
# Discard every row holding at least one `np.nan` and renumber the index:

data = data.dropna(axis="index", how="any", ignore_index=True)

## Removing duplicates

In [6]:
# Keep the first occurrence of each fully identical row and renumber the index:

data = data.drop_duplicates(keep="first", ignore_index=True)

## Research filtering

In [7]:
# Keep only the records whose 'ESTU_ESTADOINVESTIGACION' equals 'PUBLICAR',
# i.e. students whose information is available for research:

data = (data
        .loc[lambda frame: frame['ESTU_ESTADOINVESTIGACION'].eq('PUBLICAR')]
        .reset_index(drop=True))

## Dtype Transformation

In [8]:
# Cast the score columns to 64-bit floats:

data = data.astype(dict.fromkeys(['PUNT_MATEMATICAS',
                                  'PUNT_SOCIALES_CIUDADANAS',
                                  'PUNT_C_NATURALES',
                                  'PUNT_LECTURA_CRITICA',
                                  'PUNT_GLOBAL'],
                                 'float64'))

## Estimating and filtering students' age

In [9]:
# Goal: the age of each student at the time of taking the test.
# Step one parses the date of birth (day-first) into datetimes;
# values that cannot be parsed are coerced to NaT:

data = data.assign(
    ESTU_FECHANACIMIENTO=pd.to_datetime(data['ESTU_FECHANACIMIENTO'],
                                        dayfirst=True,
                                        errors='coerce'))

In [10]:
# Build a quarterly PeriodIndex from the year ('ANO') and quarter ('PER').
# `PeriodIndex.from_fields` is used because passing `year=` / `quarter=`
# to the `PeriodIndex` constructor is deprecated (pandas >= 2.2) and
# emits a FutureWarning:

index = pd.PeriodIndex.from_fields(year=data['ANO'],
                                   quarter=data['PER'],
                                   freq='Q-DEC')

index

  index = pd.PeriodIndex(year=data['ANO'],


PeriodIndex(['2019Q4', '2019Q4', '2019Q4', '2019Q4', '2019Q4', '2019Q4',
             '2019Q4', '2019Q4', '2019Q4', '2019Q4',
             ...
             '2022Q1', '2022Q1', '2022Q1', '2022Q1', '2022Q1', '2022Q1',
             '2022Q1', '2022Q1', '2022Q1', '2022Q1'],
            dtype='period[Q-DEC]', length=845250)

In [11]:
# Convert every quarter to the timestamp of its first day (a DatetimeIndex):

index = index.to_timestamp(freq=None, how='start')

index

DatetimeIndex(['2019-10-01', '2019-10-01', '2019-10-01', '2019-10-01',
               '2019-10-01', '2019-10-01', '2019-10-01', '2019-10-01',
               '2019-10-01', '2019-10-01',
               ...
               '2022-01-01', '2022-01-01', '2022-01-01', '2022-01-01',
               '2022-01-01', '2022-01-01', '2022-01-01', '2022-01-01',
               '2022-01-01', '2022-01-01'],
              dtype='datetime64[ns]', length=845250, freq=None)

In [12]:
# Store the test dates in the DataFrame as a new column:

data = data.assign(ESTU_FECHAPRESENTACION=index)

In [13]:
# Age in completed years on the test date ('Date of presentation of test'
# versus 'Date of birth').
#
# Dividing the elapsed days by 365.25 and flooring undercounts by one year
# when the test date falls exactly on a birthday (born 2005-10-01, tested
# 2019-10-01: 5113 / 365.25 = 13.998 -> 13), so the calendar fields are
# compared instead.

def _completed_years(birth, reference):
    """Return the whole years elapsed from `birth` to `reference`.

    Both arguments are datetime Series sharing the same index. The result
    is float64; rows where either date is NaT come out as NaN.
    """
    # True where the birthday has not been reached yet in the reference year.
    birthday_pending = ((reference.dt.month < birth.dt.month)
                        | ((reference.dt.month == birth.dt.month)
                           & (reference.dt.day < birth.dt.day)))
    return (reference.dt.year - birth.dt.year
            - birthday_pending.astype('float64'))


data['EDAD'] = _completed_years(data['ESTU_FECHANACIMIENTO'],
                                data['ESTU_FECHAPRESENTACION'])

In [14]:
# Last step: keep only students aged 14 through 25 (inclusive):

age = range(14, 26)

data = data.loc[data['EDAD'].isin(age)].reset_index(drop=True)

## Dropping unneeded variables

In [15]:
# Remove the quarter, the research-status flag and the two date columns.
data = data.drop(labels=['PER', 'ESTU_ESTADOINVESTIGACION',
                         'ESTU_FECHANACIMIENTO', 'ESTU_FECHAPRESENTACION'],
                 axis='columns')

## Replacing strings with numeric values

In [16]:
# Recode the binary answers as 0.0 / 1.0:
#   bilingual institution:     'N'  -> 0, 'S'  -> 1
#   yes/no family questions:   'No' -> 0, 'Si' -> 1
#
# A frame-wide `DataFrame.replace` depends on pandas' deprecated silent
# downcasting (it emits a FutureWarning) to turn the recoded object columns
# into floats. Mapping each affected column instead lets pandas infer the
# resulting dtype directly; values outside the dictionary are left untouched.

binary_codes = {'N': 0.0, 'S': 1.0,
                'No': 0.0, 'Si': 1.0}

data = data.assign(**{
    column: data[column].map(lambda value: binary_codes.get(value, value))
    for column in data.columns
    # Only touch columns that actually contain one of the codes.
    if data[column].isin(list(binary_codes)).any()
})

  data = data.replace({'N': 0.0, 'S': 1.0,


## Mapping and estimating the parents' education

In [17]:
# The mother's and father's education is converted to years of schooling
# afterwards, so the answers 'No sabe' and 'No Aplica' are filtered out first.

names = ['No sabe', 'No Aplica']

# A row is removed when either parent's education is one of those answers:

unknown_schooling = (data[['FAMI_EDUCACIONMADRE', 'FAMI_EDUCACIONPADRE']]
                     .isin(names)
                     .any(axis=1))

data = data.loc[~unknown_schooling].reset_index(drop=True)

In [18]:
# Translate the remaining education categories into years of education.
# `Series.map` yields NaN for any category that is not a key of the dictionary.

education = {'Ninguno': 0.0,
             'Primaria incompleta': 6.0,
             'Primaria completa': 8.0,
             'Secundaria (Bachillerato) incompleta': 11.0,
             'Secundaria (Bachillerato) completa': 14.0,
             'Técnica o tecnológica incompleta': 15.0,
             'Técnica o tecnológica completa': 16.0,
             'Educación profesional incompleta': 17.0,
             'Educación profesional completa': 19.0,
             'Postgrado': 22.0}

data = data.assign(
    FAMI_EDUCACIONMADRE=data['FAMI_EDUCACIONMADRE'].map(education),
    FAMI_EDUCACIONPADRE=data['FAMI_EDUCACIONPADRE'].map(education))

## Dummy variables

In [19]:
# Dummy/binary variables for 'COLE_AREA_UBICACION', 'COLE_NATURALEZA' and
# 'ESTU_GENERO', plus one time dummy per value of 'ANO' (prefixed 'ANO')
# for the difference-in-differences model.
# NOTE(review): the treatment variable
# ('Fami_tienecomputador_x_Fami_tieneinternet') and its interaction with the
# time dummy ('2022_x_fami_tienecomputador_x_tieneinternet') are not built
# in this cell - confirm where they are created.

names2 = ['COLE_AREA_UBICACION', 'COLE_NATURALEZA', 'ESTU_GENERO']

dummies = pd.get_dummies(data[names2], dtype=float)

dummies2 = pd.get_dummies(data['ANO'], prefix='ANO', dtype=float)

# The three frames share one index, so a column-wise concat lines the rows up.
data_with_dummies = pd.concat([data.drop(columns=names2), dummies, dummies2],
                              axis='columns')

## Exporting data

In [20]:
# data_with_dummies.to_csv("saber11_2019_2022_cleaned.csv")