In [1]:
# Importing the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc("figure", figsize=(10, 6))
import os

## Load data

In [2]:
# First, we load the data with 'pandas.read_csv'

df1 = pd.read_csv('Examen_Saber_11_20192.txt', sep=';',
                  header=0, on_bad_lines='skip',
                  low_memory=False)
df2 = pd.read_csv('Examen_Saber_11_20222.txt', sep=';',
                  header=0, on_bad_lines='skip',
                  low_memory=False)

## Concat the data

In [3]:
# Now we concatenate both dataframes along index

data = pd.concat([df1, df2], ignore_index=True)

## Relevant variables

In [4]:
# The model/analysis will only consider the following variables

var = [0, 4, 5, 6, 9, 10, 11, 17, 31, 32, 37, 39, 40,
       42, 46, 47, 57, 58, 59, 63, 64, 67, 68, 79,
       80, 81, 82, 83, 84]

data = data.iloc[:, var]

## Year split

In [5]:
# Split the 'periodo' years 2019 and 2022

data['ano'] = data['periodo'].astype(str).str[:4].astype(int)

## Removing missing values

In [6]:
# Then, we drop any row containing a missing value (i.e. 'np.nan')

data = data.dropna(ignore_index=True)

In [7]:
data.shape

(819589, 30)

In [8]:
# Data by year

data.groupby('ano')['ano'].count()

ano
2019    420676
2022    398913
Name: ano, dtype: int64

In [9]:
# Data by sector

data.groupby('cole_naturaleza')['cole_naturaleza'].count()

cole_naturaleza
NO OFICIAL    170815
OFICIAL       648774
Name: cole_naturaleza, dtype: int64

## School Calendar

In [10]:
# The analysis will be with students in A Calendar school:

data = (
    data
        .query('cole_calendario == "A"')
        .reset_index(drop=True)
)

In [11]:
data.shape

(815460, 30)

In [12]:
# Data by year

data.groupby('ano')['ano'].count()

ano
2019    417890
2022    397570
Name: ano, dtype: int64

In [13]:
# Data by sector

data.groupby('cole_naturaleza')['cole_naturaleza'].count()

cole_naturaleza
NO OFICIAL    166686
OFICIAL       648774
Name: cole_naturaleza, dtype: int64

## Student filter

In [14]:
# We want students who are in the final year (11, 12)

data = (
    data
        .query('estu_grado == 11.0 | estu_grado == 12.0')
        .reset_index(drop=True)
)

In [15]:
data.shape

(736165, 30)

In [16]:
# Data by year

data.groupby('ano')['ano'].count()

ano
2019    373297
2022    362868
Name: ano, dtype: int64

In [17]:
# Data by sector

data.groupby('cole_naturaleza')['cole_naturaleza'].count()

cole_naturaleza
NO OFICIAL    142649
OFICIAL       593516
Name: cole_naturaleza, dtype: int64

## Estimating and filtering student's age 

In [18]:
# We want to compute the age of students when taking the exam.
# First we parse the 'Date of birth' to a 'Datetime' object:

data['estu_fechanacimiento'] = pd.to_datetime(data['estu_fechanacimiento'],
                                              errors='coerce',
                                              dayfirst=True)

In [19]:
# According to webpage of ICFES the exam in 2019 was taken '25 de agosto'
# and in 2022 in '4 de septiembre'

fecha = {2019: pd.Timestamp(2019, 8, 25),
         2022: pd.Timestamp(2022, 9, 4)}

data['estu_fechapresentacion'] = data['ano'].map(fecha)

In [20]:
# Now we estimate the age (in days) of students by substracting 'Date of birth' from 'Date of presentation of exam', and dividing by 365 days:
# And then, we round the age to float number:

data['edad'] = np.floor((data['estu_fechapresentacion'] - data['estu_fechanacimiento']) / pd.Timedelta(days=365.25))

In [21]:
# Finally we filter the age from 14-25 years:

age = range(14, 26)

data = (data[data['edad'].isin(age)]
        .reset_index(drop=True))

In [22]:
data.shape

(733790, 32)

In [23]:
# Data by year

data.groupby('ano')['ano'].count()

ano
2019    371721
2022    362069
Name: ano, dtype: int64

In [24]:
# Data by sector

data.groupby('cole_naturaleza')['cole_naturaleza'].count()

cole_naturaleza
NO OFICIAL    141852
OFICIAL       591938
Name: cole_naturaleza, dtype: int64

## Dropping unrequired variables

In [25]:
data = data.drop(columns=['cole_calendario', 'estu_grado', 'periodo',
                          'estu_fechanacimiento', 'estu_fechapresentacion'])

## Replacing to 'numeric strings'

In [26]:
# Binary bilingual institution
# map1 = {'N': 0, 'S': 1}

# Yes/Not family questions
# map2 = {'No': 0, 'Si': 1}

# Now we can replace 'strings' to 'numeric strings' in the dataframe:

data = data.replace({'N': '0.0', 'S': '1.0',
                     'No': '0.0', 'Si': '1.0'})

## Dtype Transformation

In [27]:
# Now we transform this 'strings' variables to 'float' as well as test scores

data = data.astype({'cole_cod_depto_ubicacion': 'float64',
                    'cole_cod_mcpio_ubicacion': 'float64',
                    'cole_bilingue': 'float64',
                    'fami_tieneautomovil': 'float64',
                    'fami_tienecomputador': 'float64',
                    'fami_tieneinternet': 'float64',
                    'fami_tienelavadora': 'float64',
                    'punt_matematicas': 'float64',
                    'punt_sociales_ciudadanas': 'float64',
                    'punt_c_naturales': 'float64',
                    'punt_lectura_critica': 'float64',
                    'punt_global': 'float64'})

## Mapping the socioeconomic stratum

In [28]:
# We are using the family's socioeconomic stratum as controls in regressions

strat = {'Sin Estrato': 0.0, 'Estrato 1': 1.0,
         'Estrato 2': 2.0, 'Estrato 3': 3.0,
         'Estrato 4': 4.0, 'Estrato 5': 5.0,
         'Estrato 6': 6.0}

data['fami_estratovivienda'] = data['fami_estratovivienda'].map(strat)

## Filtering, mapping and estimating the parents' education

In [29]:
# The idea is to estimate the mother’s and father’s years of schooling. But first we are filtering out 'No sabe' and 'No Aplica':

names = ['No sabe', 'No Aplica']

# Filtering out in mother's education:

data = (data[~data['fami_educacionmadre'].isin(names)]
        .reset_index(drop=True))

# Filtering out in father's education:

data = (data[~data['fami_educacionpadre'].isin(names)]
        .reset_index(drop=True))

In [30]:
# Now we estimate the average year of schooling:

education = {'Secundaria (Bachillerato) completa': 14.0,
             'Primaria incompleta': 6.0,
             'Secundaria (Bachillerato) incompleta': 11.0,
             'Educación profesional completa': 19.0,
             'Primaria completa': 8.0,
             'Técnica o tecnológica completa': 16.0,
             'Ninguno': 0.0,
             'Postgrado': 22.0,
             'Técnica o tecnológica incompleta': 15.0,
             'Educación profesional incompleta': 17.0}

data['fami_educacionmadre'] = data['fami_educacionmadre'].map(education)

data['fami_educacionpadre'] = data['fami_educacionpadre'].map(education)

In [31]:
data.shape

(667474, 27)

In [32]:
# Data by year

data.groupby('ano')['ano'].count()

ano
2019    340765
2022    326709
Name: ano, dtype: int64

In [33]:
# Data by sector

data.groupby('cole_naturaleza')['cole_naturaleza'].count()

cole_naturaleza
NO OFICIAL    127932
OFICIAL       539542
Name: cole_naturaleza, dtype: int64

## Dummy variables

In [34]:
# Now we can create the dummy/binary variables for 'Ubicación', 'Cole_Naturaleza', and 'Género':

names2 = ['cole_area_ubicacion', 'cole_naturaleza', 'estu_genero']

dummies = pd.get_dummies(data=data[names2],
                         dtype=float)

dummies2 = pd.get_dummies(data=data['ano'],
                          prefix='ano',
                          dtype=float)

data_with_dummies = (
    data
        .drop(columns=names2)
        .join(dummies)
        .join(dummies2)
)

In [35]:
data_with_dummies.shape

(667474, 32)

## Graph

In [36]:
# ICFES reports the time of students in internet and reading as a hobby:

#d1 = (
#    data
#        .loc[:, ['ano', 'estu_dedicacioninternet', 'estu_dedicacionlecturadiaria']]
#        .rename(columns={'ano': 'Año',
#                         'estu_dedicacioninternet':'Tiempo en internet',
#                         'estu_dedicacionlecturadiaria':'Tiempo en lectura'})
#)

#sns.set_style('ticks')
#g1 = sns.countplot(d1, y='Tiempo en internet', hue='Año')
#plt.title('Tiempo en internet fuera del aula');
#plt.savefig('tiempo_internet.pdf', bbox_inches='tight')

In [37]:
# Time in reading

# g2 = sns.countplot(d1, y='Tiempo en lectura', hue='Año')
# plt.title('Tiempo de lectura como hobby');
# plt.savefig('tiempo_lectura.pdf', bbox_inches='tight')

In [38]:
# No need in the final df, so now we drop the columns

data_with_dummies = data_with_dummies.drop(columns=['estu_dedicacioninternet',
                                                    'estu_dedicacionlecturadiaria'])

## Exporting data

In [39]:
# data_with_dummies.to_csv("saber11_2019_2022_cleaned.csv")