In [1]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Notebook-wide display settings: default figure size, numpy print precision,
# and the pandas limits on how many columns, rows and characters are rendered.
plt.rcParams["figure.figsize"] = (10, 6)
np.set_printoptions(precision=4)
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_colwidth", 80)

## Load data

In [2]:
# Read the two input tables with pandas.read_csv; in both files the first
# CSV column is used as the row index.

# Saber 11 records
df1 = pd.read_csv("saber11_2019_2022_cleaned.csv", sep=",", index_col=0)

# School census table
df2 = pd.read_csv("acceso_tecnologico.csv", sep=",", index_col=0)

## Merging dataframes

In [3]:
# Merge the Saber 11 records with the school census on two keys:
# 1) the school site's DANE code and 2) the year.

# The census names its key columns differently, so align them with the
# Saber 11 column names before merging.
census_key_names = {'SEDE_CODIGO': 'COLE_COD_DANE_SEDE',
                    'PERIODO_ANIO': 'ANO'}
df2 = df2.rename(columns=census_key_names)

# Inner join: only rows whose (site code, year) pair exists in both tables are kept.
merge_keys = ['COLE_COD_DANE_SEDE', 'ANO']
data = df1.merge(df2, how='inner', on=merge_keys)

## Creating interaction and diff-in-diff variable

In [4]:
# Technology-environment interaction: the product of the household computer
# and internet columns with the school-site internet and computer-equipment
# columns.
# NOTE(review): presumably all four are 0/1 dummies, so the product is 1 only
# when every one of them is 1 - confirm against the cleaned inputs.
home_computer = data['FAMI_TIENECOMPUTADOR']
home_internet = data['FAMI_TIENEINTERNET']
school_internet = data['SEDETE_INTERNET']
school_computers = data['SEDETE_EQUIPO_COMPUTO']

data['ENTORNO_TECNOLOGICO'] = (home_computer * home_internet
                               * school_internet * school_computers)

In [5]:
# Year interaction term: the 2022 year column multiplied by the
# technology-environment interaction (household and school computer/internet).
year_2022 = data['ANO_2022']
data['2022_X_ENTORNO_TECNOLOGICO'] = year_2022 * data['ENTORNO_TECNOLOGICO']

## Transform column names

In [6]:
# Normalise every column label with str.capitalize: first character upper-case,
# all remaining characters lower-case (e.g. 'ANO' becomes 'Ano').
data = data.rename(str.capitalize, axis='columns')

In [7]:
# Print the frame summary (index, column dtypes and non-null counts) as a
# quick check on the merged, renamed data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 749357 entries, 0 to 749356
Data columns (total 28 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Ano                         749357 non-null  int64  
 1   Cole_bilingue               749357 non-null  float64
 2   Cole_cod_dane_sede          749357 non-null  float64
 3   Fami_educacionmadre         749357 non-null  float64
 4   Fami_educacionpadre         749357 non-null  float64
 5   Fami_tieneautomovil         749357 non-null  float64
 6   Fami_tienecomputador        749357 non-null  float64
 7   Fami_tieneinternet          749357 non-null  float64
 8   Fami_tienelavadora          749357 non-null  float64
 9   Punt_ingles                 749357 non-null  float64
 10  Punt_matematicas            749357 non-null  float64
 11  Punt_sociales_ciudadanas    749357 non-null  float64
 12  Punt_c_naturales            749357 non-null  float64
 13  Punt_lectura_c

## Standardize test scores

The idea here is to standardize the test scores (i.e., to give them a mean of zero and a variance of one) by transforming each score (global, matematicas, ingles, ciencias naturales, etc.) as $z = \frac{x - \mu}{\sigma}$, where:
- $\mu$ is the mean of the population.
- $\sigma$ is the standard deviation of the population.

In [8]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from sklearn.preprocessing import StandardScaler

# A single scaler instance; it is fitted later on the test-score columns.
scaler = StandardScaler()

In [9]:
# Test-score columns that will be standardized (order matters: the
# standardized output keeps this column order).
names = [
    'Punt_ingles',
    'Punt_matematicas',
    'Punt_sociales_ciudadanas',
    'Punt_c_naturales',
    'Punt_lectura_critica',
    'Punt_global',
]

In [10]:
# Standardize the test scores.
# Plain column selection (`data[names]`) is used instead of `reindex` so that a
# missing or misspelled column raises a KeyError immediately, rather than being
# silently created as an all-NaN column and carried through the scaler.
data2 = scaler.fit_transform(data[names])

# Display the resulting array (one row per record, one column per score).
data2

array([[ 1.5582,  1.1714,  1.7993,  1.4355,  1.467 ,  1.6406],
       [-0.334 , -0.7324, -1.386 , -0.9118, -0.9833, -1.0482],
       [ 0.8013,  1.0059,  0.0842,  0.8721,  0.9016,  0.7956],
       ...,
       [-0.1826,  0.0954, -0.8142, -0.4423, -0.3236, -0.3952],
       [ 1.7095,  1.2542,  0.7376,  1.1538,  0.9016,  1.1797],
       [-0.9395, -1.3946, -1.0593, -0.4423, -1.5488, -1.221 ]],
      shape=(749357, 6))

In [11]:
# Wrap the standardized numpy array in a pandas DataFrame.
# - Column labels are derived from `names` (each with a '_std' suffix) so they
#   cannot drift from the columns that were actually standardized.
# - The index is taken from `data` so the later index-based join lines up row
#   by row even if `data` does not carry a default 0..n-1 index.
data2 = pd.DataFrame(data2,
                     index=data.index,
                     columns=[f"{col}_std" for col in names])

In [12]:
# Finally, attach the standardized score columns to the main dataframe.
# NOTE(review): `join` aligns rows on the index, so this relies on `data` and
# `data2` sharing the same index - confirm.
# NOTE(review): the cell reassigns `data`, so running it a second time will
# presumably fail because the `_std` columns are already present.

data = data.join(data2)

data

Unnamed: 0,Ano,Cole_bilingue,Cole_cod_dane_sede,Fami_educacionmadre,Fami_educacionpadre,Fami_tieneautomovil,Fami_tienecomputador,Fami_tieneinternet,Fami_tienelavadora,Punt_ingles,...,Sedete_internet,Sedete_equipo_computo,Entorno_tecnologico,2022_x_entorno_tecnologico,Punt_ingles_std,Punt_matematicas_std,Punt_sociales_ciudadanas_std,Punt_c_naturales_std,Punt_lectura_critica_std,Punt_global_std
0,2019,0.0,2.410160e+11,22.0,8.0,0.0,1.0,1.0,1.0,71.0,...,0.0,1.0,0.0,0.0,1.558168,1.171404,1.799349,1.435453,1.467011,1.640599
1,2019,0.0,1.413960e+11,6.0,6.0,0.0,0.0,0.0,0.0,46.0,...,1.0,1.0,0.0,0.0,-0.333983,-0.732379,-1.385960,-0.911761,-0.983315,-1.048179
2,2019,0.0,1.680010e+11,17.0,14.0,0.0,1.0,1.0,1.0,61.0,...,1.0,1.0,1.0,0.0,0.801307,1.005857,0.084182,0.872122,0.901551,0.795554
3,2019,0.0,1.053600e+11,14.0,6.0,0.0,1.0,1.0,1.0,52.0,...,1.0,1.0,1.0,0.0,0.120133,1.171404,0.737579,0.778233,-0.512099,0.584293
4,2019,0.0,1.138380e+11,6.0,6.0,0.0,0.0,1.0,1.0,40.0,...,1.0,1.0,0.0,0.0,-0.788100,-1.477338,-0.814238,-1.662869,-0.983315,-1.317057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749352,2022,1.0,3.117690e+11,19.0,19.0,1.0,1.0,1.0,1.0,100.0,...,1.0,1.0,1.0,1.0,3.753064,1.833589,1.799349,2.374338,1.561254,2.274382
749353,2022,1.0,3.110011e+11,19.0,19.0,1.0,1.0,1.0,1.0,76.0,...,1.0,1.0,1.0,1.0,1.936598,0.260899,0.492555,0.590456,0.241848,0.584293
749354,2022,0.0,3.110011e+11,11.0,6.0,0.0,1.0,1.0,1.0,48.0,...,1.0,1.0,1.0,1.0,-0.182611,0.095352,-0.814238,-0.442318,-0.323612,-0.395190
749355,2022,1.0,3.110010e+11,22.0,22.0,1.0,1.0,1.0,1.0,73.0,...,1.0,1.0,1.0,1.0,1.709540,1.254177,0.737579,1.153787,0.901551,1.179666


## Summary statistics

In [13]:
# Summary statistics by institution area (rural / urban indicator columns):
# count, mean and standard deviation of each non-key column per group.
area_keys = ['Cole_area_ubicacion_rural', 'Cole_area_ubicacion_urbano']
area_stats = data.groupby(area_keys).agg(['count', 'mean', 'std'])

# Stack the innermost column level (the statistic) into the index, then
# transpose so that the variables end up as rows.
sum_stat1 = area_stats.stack().T

  .stack()


In [14]:
# We calculate the summary statistics by institution type, i.e. Oficial, No oficial

# Group on the official / non-official indicator columns and compute count,
# mean and standard deviation of each non-key column per group.
type_keys = ['Cole_naturaleza_no oficial', 'Cole_naturaleza_oficial']
type_stats = data.groupby(type_keys).agg(['count', 'mean', 'std'])

# Stack the innermost column level (the statistic) into the index, then
# transpose so that the variables end up as rows.
sum_stat2 = type_stats.stack().T

  .stack()


In [15]:
# We calculate the summary statistics by gender, i.e. male, female

# Group on the female / male indicator columns and compute count, mean and
# standard deviation of each non-key column per group.
gender_keys = ['Estu_genero_f', 'Estu_genero_m']
gender_stats = data.groupby(gender_keys).agg(['count', 'mean', 'std'])

# Stack the innermost column level (the statistic) into the index, then
# transpose so that the variables end up as rows.
sum_stat3 = gender_stats.stack().T

  .stack()


In [16]:
# We calculate the summary statistics by year:

# Group on the year column and compute count, mean and standard deviation of
# each non-key column per year.
year_keys = ['Ano']
year_stats = data.groupby(year_keys).agg(['count', 'mean', 'std'])

# Stack the innermost column level (the statistic) into the index, then
# transpose so that the variables end up as rows.
sum_stat4 = year_stats.stack().T

  .stack()


## Exporting data

In [17]:
# data.to_csv("final_principal.csv")

In [18]:
# sum_stat1.to_excel("summary_statistics_area.xlsx")

In [19]:
# sum_stat2.to_excel("summary_statistics_type.xlsx")

In [20]:
# sum_stat3.to_excel("summary_statistics_gender.xlsx")

In [21]:
# sum_stat4.to_excel("summary_statistics_year.xlsx")