In [1]:
from pathlib import Path
import joblib

import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

from pipeline import (
    DropHighNAPercentage,
    m_inst,
    OrdinalColumnMapper,
    DropColumns,
    DataframeOneHotEncoder,
)

pd.set_option("display.max_columns", None)

In [2]:
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR / "data"

DATA_PATH = DATA_DIR / 'indices_institucional_2005_2022.xlsx'
DATA_PATH.exists()

True

In [3]:
def show(df: pd.DataFrame, limit: int = 5) -> None:
    print(df.shape)
    display(df.head(limit))

In [4]:
def show_null_percentages(df: pd.DataFrame) -> None:
    nulls = df.isnull().sum() / df.shape[0]
    nulls = nulls.sort_values(ascending=False)
    display(nulls)

### Loading Data

In [5]:
# inmuebles = pd.read_excel(DATA_PATH, sheet_name="Inmuebles")
# labs = pd.read_excel(DATA_PATH, sheet_name="Laboratorios y Talleres")
# docentes = pd.read_excel(DATA_PATH, sheet_name="Docentes")

# extra_data = inmuebles, labs, docentes
# joblib.dump(extra_data, DATA_DIR / 'extra_data.pkl')

In [19]:
inmuebles, labs, docentes = joblib.load(DATA_DIR / 'extra_data.pkl')
df = joblib.load(DATA_DIR / 'processed_df.pkl')

In [27]:

labs['Nombre Institución'].value_counts().sort_values(ascending=True)

Nombre Institución
C.F.T. ESC. ALTOS EST. DE LA COMUNICACIÓN EACE      1
C.F.T. AQUATECH                                     1
C.F.T. PUKARÁ                                       1
I.P. INSTITUTO SUPERIOR DE ELECTRÓNICA GAMMA        1
C.F.T. ALEMÁN DE VIÑA DEL MAR                       1
                                                 ... 
U. SANTO TOMÁS                                    163
I.P. DE LOS LAGOS                                 164
I.P. DEL VALLE CENTRAL                            204
C.F.T. SANTO TOMÁS                                226
I.P. SANTO TOMÁS                                  231
Name: count, Length: 231, dtype: int64

### Preprocessing Data

In [7]:
# pipes
high_na_perc_dropper = DropHighNAPercentage(na_threshold=0.3, exclude=[])
ordinal_mapper = OrdinalColumnMapper(
    columns=["Tipo Institución"],
    mappings=[m_inst],
)
one_hot_encoder = DataframeOneHotEncoder(
    columns=["Nombre Institución", "Sede"], min_frequency=20
)
inputer = KNNImputer().set_output(transform="pandas")
# one_hot_encoder = OneHotEncoder(sparse_output=False, transform="pandas")

print("processing inmuebles...")
inmuebles = DropColumns(columns_to_drop=["idInstitucion"]).fit_transform(inmuebles)
inmuebles = high_na_perc_dropper.fit_transform(inmuebles)
inmuebles = ordinal_mapper.fit_transform(inmuebles)
inmuebles = one_hot_encoder.fit_transform(inmuebles)
inmuebles = inputer.fit_transform(inmuebles)

print("processing labs...")
labs = ordinal_mapper.fit_transform(labs)
labs = one_hot_encoder.fit_transform(labs)
labs = inputer.fit_transform(labs)

print("processing docentes...")
docentes = high_na_perc_dropper.fit_transform(docentes)
docentes = ordinal_mapper.fit_transform(docentes)
docentes = one_hot_encoder.fit_transform(docentes)
docentes = inputer.fit_transform(docentes)

processing inmuebles...
processing labs...
processing docentes...


In [8]:
show(inmuebles)
show(labs)
show(docentes)
show(df)

(5823, 124)


Unnamed: 0,Tipo Institución,Cód. Institución,idSede,Año Información,N° Inmuebles,M2 Terreno,M2 Construido,M2 Salas,Nº Oficinas,Nº Salas,Nombre Institución_C.F.T. ANDRÉS BELLO,Nombre Institución_C.F.T. CEDUC - UCN,Nombre Institución_C.F.T. DEL MEDIO AMBIENTE,Nombre Institución_C.F.T. ENAC,Nombre Institución_C.F.T. INACAP,Nombre Institución_C.F.T. LA ARAUCANA,Nombre Institución_C.F.T. LOS LAGOS,Nombre Institución_C.F.T. MASSACHUSETTS,Nombre Institución_C.F.T. PROANDES,Nombre Institución_C.F.T. PUCV,Nombre Institución_C.F.T. SALESIANOS DON BOSCO,Nombre Institución_C.F.T. SAN AGUSTÍN DE TALCA,Nombre Institución_C.F.T. SANTO TOMÁS,Nombre Institución_C.F.T. TEODORO WICKEL,Nombre Institución_I.P. AIEP,Nombre Institución_I.P. DE ARTES Y COMUNICACIÓN ARCOS,Nombre Institución_I.P. DE CHILE,Nombre Institución_I.P. DE LOS LAGOS,Nombre Institución_I.P. DEL VALLE CENTRAL,Nombre Institución_I.P. DIEGO PORTALES,Nombre Institución_I.P. DUOC UC,Nombre Institución_I.P. ESCUELA MODERNA DE MÚSICA,Nombre Institución_I.P. GUILLERMO SUBERCASEAUX,Nombre Institución_I.P. INACAP,Nombre Institución_I.P. INTERNACIONAL DE ARTES CULINARIAS Y SERVICIOS,Nombre Institución_I.P. IPEGE,Nombre Institución_I.P. LA ARAUCANA,Nombre Institución_I.P. LATINOAMERICANO DE COMERCIO EXTERIOR,Nombre Institución_I.P. LOS LEONES,Nombre Institución_I.P. PROVIDENCIA,Nombre Institución_I.P. SANTO TOMÁS,Nombre Institución_I.P. VIRGINIO GÓMEZ,Nombre Institución_PONTIFICIA U. CATÓLICA DE CHILE,Nombre Institución_U. ADOLFO IBÁÑEZ,Nombre Institución_U. ANDRÉS BELLO,Nombre Institución_U. ARTURO PRAT,Nombre Institución_U. AUSTRAL DE CHILE,Nombre Institución_U. AUTÓNOMA DE CHILE,Nombre Institución_U. BOLIVARIANA,Nombre Institución_U. CATÓLICA DE LA SANTÍSIMA CONCEPCIÓN,Nombre Institución_U. CATÓLICA DEL MAULE,Nombre Institución_U. CATÓLICA DEL NORTE,Nombre Institución_U. CENTRAL DE CHILE,Nombre Institución_U. DE ACONCAGUA,Nombre Institución_U. DE ARTE Y CIENCIAS SOCIALES ARCIS,Nombre Institución_U. DE ATACAMA,Nombre Institución_U. DE LA FRONTERA,Nombre Institución_U. DE LA SERENA,Nombre Institución_U. DE LAS AMÉRICAS,Nombre Institución_U. DE LOS LAGOS,Nombre Institución_U. DE MAGALLANES,Nombre Institución_U. DE PLAYA ANCHA DE CIENCIAS DE LA EDUCACIÓN,Nombre Institución_U. DE TALCA,Nombre Institución_U. DE TARAPACÁ,Nombre Institución_U. DE VALPARAÍSO,Nombre Institución_U. DE VIÑA DEL MAR,Nombre Institución_U. DEL ALBA,Nombre Institución_U. DEL BÍO-BÍO,Nombre Institución_U. DEL DESARROLLO,Nombre Institución_U. DEL MAR,Nombre Institución_U. DEL PACÍFICO,Nombre Institución_U. GABRIELA MISTRAL,Nombre Institución_U. LA REPÚBLICA,Nombre Institución_U. MAYOR,Nombre Institución_U. METROPOLITANA DE CIENCIAS DE LA EDUCACIÓN,Nombre Institución_U. SAN SEBASTIÁN,Nombre Institución_U. SANTO TOMÁS,Nombre Institución_U. TECNOLÓGICA DE CHILE INACAP,Nombre Institución_U. TECNOLÓGICA METROPOLITANA,Nombre Institución_U. TÉCNICA FEDERICO SANTA MARÍA,Nombre Institución_infrequent_sklearn,Sede_Angol,Sede_Antofagasta,Sede_Arica,Sede_Buin,Sede_Calama,Sede_Cauquenes,Sede_Cañete,Sede_Chillán,Sede_Concepción,Sede_Constitución,Sede_Copiapó,Sede_Coquimbo,Sede_Coyhaique,Sede_Curicó,Sede_Iquique,Sede_La Ligua,Sede_La Reina,Sede_La Serena,Sede_Las Condes,Sede_Linares,Sede_Los Andes,Sede_Los Ángeles,Sede_Melipilla,Sede_Osorno,Sede_Ovalle,Sede_Providencia,Sede_Puerto Montt,Sede_Punta Arenas,Sede_Quillota,Sede_Rancagua,Sede_San Antonio,Sede_San Felipe,Sede_San Fernando,Sede_Santiago,Sede_Talca,Sede_Talcahuano,Sede_Temuco,Sede_Valdivia,Sede_Vallenar,Sede_Valparaíso,Sede_Vitacura,Sede_Viña del Mar,Sede_infrequent_sklearn
0,0.0,1001.0,1001001.0,2022.0,150.0,109043910.0,705759.0,50986.0,3500.0,708.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1001.0,1001001.0,2021.0,140.0,109046688.0,692010.0,51599.0,3500.0,736.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1001.0,1001001.0,2020.0,140.0,109097053.0,681132.0,50716.0,3500.0,707.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1001.0,1001001.0,2019.0,140.0,109097053.0,680498.0,44829.0,3500.0,768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1001.0,1001001.0,2018.0,140.0,108016252.0,675606.0,43285.0,3489.0,775.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(5567, 115)


Unnamed: 0,Tipo Institución,Cód. Institución,idSede,Año Proceso,Nº Laboratorios,M2 Construido,Nº de PC para alumnos,N° Computadores con Internet,Nombre Institución_C.F.T. ANDRÉS BELLO,Nombre Institución_C.F.T. CEDUC - UCN,Nombre Institución_C.F.T. ENAC,Nombre Institución_C.F.T. INACAP,Nombre Institución_C.F.T. LOS LAGOS,Nombre Institución_C.F.T. LOTA-ARAUCO,Nombre Institución_C.F.T. MASSACHUSETTS,Nombre Institución_C.F.T. PROANDES,Nombre Institución_C.F.T. PUCV,Nombre Institución_C.F.T. SALESIANOS DON BOSCO,Nombre Institución_C.F.T. SAN AGUSTÍN DE TALCA,Nombre Institución_C.F.T. SANTO TOMÁS,Nombre Institución_C.F.T. TEODORO WICKEL,Nombre Institución_I.P. AIEP,Nombre Institución_I.P. DE ARTES Y COMUNICACIÓN ARCOS,Nombre Institución_I.P. DE CHILE,Nombre Institución_I.P. DE LOS LAGOS,Nombre Institución_I.P. DEL VALLE CENTRAL,Nombre Institución_I.P. DIEGO PORTALES,Nombre Institución_I.P. DUOC UC,Nombre Institución_I.P. ESCUELA MODERNA DE MÚSICA,Nombre Institución_I.P. GUILLERMO SUBERCASEAUX,Nombre Institución_I.P. INACAP,Nombre Institución_I.P. INTERNACIONAL DE ARTES CULINARIAS Y SERVICIOS,Nombre Institución_I.P. IPEGE,Nombre Institución_I.P. LA ARAUCANA,Nombre Institución_I.P. LATINOAMERICANO DE COMERCIO EXTERIOR,Nombre Institución_I.P. LOS LEONES,Nombre Institución_I.P. PROVIDENCIA,Nombre Institución_I.P. SANTO TOMÁS,Nombre Institución_I.P. VIRGINIO GÓMEZ,Nombre Institución_PONTIFICIA U. CATÓLICA DE CHILE,Nombre Institución_U. ADOLFO IBÁÑEZ,Nombre Institución_U. ANDRÉS BELLO,Nombre Institución_U. ARTURO PRAT,Nombre Institución_U. AUSTRAL DE CHILE,Nombre Institución_U. AUTÓNOMA DE CHILE,Nombre Institución_U. BOLIVARIANA,Nombre Institución_U. CATÓLICA DE LA SANTÍSIMA CONCEPCIÓN,Nombre Institución_U. CATÓLICA DEL MAULE,Nombre Institución_U. CATÓLICA DEL NORTE,Nombre Institución_U. CENTRAL DE CHILE,Nombre Institución_U. DE ACONCAGUA,Nombre Institución_U. DE ARTE Y CIENCIAS SOCIALES ARCIS,Nombre Institución_U. DE ATACAMA,Nombre Institución_U. DE LA SERENA,Nombre Institución_U. DE LAS AMÉRICAS,Nombre Institución_U. DE LOS LAGOS,Nombre Institución_U. DE MAGALLANES,Nombre Institución_U. DE PLAYA ANCHA DE CIENCIAS DE LA EDUCACIÓN,Nombre Institución_U. DE TALCA,Nombre Institución_U. DE TARAPACÁ,Nombre Institución_U. DE VALPARAÍSO,Nombre Institución_U. DE VIÑA DEL MAR,Nombre Institución_U. DEL ALBA,Nombre Institución_U. DEL BÍO-BÍO,Nombre Institución_U. DEL DESARROLLO,Nombre Institución_U. DEL MAR,Nombre Institución_U. DEL PACÍFICO,Nombre Institución_U. GABRIELA MISTRAL,Nombre Institución_U. LA REPÚBLICA,Nombre Institución_U. MAYOR,Nombre Institución_U. METROPOLITANA DE CIENCIAS DE LA EDUCACIÓN,Nombre Institución_U. SAN SEBASTIÁN,Nombre Institución_U. SANTO TOMÁS,Nombre Institución_U. TECNOLÓGICA DE CHILE INACAP,Nombre Institución_U. TÉCNICA FEDERICO SANTA MARÍA,Nombre Institución_infrequent_sklearn,Sede_Angol,Sede_Antofagasta,Sede_Arica,Sede_Calama,Sede_Cañete,Sede_Chillán,Sede_Concepción,Sede_Constitución,Sede_Copiapó,Sede_Coquimbo,Sede_Coyhaique,Sede_Curicó,Sede_Iquique,Sede_La Serena,Sede_Las Condes,Sede_Linares,Sede_Los Andes,Sede_Los Ángeles,Sede_Melipilla,Sede_Osorno,Sede_Ovalle,Sede_Providencia,Sede_Puerto Montt,Sede_Punta Arenas,Sede_Quillota,Sede_Rancagua,Sede_San Antonio,Sede_San Felipe,Sede_San Fernando,Sede_Santiago,Sede_Talca,Sede_Talcahuano,Sede_Temuco,Sede_Valdivia,Sede_Vallenar,Sede_Valparaíso,Sede_Vitacura,Sede_Viña del Mar,Sede_infrequent_sklearn
0,0.0,1001.0,1001001.0,2022.0,1091.0,64706.0,5325.0,5231.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1001.0,1001001.0,2021.0,1103.0,65251.0,5190.0,5089.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1001.0,1001001.0,2020.0,1103.0,65251.0,4718.0,4624.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1001.0,1001001.0,2019.0,1012.0,69207.0,4280.0,4177.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1001.0,1001001.0,2018.0,996.0,68435.0,4248.0,4248.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(5845, 127)


Unnamed: 0,Cód. Institución,idSede,Tipo Institución,Año Proceso,N°DocentesJornadaMedia,N°DocentesJornadaHora,N°DocentesJornadaCompleta,N°HorasJornadaCompleta,N°HorasJornadaMedia,N°HorasJornadaHora,N°HorasProfJornadaHora,N°HorasProfJornadaMedia,N°HorasProfJornadaCompleta,N°ProfesionalJornadaHora,N°ProfesionalJornadaMedia,N°ProfesionalJornadaCompleta,N°Docentes,N°Horas,N°DocentesHombres,N°DocentesMujeres,Nombre Institución_C.F.T. ANDRÉS BELLO,Nombre Institución_C.F.T. CEDUC - UCN,Nombre Institución_C.F.T. DE TARAPACÁ,Nombre Institución_C.F.T. DIEGO PORTALES,Nombre Institución_C.F.T. INACAP,Nombre Institución_C.F.T. LOS LAGOS,Nombre Institución_C.F.T. PROANDES,Nombre Institución_C.F.T. PUCV,Nombre Institución_C.F.T. SALESIANOS DON BOSCO,Nombre Institución_C.F.T. SAN AGUSTÍN DE TALCA,Nombre Institución_C.F.T. SANTO TOMÁS,Nombre Institución_C.F.T. TEODORO WICKEL,Nombre Institución_I.P. AIEP,Nombre Institución_I.P. DE ARTES Y COMUNICACIÓN ARCOS,Nombre Institución_I.P. DE CHILE,Nombre Institución_I.P. DE LOS LAGOS,Nombre Institución_I.P. DEL VALLE CENTRAL,Nombre Institución_I.P. DIEGO PORTALES,Nombre Institución_I.P. DUOC UC,Nombre Institución_I.P. ESCUELA MODERNA DE MÚSICA,Nombre Institución_I.P. GUILLERMO SUBERCASEAUX,Nombre Institución_I.P. INACAP,Nombre Institución_I.P. INTERNACIONAL DE ARTES CULINARIAS Y SERVICIOS,Nombre Institución_I.P. IPEGE,Nombre Institución_I.P. LA ARAUCANA,Nombre Institución_I.P. LATINOAMERICANO DE COMERCIO EXTERIOR,Nombre Institución_I.P. LOS LEONES,Nombre Institución_I.P. PROVIDENCIA,Nombre Institución_I.P. SANTO TOMÁS,Nombre Institución_I.P. VIRGINIO GÓMEZ,Nombre Institución_PONTIFICIA U. CATÓLICA DE CHILE,Nombre Institución_U. ADOLFO IBÁÑEZ,Nombre Institución_U. ANDRÉS BELLO,Nombre Institución_U. ARTURO PRAT,Nombre Institución_U. AUSTRAL DE CHILE,Nombre Institución_U. AUTÓNOMA DE CHILE,Nombre Institución_U. BOLIVARIANA,Nombre Institución_U. CATÓLICA DE LA SANTÍSIMA CONCEPCIÓN,Nombre Institución_U. CATÓLICA DEL MAULE,Nombre Institución_U. CATÓLICA DEL NORTE,Nombre Institución_U. CENTRAL DE CHILE,Nombre Institución_U. DE ACONCAGUA,Nombre Institución_U. DE CONCEPCIÓN,Nombre Institución_U. DE LA FRONTERA,Nombre Institución_U. DE LA SERENA,Nombre Institución_U. DE LAS AMÉRICAS,Nombre Institución_U. DE LOS LAGOS,Nombre Institución_U. DE MAGALLANES,Nombre Institución_U. DE PLAYA ANCHA DE CIENCIAS DE LA EDUCACIÓN,Nombre Institución_U. DE TALCA,Nombre Institución_U. DE TARAPACÁ,Nombre Institución_U. DE VALPARAÍSO,Nombre Institución_U. DE VIÑA DEL MAR,Nombre Institución_U. DEL ALBA,Nombre Institución_U. DEL BÍO-BÍO,Nombre Institución_U. DEL DESARROLLO,Nombre Institución_U. DEL MAR,Nombre Institución_U. DEL PACÍFICO,Nombre Institución_U. GABRIELA MISTRAL,Nombre Institución_U. LA REPÚBLICA,Nombre Institución_U. MAYOR,Nombre Institución_U. METROPOLITANA DE CIENCIAS DE LA EDUCACIÓN,Nombre Institución_U. SAN SEBASTIÁN,Nombre Institución_U. SANTO TOMÁS,Nombre Institución_U. TECNOLÓGICA METROPOLITANA,Nombre Institución_U. TÉCNICA FEDERICO SANTA MARÍA,Nombre Institución_infrequent_sklearn,Sede_Angol,Sede_Antofagasta,Sede_Arica,Sede_Bellavista,Sede_Calama,Sede_Cañete,Sede_Chillán,Sede_Concepción,Sede_Constitución,Sede_Copiapó,Sede_Coquimbo,Sede_Coyhaique,Sede_Curicó,Sede_Iquique,Sede_La Serena,Sede_Las Condes,Sede_Linares,Sede_Los Andes,Sede_Los Ángeles,Sede_Melipilla,Sede_Osorno,Sede_Ovalle,Sede_Providencia,Sede_Puerto Montt,Sede_Punta Arenas,Sede_Quillota,Sede_Rancagua,Sede_San Antonio,Sede_San Felipe,Sede_San Fernando,Sede_Santiago,Sede_Talca,Sede_Talcahuano,Sede_Temuco,Sede_Valdivia,Sede_Valparaíso,Sede_Villarrica,Sede_Vitacura,Sede_Viña del Mar,Sede_infrequent_sklearn
0,1001.0,1001001.0,0.0,2022.0,680.0,1584.0,1727.0,75316.0,15040.0,14201.0,4117.0,4299.0,8129.0,464.0,194.0,190.0,3991.0,104557.0,2440.0,1551.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1001.0,1001001.0,0.0,2021.0,708.0,1544.0,1739.0,75838.0,15663.0,13929.0,4523.0,4771.0,8643.0,508.0,215.0,202.0,3991.0,105430.0,2454.0,1537.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1001.0,1001001.0,0.0,2020.0,684.0,1496.0,1762.0,76737.0,15169.0,13609.0,5710.0,5872.0,11418.0,617.0,265.0,268.0,3942.0,105515.0,2441.0,1501.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1001.0,1001001.0,0.0,2019.0,688.0,1491.0,1779.0,77630.0,15293.0,13301.0,5665.0,6180.0,11812.0,632.0,278.0,276.0,3958.0,106224.0,2481.0,1477.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1001.0,1001001.0,0.0,2018.0,692.0,1420.0,1744.0,76140.0,15383.0,12818.0,5393.0,5927.0,12802.0,583.0,267.0,296.0,3856.0,104341.0,2446.0,1410.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


(39726, 41)


Unnamed: 0,Fecha,Cód. Institución,Nombre Institución,Tipo Institución,Clasificación1,Clasificación2,Clasificación3,Clasificación4,Clasificación5,Clasificación6,Comuna donde se imparte la carrera o programa,Nombre Region,Cód. Carrera,Carrera Genérica,Nombre Programa,Horario,Tipo Programa,Area Conocimiento,Tipo Carrera,IngresoDirecto,Año Inicio Actividades,Nombre del Campus,Duración (en semestres),Cód. Sede,Título,Grado Académico,Promedio Puntaje (promedio matemáticas y lenguaje),Puntaje de corte (primer seleccionado),Puntaje de corte (promedio de la carrera),Puntaje de corte (último seleccionado),Nº Alumnos Ingreso Via PSU o PDT,Valor de matrícula,Valor de arancel,Valor del Título,Vacantes,Matrícula primer año hombres,Matrícula primer año mujeres,Matrícula Primer Año,Matrícula total hombres,Matrícula total mujeres,Matrícula Total
0,2023-01-01,1001,U. DE CHILE,0,0,0,0,0,0,0,Santiago,Región Metropolitana,7444,Administración pública y similares,Administración Pública,Diurno,Programa Regular,Administración y Comercio,Profesional con Licenciatura,Ingreso Directo,1954,Andrés Bello,11.0,1001001,Administrador Público,Licenciatura en Ciencias Políticas y Gubername...,702.0,919.0,758.0,708.0,129.0,177300.0,4861900.0,121000.0,130.0,68.0,82.0,150,302.0,392.0,694
1,2023-01-01,1001,U. DE CHILE,0,0,0,0,0,0,0,Santiago,Región Metropolitana,27288,"Contabilidad, Auditoría y similares",Contador Auditor,Diurno,Programa Regular,Administración y Comercio,Profesional con Licenciatura,Ingreso Directo,2013,Andrés Bello,10.0,1001001,Contador Auditor,Licenciado en Sistemas de Información y Contro...,766.0,885.0,750.0,704.0,84.0,177300.0,6137900.0,121000.0,70.0,54.0,45.0,99,191.0,205.0,396
2,2023-01-01,1001,U. DE CHILE,0,0,0,0,0,0,0,Santiago,Región Metropolitana,7420,Ingeniería Comercial,Ingeniería Comercial,Diurno,Programa Regular,Administración y Comercio,Profesional con Licenciatura,Ingreso Directo,1924,Andrés Bello,10.0,1001001,Ingeniero Comercial,Licenciatura en Ciencias Económicas o en Cienc...,863.0,963.0,838.0,803.0,419.0,177300.0,6844100.0,121000.0,380.0,353.0,234.0,587,1535.0,1126.0,2661
3,2023-01-01,1001,U. DE CHILE,0,0,0,0,0,0,0,Santiago,Región Metropolitana,27287,Ingeniería en Control de Gestión y similares,Ingeniería en Información y Control de Gestión,Diurno,Programa Regular,Administración y Comercio,Profesional con Licenciatura,Ingreso Directo,2013,Andrés Bello,10.0,1001001,Ingeniero en Información y Control de Gestión,Licenciado en Sistemas de Información y Contro...,771.0,881.0,752.0,701.0,172.0,177300.0,6137900.0,121000.0,140.0,101.0,86.0,187,395.0,358.0,753
4,2023-01-01,1001,U. DE CHILE,0,0,0,0,0,0,0,La Pintana,Región Metropolitana,7417,Ingeniería Agronómica,Ingeniería Agronómica,Diurno,Programa Regular,"Agricultura, Silvicultura, Pesca y Veterinaria",Profesional con Licenciatura,Ingreso Directo,1915,Sur,10.0,1001001,Ingeniero Agrónomo,Licenciado en Ciencias Agropecuarias,649.0,830.0,674.0,603.0,129.0,177300.0,5681600.0,121000.0,130.0,76.0,56.0,132,378.0,268.0,646


In [9]:
print(f"Total number of columns: {inmuebles.shape[1] + labs.shape[1] + docentes.shape[1]}")

Total number of columns: 366


In [10]:
X = set(df['Cód. Sede'])
Y = set(inmuebles['idSede'])

In [11]:
len(X), len(Y), len(X & Y), len(X - Y), len(Y - X)

(269, 678, 266, 3, 412)

In [12]:
def prepare_for_merge(df: pd.DataFrame, year_col: str) -> pd.DataFrame:
    df['Año'] = df[year_col].astype(int)
    df = df.drop(columns=[year_col])
    df = df.rename(columns={'idSede': 'Cód. Sede'})
    return df

inmuebles = prepare_for_merge(inmuebles, year_col='Año Información')
labs = prepare_for_merge(labs, year_col='Año Proceso')
docentes = prepare_for_merge(docentes, year_col='Año Proceso')

In [13]:
df['Año'] = df['Fecha'].dt.year

In [28]:
(labs['Nombre Institución_infrequent_sklearn'] == 1).sum()

KeyError: 'Nombre Institución_infrequent_sklearn'

In [14]:
MERGE_COLS = ['Año', 'Cód. Sede']

df = df.merge(inmuebles, on=MERGE_COLS, how='left')
df = df.merge(labs, on=MERGE_COLS, how='left')
df = df.merge(docentes, on=MERGE_COLS, how='left')

MergeError: Passing 'suffixes' which cause duplicate columns {'Cód. Institución_x', 'Tipo Institución_x'} is not allowed.

In [None]:
df