In [24]:
from pathlib import Path
import joblib

import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder

from pipeline import (
    DropHighNAPercentage,
    m_inst,
    OrdinalColumnMapper,
    DropColumns,
    DataframeOneHotEncoder,
)

# Let pandas render up to 100 columns before it starts eliding the middle ones.
pd.options.display.max_columns = 100

In [2]:
# All paths hang off the parent of the current working directory.
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR.joinpath("data")
DATA_PATH = DATA_DIR.joinpath('indices_institucional_2005_2022.xlsx')

# Last expression: the cell displays whether the workbook is present on disk.
DATA_PATH.exists()

True

In [3]:
def show(df: pd.DataFrame, limit: int = 5) -> None:
    """Print the shape of ``df`` and render its first ``limit`` rows.

    Args:
        df: Frame to preview.
        limit: Number of leading rows passed to ``DataFrame.head``.
    """
    dimensions = df.shape
    print(dimensions)

    preview = df.head(limit)
    display(preview)

In [4]:
def show_null_percentages(df: pd.DataFrame) -> None:
    """Display the fraction of missing values per column, highest first.

    Args:
        df: Frame whose columns are inspected.
    """
    null_fraction = (
        df.isna()
        .sum()
        .div(len(df))
        .sort_values(ascending=False)
    )
    display(null_fraction)

### Loading Data

In [6]:
# One-off extraction of the auxiliary sheets from the Excel workbook into a
# joblib cache. The work is guarded by an existence check so that a fresh
# checkout can rebuild the cache, while later runs skip the Excel parsing.
EXTRA_DATA_PATH = DATA_DIR / 'extra_data.pkl'
if not EXTRA_DATA_PATH.exists():
    inmuebles = pd.read_excel(DATA_PATH, sheet_name="Inmuebles")
    labs = pd.read_excel(DATA_PATH, sheet_name="Laboratorios y Talleres")
    docentes = pd.read_excel(DATA_PATH, sheet_name="Docentes")

    # The three frames are stored together as one tuple, in this order.
    extra_data = inmuebles, labs, docentes
    joblib.dump(extra_data, EXTRA_DATA_PATH)

['/home/pabloskewes/FCFM/Minería de Datos/proyecto-mineria-de-datos/data/extra_data.pkl']

In [21]:
# Restore the cached inputs: the three auxiliary sheets (stored together as a
# tuple) and the frame saved in processed_df.pkl.
# NOTE(review): joblib.load unpickles arbitrary objects — only load trusted files.
inmuebles, labs, docentes = joblib.load(DATA_DIR.joinpath('extra_data.pkl'))
df = joblib.load(DATA_DIR.joinpath('processed_df.pkl'))

### Preprocessing Data

In [22]:
# Shared transformers, reused for every table through fit_transform.
inputer = KNNImputer()
# Ask scikit-learn for DataFrame output so column names survive imputation.
inputer.set_output(transform="pandas")

ordinal_mapper = OrdinalColumnMapper(
    columns=["Tipo Institución"],
    mappings=[m_inst],
)
high_na_perc_dropper = DropHighNAPercentage(na_threshold=0.3, exclude=[])
# Encode only the two text columns. scikit-learn's OneHotEncoder does not take
# a `transform` constructor argument (passing it raises TypeError), and without
# a column selection it would encode the numeric columns as well, so the
# project's column-scoped encoder is used here.
one_hot_encoder = DataframeOneHotEncoder(columns=["Nombre Institución", "Sede"])

# process inmuebles
# NOTE(review): this rebinds `inmuebles`, so re-running the cell without
# reloading the data will fail once "idInstitucion" has been dropped.
inmuebles = DropColumns(columns_to_drop=["idInstitucion"]).fit_transform(inmuebles)
inmuebles = high_na_perc_dropper.fit_transform(inmuebles)
inmuebles = ordinal_mapper.fit_transform(inmuebles)
inmuebles = one_hot_encoder.fit_transform(inmuebles)
inmuebles = inputer.fit_transform(inmuebles)




In [23]:
# Preview the processed inmuebles table: prints its shape, then its first rows.
show(inmuebles)

(5823, 356)


Unnamed: 0,Tipo Institución,Cód. Institución,idSede,Año Información,N° Inmuebles,M2 Terreno,M2 Construido,M2 Salas,Nº Oficinas,Nº Salas,Nombre Institución_ACADEMIA DE CIENCIAS POLICIALES DE CARABINEROS DE CHILE,Nombre Institución_ACADEMIA DE GUERRA AÉREA,Nombre Institución_ACADEMIA DE GUERRA DEL EJÉRCITO,Nombre Institución_ACADEMIA DE GUERRA NAVAL,Nombre Institución_ACADEMIA NAC. EST. POLÍTICOS Y ESTRATÉGICOS ANEPE,Nombre Institución_ACADEMIA POLITÉCNICA AERONÁUTICA,Nombre Institución_ACADEMIA POLITÉCNICA MILITAR,Nombre Institución_ACADEMIA POLITÉCNICA NAVAL,Nombre Institución_C.F.T. ACCIOMA,Nombre Institución_C.F.T. ICCE,Nombre Institución_C.F.T. ACUARIO DATA,Nombre Institución_C.F.T. ALEMÁN DE VIÑA DEL MAR,Nombre Institución_C.F.T. ALFA,Nombre Institución_C.F.T. ALPES,Nombre Institución_C.F.T. ANDRÉS BELLO,Nombre Institución_C.F.T. AUSTRAL,Nombre Institución_C.F.T. BARROS ARANA,Nombre Institución_C.F.T. CEDEP,Nombre Institución_C.F.T. CEDUC - UCN,Nombre Institución_C.F.T. CEITEC,Nombre Institución_C.F.T. CENAFOM,Nombre Institución_C.F.T. CENCO,Nombre Institución_C.F.T. CENTRO DE ENS. ALTA COSTURA PAULINA DIARD,Nombre Institución_C.F.T. CENTRO DE EST. PARAMÉDICOS DE STGO. CEPSA,Nombre Institución_C.F.T. CENTRO TECNOLÓGICO SUPERIOR INFOMED,Nombre Institución_C.F.T. CEPA DE LA III REGIÓN,Nombre Institución_C.F.T. CEPONAL,Nombre Institución_C.F.T. CHILENO-NORTEAMERICANO,Nombre Institución_C.F.T. CIMA RENGO,Nombre Institución_C.F.T. CRECIC,Nombre Institución_C.F.T. CROWNLIET,Nombre Institución_C.F.T. DE EST. SUP.Y CAPACITACIÓN PROFESIONAL LAPLACE,Nombre Institución_C.F.T. DE LA INDUSTRIA GRÁFICA O C.F.T. INGRAF,Nombre Institución_C.F.T. DE TARAPACÁ,Nombre Institución_C.F.T. DEL MEDIO AMBIENTE,Nombre Institución_C.F.T. DIEGO PORTALES,Nombre Institución_C.F.T. DUOC UC,Nombre Institución_C.F.T. ECATEMA,Nombre Institución_C.F.T. EDUCAP,Nombre Institución_C.F.T. 
EL ROBLE,...,Sede_Mejillones,Sede_Melipilla,Sede_Osorno,Sede_Ovalle,Sede_Paillaco,Sede_Panguipulli,Sede_Patagonia (Puerto Montt),Sede_Peñalolén,Sede_Portezuelo,Sede_Porvenir,Sede_Providencia,Sede_Pucón,Sede_Puente Alto,Sede_Puerto Aysén,Sede_Puerto Montt,Sede_Puerto Natales,Sede_Puerto Saavedra,Sede_Puerto Varas,Sede_Punta Arenas,Sede_Quilicura,Sede_Quillota,Sede_Quilpué,Sede_Rancagua,Sede_Renca,Sede_Rengo,Sede_Reñaca,Sede_Río Bueno,Sede_San Antonio,Sede_San Bernardo,Sede_San Felipe,Sede_San Fernando,Sede_San Vicente,Sede_Santiago,Sede_Sede Virtual Online,Sede_Talca,Sede_Talcahuano,Sede_Temuco,Sede_Tirúa,Sede_Tocopilla,Sede_Unión,Sede_Valdivia,Sede_Vallenar,Sede_Valparaíso,Sede_Victoria,Sede_Villarrica,Sede_Virtual,Sede_Vitacura,Sede_Viña del Mar,Sede_Ñuble,Sede_Ñuñoa
0,0.0,1001.0,1001001.0,2022.0,150.0,109043910.0,705759.0,50986.0,3500.0,708.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1001.0,1001001.0,2021.0,140.0,109046688.0,692010.0,51599.0,3500.0,736.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1001.0,1001001.0,2020.0,140.0,109097053.0,681132.0,50716.0,3500.0,707.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1001.0,1001001.0,2019.0,140.0,109097053.0,680498.0,44829.0,3500.0,768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1001.0,1001001.0,2018.0,140.0,108016252.0,675606.0,43285.0,3489.0,775.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:

# process labs
# Steps run in order; each one is fitted on, and replaces, the current frame.
for _step in (ordinal_mapper, one_hot_encoder, inputer):
    labs = _step.fit_transform(labs)

# process docentes
# Same chain as labs, preceded by the high-NA column dropper.
for _step in (high_na_perc_dropper, ordinal_mapper, one_hot_encoder, inputer):
    docentes = _step.fit_transform(docentes)

In [20]:
# Output column names reported by the imputer.
# NOTE(review): `inputer` is shared across tables, so this presumably reflects
# whichever table it was fitted on last — confirm before relying on it.
inputer.get_feature_names_out()

array(['Cód. Institución', 'idSede', 'Tipo Institución', 'Año Proceso',
       'N°DocentesJornadaMedia', 'N°DocentesJornadaHora',
       'N°DocentesJornadaCompleta', 'N°HorasJornadaCompleta',
       'N°HorasJornadaMedia', 'N°HorasJornadaHora',
       'N°HorasProfJornadaHora', 'N°HorasProfJornadaMedia',
       'N°HorasProfJornadaCompleta', 'N°ProfesionalJornadaHora',
       'N°ProfesionalJornadaMedia', 'N°ProfesionalJornadaCompleta',
       'N°Docentes', 'N°Horas', 'N°DocentesHombres', 'N°DocentesMujeres',
       'Nombre Institución_ACADEMIA DE CIENCIAS POLICIALES DE CARABINEROS DE CHILE',
       'Nombre Institución_ACADEMIA DE GUERRA AÉREA',
       'Nombre Institución_ACADEMIA DE GUERRA DEL EJÉRCITO',
       'Nombre Institución_ACADEMIA DE GUERRA NAVAL',
       'Nombre Institución_ACADEMIA NAC. EST. POLÍTICOS Y ESTRATÉGICOS ANEPE',
       'Nombre Institución_ACADEMIA POLITÉCNICA AERONÁUTICA',
       'Nombre Institución_ACADEMIA POLITÉCNICA MILITAR',
       'Nombre Institución_ACADEMIA 

In [18]:
# Preview every table: the three auxiliary sheets and the main frame.
# NOTE(review): show() calls .head(), so each argument must be a DataFrame; the
# recorded AttributeError came from a run where one of these was a NumPy array.
show(inmuebles)
show(labs)
show(docentes)
show(df)

(5823, 356)


AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [21]:
# Distinct values of the two columns being compared:
# X from the main frame's 'Cód. Sede', Y from docentes' 'idSede'.
# NOTE(review): presumably both hold campus (sede) identifiers — confirm they
# use the same coding scheme before joining on them.
X = set(df['Cód. Sede'])
Y = set(docentes['idSede'])

In [22]:
# Sizes of: X, Y, their intersection, values only in X, values only in Y.
len(X), len(Y), len(X & Y), len(X - Y), len(Y - X)

(269, 665, 262, 7, 403)

In [13]:
# Fraction of missing values per column in inmuebles, highest first.
# NOTE(review): the recorded output still lists idInstitucion, which the
# preprocessing cell drops, so it was produced on the raw sheet; a
# top-to-bottom re-run would report on the processed frame instead.
show_null_percentages(inmuebles)

Conectividad Megas    0.768676
Nº Oficinas           0.115233
M2 Salas              0.046196
Nº Salas              0.029538
M2 Terreno            0.020951
N° Inmuebles          0.016315
M2 Construido         0.015971
Tipo Institución      0.000000
Cód. Institución      0.000000
Nombre Institución    0.000000
idInstitucion         0.000000
idSede                0.000000
Sede                  0.000000
Año Información       0.000000
dtype: float64

In [14]:
# Fraction of missing values per column in labs, highest first.
show_null_percentages(labs)

N° Computadores con Internet    0.039698
Nº de PC para alumnos           0.036465
M2 Construido                   0.015089
Nº Laboratorios                 0.009880
Tipo Institución                0.000000
Cód. Institución                0.000000
Nombre Institución              0.000000
idSede                          0.000000
Sede                            0.000000
Año Proceso                     0.000000
dtype: float64

In [15]:
# Fraction of missing values per column in docentes, highest first.
show_null_percentages(docentes)

N°HorasOtroJornadaCompleta,        0.919247
N°OtroJornadaCompleta              0.918905
N°OtroJornadaMedia                 0.916339
N°HorasOtroJornadaMedia            0.916168
N°HorasEspeJornadaCompleta         0.894440
N°EspecialidadesJornadaCompleta    0.893926
N°HorasEspeJornadaMedia            0.887938
N°EspecialidadesJornadaMedia       0.887254
N°HorasEspeJornadaHora             0.829940
N°EspecialidadesJornadaHora        0.829085
 N°HorasTécnicoJornadaMedia        0.758939
N°TecnicoJornadaMedia              0.758597
N°HorasTécnicoJornadaCompleta      0.756202
N°TecnicoJornadaCompleta           0.756031
N°HorasDoctorJornadaMedia          0.748503
N°DoctorJornadaMedia               0.747134
N°HorasOtroJornadaHora             0.740804
N°OtroJornadaHora                  0.740120
N°HorasDoctorJornadaCompleta       0.697519
N°DoctorJornadaCompleta            0.695979
N°HorasDoctorJornadaHora           0.618648
N°DoctorJornadaHora                0.616253
N°HorasMagísterJornadaMedia     

In [10]:
df[df["Cód. Institución"] == df["id"

Unnamed: 0,Año,Cód. Institución,Nombre Institución,Tipo Institución,Clasificación1,Clasificación2,Clasificación3,Clasificación4,Clasificación5,Clasificación6,...,Valor de matrícula,Valor de arancel,Valor del Título,Vacantes,Matrícula primer año hombres,Matrícula primer año mujeres,Matrícula Primer Año,Matrícula total hombres,Matrícula total mujeres,Matrícula Total
0,2023-01-01,1001,U. DE CHILE,0,0,0,0,0,0,0,...,177300.0,4861900.0,121000.0,130.0,68.0,82.0,150,302.0,392.0,694
1,2023-01-01,1001,U. DE CHILE,0,0,0,0,0,0,0,...,177300.0,6137900.0,121000.0,70.0,54.0,45.0,99,191.0,205.0,396
2,2023-01-01,1001,U. DE CHILE,0,0,0,0,0,0,0,...,177300.0,6844100.0,121000.0,380.0,353.0,234.0,587,1535.0,1126.0,2661
3,2023-01-01,1001,U. DE CHILE,0,0,0,0,0,0,0,...,177300.0,6137900.0,121000.0,140.0,101.0,86.0,187,395.0,358.0,753
4,2023-01-01,1001,U. DE CHILE,0,0,0,0,0,0,0,...,177300.0,5681600.0,121000.0,130.0,76.0,56.0,132,378.0,268.0,646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39721,2005-01-01,3025,C.F.T. INSTITUTO TECNOLÓGICO DE CHILE,2,3,5,1,0,1,2,...,47000.0,930000.0,232500.0,50.0,10.0,3.0,13,10.0,3.0,13
39722,2005-01-01,3025,C.F.T. INSTITUTO TECNOLÓGICO DE CHILE,2,3,5,1,0,1,2,...,47000.0,930000.0,232500.0,50.0,9.0,0.0,9,55.0,0.0,55
39723,2005-01-01,3025,C.F.T. INSTITUTO TECNOLÓGICO DE CHILE,2,3,5,1,0,1,2,...,47000.0,930000.0,232500.0,50.0,6.0,0.0,6,19.0,1.0,20
39724,2005-01-01,3025,C.F.T. INSTITUTO TECNOLÓGICO DE CHILE,2,3,5,1,0,1,2,...,47000.0,930000.0,232500.0,50.0,14.0,4.0,18,77.0,11.0,88
