In [None]:
from pathlib import Path
import joblib

import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder

from pipeline import (
    DropHighNAPercentage,
    m_inst,
    OrdinalColumnMapper,
    DropColumns,
    DataframeOneHotEncoder,
)

# Display up to 100 columns when pandas renders a DataFrame.
pd.set_option("display.max_columns", 100)

In [None]:
# Paths are resolved relative to the parent of the current working directory.
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR.joinpath("data")

# Excel workbook inside the data directory; the trailing expression reports
# whether that file is present.
DATA_PATH = DATA_DIR.joinpath("indices_institucional_2005_2022.xlsx")
DATA_PATH.exists()

In [None]:
def show(df: pd.DataFrame, limit: int = 5) -> None:
    """Print the frame's shape and render its first rows.

    Args:
        df: Frame to inspect.
        limit: Number of leading rows passed to ``DataFrame.head``.
    """
    print(df.shape)
    preview = df.head(limit)
    display(preview)

In [None]:
def show_null_percentages(df: pd.DataFrame) -> None:
    """Render the share of null values per column, largest first.

    The values are fractions of the row count (null count divided by the
    number of rows), not values scaled to 0-100.

    Args:
        df: Frame whose columns are checked for nulls.
    """
    null_share = (df.isna().sum() / len(df)).sort_values(ascending=False)
    display(null_share)

### Loading Data

In [None]:
# One-off cache build, kept commented out: reads three sheets from the Excel
# workbook at DATA_PATH and pickles them together as DATA_DIR / 'extra_data.pkl'.
# Uncomment and run once if that pickle needs to be (re)created.
# inmuebles = pd.read_excel(DATA_PATH, sheet_name="Inmuebles")
# labs = pd.read_excel(DATA_PATH, sheet_name="Laboratorios y Talleres")
# docentes = pd.read_excel(DATA_PATH, sheet_name="Docentes")

# extra_data = inmuebles, labs, docentes
# joblib.dump(extra_data, DATA_DIR / 'extra_data.pkl')

In [None]:
# Load the cached extra tables (a three-item pickle) and the main frame.
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
inmuebles, labs, docentes = joblib.load(DATA_DIR.joinpath('extra_data.pkl'))
df = joblib.load(DATA_DIR.joinpath('processed_df.pkl'))

### Preprocessing Data

In [None]:
# pipes: preprocessing steps shared by the inmuebles, labs and docentes tables.
inputer = KNNImputer()
inputer.set_output(transform="pandas")

ordinal_mapper = OrdinalColumnMapper(
    columns=["Tipo Institución"],
    mappings=[m_inst],
)
high_na_perc_dropper = DropHighNAPercentage(na_threshold=0.3, exclude=[])
# one_hot_encoder = DataframeOneHotEncoder(columns=["Nombre Institución", "Sede"])
# OneHotEncoder has no `transform` constructor argument (passing it raises
# TypeError); DataFrame output is requested through set_output(), exactly as
# done for the imputer above.
# NOTE(review): without a column selection this encoder is fitted on every
# column it receives, not only the two named in the commented-out line above;
# confirm that is intended.
one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoder.set_output(transform="pandas")

# process inmuebles: each step is fitted on, and replaces, the running frame.
inmuebles = DropColumns(columns_to_drop=["idInstitucion"]).fit_transform(inmuebles)
inmuebles = high_na_perc_dropper.fit_transform(inmuebles)
inmuebles = ordinal_mapper.fit_transform(inmuebles)
inmuebles = one_hot_encoder.fit_transform(inmuebles)
inmuebles = inputer.fit_transform(inmuebles)


In [None]:
# Print the shape and preview the first rows of the inmuebles table.
show(inmuebles)

In [None]:
# Print the shape and preview the first rows of the docentes table.
show(docentes)
# Display the fraction of null values per column, sorted in descending order.
show_null_percentages(docentes)


In [None]:

# process labs: run the table through ordinal_mapper, one_hot_encoder and
# inputer in that order; every step is re-fitted on the frame it receives.
for step in (ordinal_mapper, one_hot_encoder, inputer):
    labs = step.fit_transform(labs)

# process docentes: same sequence, preceded by high_na_perc_dropper.
for step in (high_na_perc_dropper, ordinal_mapper, one_hot_encoder, inputer):
    docentes = step.fit_transform(docentes)

In [None]:
# Output feature names of the imputer as of its most recent fit
# (the docentes table when the cells are run top to bottom).
inputer.get_feature_names_out()

In [None]:
# Preview each table in turn: shape first, then the leading rows.
for frame in (inmuebles, labs, docentes, df):
    show(frame)

In [None]:
# Distinct values of the sede identifier column in each table.
# NOTE(review): docentes was passed through one_hot_encoder earlier in the
# notebook; confirm an 'idSede' column still exists at this point.
X = {*df['Cód. Sede']}
Y = {*docentes['idSede']}

In [None]:
# Sizes of: X, Y, their intersection, values only in X, values only in Y.
tuple(map(len, (X, Y, X & Y, X - Y, Y - X)))

In [None]:
# Fraction of null values per column in inmuebles, highest first.
show_null_percentages(inmuebles)

In [None]:
# Fraction of null values per column in labs, highest first.
show_null_percentages(labs)

In [None]:
# Fraction of null values per column in docentes, highest first.
show_null_percentages(docentes)

In [None]:
df[df["Cód. Institución"] == df["id"