In [1]:
from pathlib import Path
from pprint import pprint
from typing import Iterable, Optional

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.impute import KNNImputer
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from pipeline import (
    DropColumns,
    DropHighNAPercentage,
    NormalizeCurrency,
    OrdinalColumnMapper,
    DataframeOneHotEncoder,
    NanInputer,
    InfoDisplayer,
)
from utils import (
    show,
    show_null_percentages,
    count_frequent_values,
)

pd.set_option("display.max_columns", None)

In [2]:
# Filesystem layout. The project root is taken to be the parent of the
# kernel's current working directory, and datasets live in <root>/data.
ROOT_DIR = Path.cwd().parents[0]
DATA_DIR = ROOT_DIR.joinpath("data")
# Fail early if the notebook was started from an unexpected directory.
assert DATA_DIR.exists()

DATA_PATH = DATA_DIR.joinpath("base_indices_2005-2023.csv")

In [3]:
# Load the cached base frame and narrow it down in one chain:
#   1. move the stored index back into regular columns,
#   2. keep only rows that have an average math/language score,
#   3. keep only "Pregrado" rows and drop the now-constant column,
#   4. replace the "Año" datetime column with its integer year.
# NOTE: joblib.load unpickles the file, so only load trusted artifacts.
df = (
    joblib.load(DATA_DIR / "base_indices.pkl")
    .reset_index()
    .loc[
        lambda frame: frame[
            "Promedio Puntaje (promedio matemáticas y lenguaje)"
        ].notna()
    ]
    .reset_index(drop=True)
    .loc[lambda frame: frame["Pregrado/Posgrado"] == "Pregrado"]
    .drop(columns=["Pregrado/Posgrado"])
    .reset_index(drop=True)
    .assign(**{"Año": lambda frame: frame["Año"].dt.year})
)

In [4]:
# Load the auxiliary pickle, which unpacks into exactly three objects.
# NOTE(review): none of the three is used in the cells visible here; their
# contents are presumably property / lab / teaching-staff tables - confirm.
# NOTE: joblib.load unpickles the file, so only load trusted artifacts.
inmuebles, labs, docentes = joblib.load(DATA_DIR / 'extra_data.pkl')


In [5]:
# nulls = df.isnull().sum() / len(df) * 100
# nulls.sort_values(ascending=False)

In [6]:
# DROP COLUMNS
# Column names handed to the DropColumns pipeline step.
COLUMNS_TO_DROP = [
    # dropped because they are not relevant
    "Nombre de la Sede",
    "Orden Geográfico de la Región (Norte aSur)",
    "Mención o Especialidad",
    "idgenerocarrera",
    "Códgo SIES",
    "Máximo Puntaje (promedio matemáticas y lenguaje)",
    "Máximo Puntaje NEM",
    "Máximo Puntaje Ranking",
    "Mínimo Puntaje (promedio matemáticas y lenguaje)",
    "Mínimo Puntaje NEM",
    "Mínimo Puntaje Ranking",
    # dropped as a data-mining design decision
    "Grado Académico",
    "Cód. Institución",
    "Nombre Region",
    "Carrera Genérica",
    "Cód. Carrera",
    "Nombre Programa",
    "Nombre del Campus",
    "Título",
]

# DROP HIGH NA COLUMNS
# Passed as `exclude` to the DropHighNAPercentage pipeline step.
# NOTE(review): "Nombre del Campus" is also listed in COLUMNS_TO_DROP and the
# drop_columns step runs before drop_high_na, so this exclusion may have no
# effect - confirm whether the column should be kept or the entry removed.
EXCLUDE_COLUMNS_OF_DROPHIGHNA = ["Nombre del Campus"]

# NORMALIZE CURRENCY
# Columns handed to the NormalizeCurrency pipeline step.
CURRENCY_COLUMNS = [
    "Valor de matrícula",
    "Valor de arancel",
    "Valor del Título",
]

# ORDINAL ENCODER
# Each dictionary below maps the raw category labels of one column to an
# integer code. Several dictionaries deliberately map two labels to the same
# code (e.g. both CFT variants in m_class1 share code 3).

# Codes for "Clasificación1".
m_class1 = {
    "(a) Universidades CRUCH": 0,
    "(b) Universidades Privadas": 1,
    "(c) Institutos Profesionales": 2,
    "(d) Centros de Formación Técnica": 3,
    "(e) Centros de Formación Técnica Estatales": 3,
    "(f) F.F.A.A.": 4,
}

# Codes for "Clasificación2".
# NOTE(review): the "(g)" label ends in "statales" while m_class1 spells the
# same word "Estatales"; verify against the actual values in the data, since
# a label that does not match exactly will presumably not be mapped.
m_class2 = {
    "(a) Universidades Estatales CRUCH": 0,
    "(b) Universidades Privadas CRUCH": 1,
    "(c) Univ. Privadas Adscritas SUA": 2,
    "(d) Universidades Privadas": 3,
    "(e) Institutos Profesionales": 4,
    "(f) Centros de Formación Técnica": 5,
    "(g) Centros de Formación Técnica statales": 5,
    "(h) F.F.A.A.": 6,
}

# Codes for "Clasificación3".
m_class3 = {"(a) Acreditada": 0, "(b) No Acreditada": 1}

# Codes for "Clasificación4".
# NOTE(review): two labels carry the "(e)" prefix ("F.F.A.A." and "Cerrada");
# confirm that both appear with exactly this spelling in the data.
m_class4 = {
    "(a) Autónoma": 0,
    "(b) Licenciamiento": 1,
    "(c) Examinación": 2,
    "(d) Supervisión": 3,
    "(e) F.F.A.A.": 4,
    "(e) Cerrada": 5,
}

# Codes for "Clasificación5".
m_class5 = {"(a) Adscritas a Gratuidad": 0, "(b) No Adscritas/No Aplica": 1}

# Codes for "Clasificación6".
m_class6 = {
    "(a) Subsistema Universitario": 0,
    "(b) Subsistema Técnico Profesional": 1,
    "(c) No adscrito": 2,
    "(d) F.F.A.A.": 3,
}

# Codes for "Tipo Institución".
m_inst = {
    "Univ.": 0,
    "I.P.": 1,
    "C.F.T.": 2,
    "F.F.A.A.": 3,
}

# Codes for "Tipo Programa".
m_tipo_programa = {"Programa Regular": 0, "Programa Especial": 1}

# Codes for "Tipo Carrera". The entries are not listed in code order, and
# "Bachillerato" and "Plan Común o Ciclo Básico" share code 4.
m_tipo_carrera = {
    "Profesional con Licenciatura": 0,
    "Técnico Nivel Superior": 3,
    "Profesional": 1,
    "Licenciatura": 2,
    "Bachillerato": 4,
    "Plan Común o Ciclo Básico": 4,
}

# Codes for "IngresoDirecto".
m_tipo_ingreso = {"Ingreso Directo": 0, "No es Ingreso Directo": 1}

# Columns handled by the ordinal mapper: the six "Clasificación<N>" columns
# first, followed by the remaining categorical descriptors.
COLUMNS_TO_MAP = [
    *(f"Clasificación{number}" for number in range(1, 7)),
    "Tipo Institución",
    "Tipo Programa",
    "Tipo Carrera",
    "IngresoDirecto",
]
# Mapping dictionaries listed in the same order as COLUMNS_TO_MAP (ten
# entries each). Presumably OrdinalColumnMapper pairs them by position, so
# the two lists must be kept aligned - confirm against its implementation.
MAPPINGS = [
    m_class1,
    m_class2,
    m_class3,
    m_class4,
    m_class5,
    m_class6,
    m_inst,
    m_tipo_programa,
    m_tipo_carrera,
    m_tipo_ingreso,
]

# ONE HOT ENCODER
# Nominal (unordered) columns handed to the DataframeOneHotEncoder step.
COLUMNS_TO_ONE_HOT = [
    "Nombre Institución",
    "Comuna donde se imparte la carrera o programa",
    "Area Conocimiento",
    "Horario",
]

In [9]:
# Preprocessing pipeline. Every transforming stage is followed by an
# InfoDisplayer step labelled with the name of the stage it comes after.
data_pipeline = Pipeline(
    verbose=True,
    steps=[
        ("info_displayer1", InfoDisplayer(name="Original Data")),
        # Stage 1: remove the columns listed in COLUMNS_TO_DROP.
        ("drop_columns", DropColumns(columns=COLUMNS_TO_DROP)),
        ("info_displayer2", InfoDisplayer(name="After Drop Columns")),
        # Stage 2: drop columns by missing-value share (threshold 0.24).
        (
            "drop_high_na",
            DropHighNAPercentage(
                exclude=EXCLUDE_COLUMNS_OF_DROPHIGHNA,
                na_threshold=0.24,
            ),
        ),
        ("info_displayer3", InfoDisplayer(name="After Drop High NA")),
        # Stage 3: normalise the monetary columns.
        ("preprocess_tipo_moneda", NormalizeCurrency(columns=CURRENCY_COLUMNS)),
        ("info_displayer4", InfoDisplayer(name="After Normalize Currency")),
        # Stage 4: replace ordered category labels with integer codes.
        (
            "ordinal_encoder",
            OrdinalColumnMapper(mappings=MAPPINGS, columns=COLUMNS_TO_MAP),
        ),
        ("info_displayer5", InfoDisplayer(name="After Ordinal Encoder")),
        # Stage 5: one-hot encode the nominal columns (min_frequency=20).
        (
            "one_hot_encoder",
            DataframeOneHotEncoder(
                min_frequency=20,
                columns=COLUMNS_TO_ONE_HOT,
                # max_categories=30,
            ),
        ),
        ("info_displayer6", InfoDisplayer(name="After One Hot Encoder")),
        # Stage 6: fill the remaining missing values (5 neighbours).
        ("nan_inputer", NanInputer(columns="auto", n_neighbors=5)),
        ("info_displayer7", InfoDisplayer(name="After Nan Inputer")),
    ],
)
data_pipeline

In [10]:
# Fit every pipeline stage on the filtered frame and keep the transformed
# result for the later cells.
processed_df = data_pipeline.fit_transform(df)

[Original Data] shape: (39726, 59)
[Pipeline] .. (step 1 of 13) Processing info_displayer1, total=   0.0s
[Pipeline] ..... (step 2 of 13) Processing drop_columns, total=   0.0s
[After Drop Columns] shape: (39726, 40)
[Pipeline] .. (step 3 of 13) Processing info_displayer2, total=   0.0s
[Pipeline] ..... (step 4 of 13) Processing drop_high_na, total=   0.0s
[After Drop High NA] shape: (39726, 34)
[Pipeline] .. (step 5 of 13) Processing info_displayer3, total=   0.0s
[Pipeline]  (step 6 of 13) Processing preprocess_tipo_moneda, total=   1.3s
[After Normalize Currency] shape: (39726, 33)
[Pipeline] .. (step 7 of 13) Processing info_displayer4, total=   0.0s
[Pipeline] .. (step 8 of 13) Processing ordinal_encoder, total=   0.1s
[After Ordinal Encoder] shape: (39726, 33)
[Pipeline] .. (step 9 of 13) Processing info_displayer5, total=   0.0s
[Pipeline] . (step 10 of 13) Processing one_hot_encoder, total=   0.3s
[After One Hot Encoder] shape: (39726, 161)
[Pipeline] . (step 11 of 13) Processi

In [11]:
# Persist the preprocessed frame next to the other datasets in DATA_DIR.
joblib.dump(processed_df, DATA_DIR / "processed_df.pkl")

['/home/pabloskewes/FCFM/Minería de Datos/proyecto-mineria-de-datos/data/processed_df.pkl']

In [12]:
# Display the preprocessed frame as the cell output.
processed_df

Unnamed: 0,Año,Tipo Institución,Clasificación1,Clasificación2,Clasificación3,Clasificación4,Clasificación5,Clasificación6,Tipo Programa,Tipo Carrera,IngresoDirecto,Año Inicio Actividades,Duración (en semestres),Cód. Sede,Promedio Puntaje (promedio matemáticas y lenguaje),Puntaje de corte (primer seleccionado),Puntaje de corte (promedio de la carrera),Puntaje de corte (último seleccionado),Nº Alumnos Ingreso Via PSU o PDT,Valor de matrícula,Valor de arancel,Valor del Título,Vacantes,Matrícula primer año hombres,Matrícula primer año mujeres,Matrícula Primer Año,Matrícula total hombres,Matrícula total mujeres,Matrícula Total,Nombre Institución_C.F.T. DIEGO PORTALES,Nombre Institución_C.F.T. JOHN F. KENNEDY,Nombre Institución_C.F.T. SANTO TOMÁS,Nombre Institución_C.F.T. SIMÓN BOLIVAR,Nombre Institución_ESCUELA DE AVIACIÓN,Nombre Institución_I.P. AIEP,Nombre Institución_I.P. DEL VALLE CENTRAL,Nombre Institución_I.P. DIEGO PORTALES,Nombre Institución_I.P. ESCUELA MODERNA DE MÚSICA,Nombre Institución_I.P. GUILLERMO SUBERCASEAUX,Nombre Institución_I.P. LOS LEONES,Nombre Institución_I.P. SANTO TOMÁS,Nombre Institución_PONTIFICIA U. CATÓLICA DE CHILE,Nombre Institución_PONTIFICIA U. CATÓLICA DE VALPARAÍSO,Nombre Institución_U. ACADEMIA DE HUMANISMO CRISTIANO,Nombre Institución_U. ADOLFO IBÁÑEZ,Nombre Institución_U. ADVENTISTA DE CHILE,Nombre Institución_U. ALBERTO HURTADO,Nombre Institución_U. ANDRÉS BELLO,Nombre Institución_U. ARTURO PRAT,Nombre Institución_U. AUSTRAL DE CHILE,Nombre Institución_U. AUTÓNOMA DE CHILE,Nombre Institución_U. BERNARDO O`HIGGINS,Nombre Institución_U. BOLIVARIANA,Nombre Institución_U. CATÓLICA CARDENAL RAÚL SILVA HENRÍQUEZ,Nombre Institución_U. CATÓLICA DE LA SANTÍSIMA CONCEPCIÓN,Nombre Institución_U. CATÓLICA DE TEMUCO,Nombre Institución_U. CATÓLICA DEL MAULE,Nombre Institución_U. CATÓLICA DEL NORTE,Nombre Institución_U. CENTRAL DE CHILE,Nombre Institución_U. CHILENO-BRITÁNICA DE CULTURA,Nombre Institución_U. DE ANTOFAGASTA,Nombre Institución_U. 
DE ARTE Y CIENCIAS SOCIALES ARCIS,Nombre Institución_U. DE ATACAMA,Nombre Institución_U. DE AYSEN,Nombre Institución_U. DE CHILE,Nombre Institución_U. DE CONCEPCIÓN,Nombre Institución_U. DE LA FRONTERA,Nombre Institución_U. DE LA SERENA,Nombre Institución_U. DE LAS AMÉRICAS,Nombre Institución_U. DE LOS ANDES,Nombre Institución_U. DE LOS LAGOS,Nombre Institución_U. DE MAGALLANES,Nombre Institución_U. DE O`HIGGINS,Nombre Institución_U. DE PLAYA ANCHA DE CIENCIAS DE LA EDUCACIÓN,Nombre Institución_U. DE SANTIAGO DE CHILE,Nombre Institución_U. DE TALCA,Nombre Institución_U. DE TARAPACÁ,Nombre Institución_U. DE VALPARAÍSO,Nombre Institución_U. DE VIÑA DEL MAR,Nombre Institución_U. DEL ALBA,Nombre Institución_U. DEL BÍO-BÍO,Nombre Institución_U. DEL DESARROLLO,Nombre Institución_U. DEL MAR,Nombre Institución_U. DEL PACÍFICO,Nombre Institución_U. DIEGO PORTALES,Nombre Institución_U. FINIS TERRAE,Nombre Institución_U. GABRIELA MISTRAL,Nombre Institución_U. IBEROAMERICANA DE CIENCIAS Y TECNOLOGÍA,Nombre Institución_U. LA REPÚBLICA,Nombre Institución_U. MARÍTIMA DE CHILE,Nombre Institución_U. MAYOR,Nombre Institución_U. METROPOLITANA DE CIENCIAS DE LA EDUCACIÓN,Nombre Institución_U. SAN SEBASTIÁN,Nombre Institución_U. SANTO TOMÁS,Nombre Institución_U. SEK,Nombre Institución_U. TECNOLÓGICA METROPOLITANA,Nombre Institución_U. TÉCNICA FEDERICO SANTA MARÍA,Nombre Institución_U. 
UCINF,Nombre Institución_infrequent_sklearn,Comuna donde se imparte la carrera o programa_Antofagasta,Comuna donde se imparte la carrera o programa_Arica,Comuna donde se imparte la carrera o programa_Calama,Comuna donde se imparte la carrera o programa_Chillán,Comuna donde se imparte la carrera o programa_Concepción,Comuna donde se imparte la carrera o programa_Copiapó,Comuna donde se imparte la carrera o programa_Coquimbo,Comuna donde se imparte la carrera o programa_Coyhaique,Comuna donde se imparte la carrera o programa_Curicó,Comuna donde se imparte la carrera o programa_El Bosque,Comuna donde se imparte la carrera o programa_Estación Central,Comuna donde se imparte la carrera o programa_Huechuraba,Comuna donde se imparte la carrera o programa_Iquique,Comuna donde se imparte la carrera o programa_La Florida,Comuna donde se imparte la carrera o programa_La Pintana,Comuna donde se imparte la carrera o programa_La Serena,Comuna donde se imparte la carrera o programa_Las Condes,Comuna donde se imparte la carrera o programa_Linares,Comuna donde se imparte la carrera o programa_Los Ángeles,Comuna donde se imparte la carrera o programa_Macul,Comuna donde se imparte la carrera o programa_Maipú,Comuna donde se imparte la carrera o programa_Melipilla,Comuna donde se imparte la carrera o programa_Osorno,Comuna donde se imparte la carrera o programa_Ovalle,Comuna donde se imparte la carrera o programa_Peñalolén,Comuna donde se imparte la carrera o programa_Providencia,Comuna donde se imparte la carrera o programa_Puerto Montt,Comuna donde se imparte la carrera o programa_Punta Arenas,Comuna donde se imparte la carrera o programa_Quillota,Comuna donde se imparte la carrera o programa_Quilpué,Comuna donde se imparte la carrera o programa_Rancagua,Comuna donde se imparte la carrera o programa_Recoleta,Comuna donde se imparte la carrera o programa_San Felipe,Comuna donde se imparte la carrera o programa_San Fernando,Comuna donde se imparte la carrera o programa_San 
Joaquín,Comuna donde se imparte la carrera o programa_San Miguel,Comuna donde se imparte la carrera o programa_Santiago,Comuna donde se imparte la carrera o programa_Talca,Comuna donde se imparte la carrera o programa_Talcahuano,Comuna donde se imparte la carrera o programa_Temuco,Comuna donde se imparte la carrera o programa_Valdivia,Comuna donde se imparte la carrera o programa_Valparaíso,Comuna donde se imparte la carrera o programa_Victoria,Comuna donde se imparte la carrera o programa_Villarrica,Comuna donde se imparte la carrera o programa_Vitacura,Comuna donde se imparte la carrera o programa_Viña del Mar,Comuna donde se imparte la carrera o programa_Ñuñoa,Comuna donde se imparte la carrera o programa_infrequent_sklearn,Area Conocimiento_Administración y Comercio,"Area Conocimiento_Agricultura, Silvicultura, Pesca y Veterinaria",Area Conocimiento_Arte y Arquitectura,Area Conocimiento_Ciencias,Area Conocimiento_Ciencias Sociales,Area Conocimiento_Derecho,Area Conocimiento_Educación,"Area Conocimiento_FFAA, Orden y Seguridad",Area Conocimiento_Humanidades,Area Conocimiento_Salud,Area Conocimiento_Tecnología,Horario_Diurno,Horario_Otro,Horario_Vespertino
0,2023.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1954.0,11.0,1001001.0,702.0,919.0,758.0,708.0,129.0,177300.0,4861900.0,121000.0,130.0,68.0,82.0,150.0,302.0,392.0,694.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2023.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013.0,10.0,1001001.0,766.0,885.0,750.0,704.0,84.0,177300.0,6137900.0,121000.0,70.0,54.0,45.0,99.0,191.0,205.0,396.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,2023.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1924.0,10.0,1001001.0,863.0,963.0,838.0,803.0,419.0,177300.0,6844100.0,121000.0,380.0,353.0,234.0,587.0,1535.0,1126.0,2661.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2023.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013.0,10.0,1001001.0,771.0,881.0,752.0,701.0,172.0,177300.0,6137900.0,121000.0,140.0,101.0,86.0,187.0,395.0,358.0,753.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2023.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1915.0,10.0,1001001.0,649.0,830.0,674.0,603.0,129.0,177300.0,5681600.0,121000.0,130.0,76.0,56.0,132.0,378.0,268.0,646.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39721,2005.0,2.0,3.0,5.0,1.0,0.0,1.0,2.0,0.0,3.0,0.0,2005.0,7.0,3025001.0,470.0,703.0,542.0,450.0,1.0,47000.0,930000.0,232500.0,50.0,10.0,3.0,13.0,10.0,3.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
39722,2005.0,2.0,3.0,5.0,1.0,0.0,1.0,2.0,0.0,3.0,0.0,2002.0,7.0,3025001.0,580.0,703.0,542.0,450.0,2.0,47000.0,930000.0,232500.0,50.0,9.0,0.0,9.0,55.0,0.0,55.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
39723,2005.0,2.0,3.0,5.0,1.0,0.0,1.0,2.0,0.0,3.0,0.0,1990.0,7.0,3025001.0,510.0,703.0,542.0,450.0,3.0,47000.0,930000.0,232500.0,50.0,6.0,0.0,6.0,19.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
39724,2005.0,2.0,3.0,5.0,1.0,0.0,1.0,2.0,0.0,3.0,0.0,1990.0,7.0,3025001.0,562.0,703.0,542.0,450.0,2.0,47000.0,930000.0,232500.0,50.0,14.0,4.0,18.0,77.0,11.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [None]:
def compute_kmeans(
    df: pd.DataFrame,
    cluster_range: Iterable[int],
    random_state: int = 42,
    silhouette_sample_size: Optional[int] = None,
) -> pd.DataFrame:
    """
    Perform k-means clustering for a range of k values and collect metrics.

    Args:
        df: Dataframe to cluster. KMeans needs purely numeric input without
            missing values, so pass the preprocessed frame.
        cluster_range: Values of k to try. Any iterable is accepted,
            including one-shot iterators. Every k must be at least 2,
            because the silhouette score needs two or more clusters.
        random_state: Seed forwarded to KMeans, and to the silhouette
            subsampling when it is enabled, so runs are reproducible.
        silhouette_sample_size: If given, the silhouette score is computed
            on a random subset of this many rows instead of on every row.
            The full computation compares all pairs of rows, which gets
            expensive on large frames. None (the default) uses all rows.

    Returns:
        DataFrame indexed by k, one row per value tried, with the columns
        "inertia" and "silhouette".

    Raises:
        ValueError: If any requested k is smaller than 2.
    """
    # Materialise once: the values are needed both for the loop and for the
    # index of the result, and a generator could only be consumed once.
    k_values = list(cluster_range)

    # Reject invalid k up front instead of failing after a wasted fit.
    too_small = [k for k in k_values if k < 2]
    if too_small:
        raise ValueError(
            f"Every k must be >= 2 to compute a silhouette score, got {too_small}"
        )

    inertia = []
    silhouette = []
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=random_state).fit(df)
        inertia.append(kmeans.inertia_)
        silhouette.append(
            silhouette_score(
                df,
                kmeans.labels_,
                sample_size=silhouette_sample_size,
                random_state=random_state,
            )
        )

    return pd.DataFrame(
        {"inertia": inertia, "silhouette": silhouette},
        index=k_values,
    )

In [None]:
def plot_kmean_results(k_range, inertia_values, silhouette_scores):
    """
    Draw the elbow and silhouette diagnostics side by side.

    Args:
        k_range: The k values, used as the x axis of both panels.
        inertia_values: Inertia for each k (left panel).
        silhouette_scores: Silhouette score for each k (right panel).
    """
    _, panels = plt.subplots(1, 2, figsize=(15, 5))

    # One entry per panel, left to right: (y values, y-axis label, title).
    curves = (
        (inertia_values, "Inercia", "Método del codo"),
        (silhouette_scores, "Silhouette", "Método de la silueta"),
    )
    for panel, (values, y_label, title) in zip(panels, curves):
        panel.plot(k_range, values, marker="o")
        panel.set_xlabel("Número de clusters")
        panel.set_ylabel(y_label)
        panel.set_title(title)

    plt.show()

In [None]:
# Cluster the preprocessed frame, not the raw `df`: the raw frame still holds
# text columns and missing values, which KMeans cannot consume.
# NOTE(review): the features are not standardised before clustering although
# StandardScaler is imported; large-valued columns (fees, site codes) will
# dominate the Euclidean distance - confirm whether scaling was intended.
kmeans_df = compute_kmeans(processed_df, range(2, 21))