In [None]:
from pathlib import Path
from pprint import pprint
from typing import Iterable

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline

from pipeline import (
    DropColumns,
    DropHighNAPercentage,
    NormalizeCurrency,
    OrdinalColumnMapper,
)

# Show up to 100 columns when rendering wide frames in the notebook.
pd.options.display.max_columns = 100

In [None]:
# Paths are resolved from the kernel's working directory: the project root is
# its parent, and all data files live under <root>/data.
ROOT_DIR = Path.cwd().parent
DATA_DIR = ROOT_DIR.joinpath("data")
assert DATA_DIR.exists()

# Excel workbook holding the source dataset.
DATA_PATH = DATA_DIR.joinpath("base_indices_2005-2023.xlsx")

In [None]:
# One-time conversion of the Excel source to a pickle (uncomment to regenerate the cache):
# df = pd.read_excel(DATA_PATH)
# joblib.dump(df, DATA_DIR / 'base_indices.pkl')

In [None]:
# Load the dataset from its pickle cache. If the cache is missing (e.g. on a
# fresh checkout), build it from the Excel source so the notebook survives
# Restart Kernel -> Run All.
BASE_PICKLE_PATH = DATA_DIR / "base_indices.pkl"
if BASE_PICKLE_PATH.exists():
    # joblib.load unpickles arbitrary objects: only load files produced locally.
    df = joblib.load(BASE_PICKLE_PATH)
else:
    df = pd.read_excel(DATA_PATH)
    joblib.dump(df, BASE_PICKLE_PATH)

# reset_index() without drop=True keeps the previous index as a new column.
# NOTE(review): confirm that extra column is wanted downstream.
df = df.reset_index()

# Preprocesamiento

In [None]:
# Keep only the rows whose level is "Pregrado", then drop the now-constant
# level column. Not idempotent: re-running this cell raises KeyError because
# the column is gone after the first run.
df = (
    df.loc[lambda frame: frame["Pregrado/Posgrado"] == "Pregrado"]
    .drop(columns="Pregrado/Posgrado")
    .reset_index(drop=True)
)

In [None]:
# Percentage of missing values per column, shown from most to least incomplete.
nulls = df.isna().sum().div(len(df)).mul(100)
nulls.sort_values(ascending=False)

In [None]:
# Discard rows that have no average score and renumber the remaining rows.
df = df.dropna(
    subset=["Promedio Puntaje (promedio matemáticas y lenguaje)"]
).reset_index(drop=True)

In [None]:
# Preprocessing pipeline assembled from the project's custom transformers;
# steps run in the order listed.
# NOTE(review): 0.24 is presumably the NA fraction above which a column is
# dropped — confirm against DropHighNAPercentage.
data_pipeline = Pipeline(
    steps=[
        ("drop_columns", DropColumns()),
        ("drop_high_na", DropHighNAPercentage(na_threshold=0.24)),
        ("preprocess_tipo_moneda", NormalizeCurrency()),
        ("ordinal_encoder", OrdinalColumnMapper()),
    ]
)
# Last expression: render the pipeline diagram in the cell output.
data_pipeline

In [None]:
# Fit every pipeline step on the filtered frame and transform it in one call.
processed_df = data_pipeline.fit_transform(df)

In [None]:
# Persist the pipeline output in the data directory for reuse.
joblib.dump(processed_df, DATA_DIR / "processed_df.pkl")

In [None]:
# Inspect the processed rows that have no campus name.
processed_df.loc[processed_df["Nombre del Campus"].isna()]

In [None]:
# Percentage of missing values per column of the processed frame, shown from
# most to least incomplete. The denominator is the processed frame's own row
# count so the percentages stay correct even if the pipeline changes the
# number of rows.
nulls = processed_df.isnull().sum() / len(processed_df) * 100
nulls.sort_values(ascending=False)

In [None]:
# Fee columns of the processed rows whose currency type is 'Uf'.
processed_df.loc[
    processed_df["Tipo Moneda"] == "Uf",
    ["Valor de matrícula", "Valor del Título"],
]

In [None]:
# Look up the same 'Uf' rows in the pre-pipeline frame to compare fee values
# before and after processing. Relies on both frames sharing row labels.
uf_index = processed_df.index[(processed_df["Tipo Moneda"] == "Uf").to_numpy()]
df.loc[uf_index, ["Valor de matrícula", "Valor del Título"]]

In [None]:

# Rows of the pre-pipeline frame whose currency type is exactly 'Uf'.
df.loc[df["Tipo Moneda"].eq("Uf")]

In [None]:
# Display the processed frame using the notebook's rich DataFrame repr.
processed_df

In [None]:
# Summarise the processed frame: per-column dtype and non-null count.
processed_df.info()

In [None]:
# Count rows per distinct 'Tipo Moneda' value in the processed frame.
processed_df['Tipo Moneda'].value_counts()

In [None]:


# NOTE(review): `get_uf_value` is neither defined nor imported in the visible
# cells, so this call raises NameError on a fresh kernel. Presumably it lives
# in the local `pipeline` module — confirm and import it in the first cell.
get_uf_value()


In [None]:
# Pretty-print the processed frame's column names as a plain Python list.
pprint(processed_df.columns.tolist())

In [None]:
def compute_kmeans(
    df: pd.DataFrame,
    cluster_range: Iterable[int],
) -> pd.DataFrame:
    """
    Fit k-means once per candidate k and collect inertia and silhouette score.

    Args:
        df: Data to cluster; passed unchanged to ``KMeans.fit`` and
            ``silhouette_score``.
        cluster_range: Candidate numbers of clusters. Any iterable is accepted,
            including one-shot iterators such as generators.

    Returns:
        DataFrame indexed by k with the columns ``"inertia"`` and
        ``"silhouette"``, one row per candidate k in iteration order.
    """
    # Materialise once: the values are needed both for the loop and for the
    # result index, and a generator would already be exhausted after the loop.
    k_values = list(cluster_range)

    inertia = []
    silhouette = []
    for k in k_values:
        # Fixed random_state keeps the centroid initialisation reproducible.
        kmeans = KMeans(n_clusters=k, random_state=42).fit(df)
        inertia.append(kmeans.inertia_)
        silhouette.append(silhouette_score(df, kmeans.labels_))

    return pd.DataFrame(
        {"inertia": inertia, "silhouette": silhouette},
        index=k_values,
    )

In [None]:
def plot_kmean_results(k_range, inertia_values, silhouette_scores):
    """
    Draw the elbow and silhouette diagnostics side by side.

    Args:
        k_range: The k values that were evaluated (shared x axis).
        inertia_values: Inertia obtained for each k (left panel).
        silhouette_scores: Silhouette score obtained for each k (right panel).
    """
    _, axes = plt.subplots(1, 2, figsize=(15, 5))

    # One entry per panel: (y values, y-axis label, panel title).
    panels = (
        (inertia_values, "Inercia", "Método del codo"),
        (silhouette_scores, "Silhouette", "Método de la silueta"),
    )
    for axis, (values, y_label, title) in zip(axes, panels):
        axis.plot(k_range, values, marker="o")
        axis.set_xlabel("Número de clusters")
        axis.set_ylabel(y_label)
        axis.set_title(title)

    plt.show()

In [None]:
# Evaluate k = 2..20 and keep the inertia / silhouette score for each k.
# NOTE(review): this clusters the filtered `df`, not the pipeline output
# `processed_df`. `df` still holds string columns such as 'Tipo Moneda', which
# KMeans presumably cannot fit — confirm the intended (numeric) input frame.
kmeans_df = compute_kmeans(df, range(2, 21))