## 1. Librerías

In [66]:
import os
from getpass import getpass

from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T

In [23]:
# Create the Spark session for this notebook, or reuse one that is already active.
spark = (
    SparkSession.builder
    .appName("perfilamiento-ml")
    .getOrCreate()
)

# Alternative session configuration for reading from S3 in local mode, kept
# commented out for reference.
# NOTE(review): `jars` is not defined in the visible cells and the key strings
# look like placeholders — confirm before enabling, and load real credentials
# from the environment rather than writing them in the notebook.
# spark = SparkSession.builder \
#     .appName("S3Connection") \
#     .master("local[*]") \
#     .config("spark.jars", jars) \
#     .config('fs.s3a.access.key', "AWS_ACCESS_KEY") \
#     .config('fs.s3a.secret.key', "AWS_SECRET_KEY") \
#     .config('fs.s3a.session.token',"AWS_SESSION_TOKEN") \
#     .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
#     .config("spark.hadoop.fs.s3a.path.style.access", "true") \
#     .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com") \
#     .getOrCreate()

## 2. Datos

In [None]:
!git clone https://<TOKEN>@github.com/mjbernalv/trabajo2-si7006-252-perfilamiento.git

In [None]:
# Read the three CSV extracts from the repository cloned in Colab (GitHub token required).
# No schema is supplied or inferred, so Spark loads every column as a string.
df_clients = spark.read.option("header", True).csv(
    "trabajo2-si7006-252-perfilamiento/Datasets/detalle_cliente.csv"
)
df_poliza = spark.read.option("header", True).csv(
    "trabajo2-si7006-252-perfilamiento/Datasets/detalle_poliza.csv"
)
df_products = spark.read.option("header", True).csv(
    "trabajo2-si7006-252-perfilamiento/Datasets/detalle_producto.csv"
)

In [None]:
# Read the three CSV extracts from the "trusted" prefix of the project S3 bucket.
# This cell binds the same names as the Colab read cell, so run only one of the two.
df_clients = spark.read.csv(
    path="s3://s3-pi-perfilamiento-20252/trusted/detalle_cliente.csv",
    header=True,
)
df_poliza = spark.read.csv(
    path="s3://s3-pi-perfilamiento-20252/trusted/detalle_poliza.csv",
    header=True,
)
df_products = spark.read.csv(
    path="s3://s3-pi-perfilamiento-20252/trusted/detalle_producto.csv",
    header=True,
)

**Variables a utilizar:**

Cliente:
1. codCliente
2. Edad
3. Tipo Empresa

Póliza:
1. codPoliza
2. formaPagoVigencia
3. valorTotal
4. estado
5. Vigencia

Producto:
1. codProducto
2. nomRamo

In [83]:
# Keep only clients flagged as policy holders with a positive age, then project
# the columns used downstream. The filter runs BEFORE the select because
# "tomador" is not among the projected columns; filtering after the projection
# only worked through a column reference bound to the pre-select DataFrame.
# Note: this cell rebinds df_clients, so re-running it without re-reading the
# CSVs will fail (the "tomador" column is gone after the first run).
df_clients = (
    df_clients
    .filter((F.col("tomador") == "Tiene poliza") & (F.col("Edad") > 0))
    .select("codCliente", "Edad", "Tipo Empresa")
)
df_clients.show(5)

+----------+----+------------+
|codCliente|Edad|Tipo Empresa|
+----------+----+------------+
|    154978|  55|   P Natural|
|    154980|  34|   P Natural|
|    154985|   8|   P Natural|
|    153109|  44|   P Natural|
|    193347|  40|   P Natural|
+----------+----+------------+
only showing top 5 rows



In [84]:
# Project the policy attributes used for profiling plus the two join keys
# (codCliente, codProducto), and drop policies whose total value is zero.
df_poliza = (
    df_poliza
    .select(
        "codPoliza",
        "formaPagoVigencia",
        "valorTotal",
        "estado",
        "Vigencia",
        "codCliente",
        "codProducto",
    )
    .where(F.col("valorTotal") != 0)
)
df_poliza.show(5)

+---------+-----------------+----------+-------+--------+----------+-----------+
|codPoliza|formaPagoVigencia|valorTotal| estado|Vigencia|codCliente|codProducto|
+---------+-----------------+----------+-------+--------+----------+-----------+
|   199655|          Mensual|   7656000|Vigente|   Otros|    155010|       1145|
|   199655|          Mensual|   6657000|Vigente|   Otros|    155010|       1145|
|   200020|            Anual|   3420768|Vigente|   Otros|    128350|        641|
|   199634|          Mensual|   1063116|Vigente|  Actual|    155021|        776|
|   199631|            Anual|  44336381|Vigente|  Actual|    153722|        776|
+---------+-----------------+----------+-------+--------+----------+-----------+
only showing top 5 rows



In [85]:
# Only the product key and its line of business (nomRamo) are needed downstream.
df_products = df_products.select(F.col("codProducto"), F.col("nomRamo"))
df_products.show(n=5)

+-----------+--------------------+
|codProducto|             nomRamo|
+-----------+--------------------+
|         17|Plan Complementar...|
|         20|Plan Complementar...|
|         64|     Vida Individual|
|         11|      Salud Familiar|
|         83|             Masvida|
+-----------+--------------------+
only showing top 5 rows



In [86]:
# Build the modelling table: clients -> policies (by codCliente) -> products
# (by codProducto). Both joins are inner joins, so unmatched rows are dropped.
client_policies = df_clients.join(df_poliza, on="codCliente", how="inner")
df_total = client_policies.join(df_products, on="codProducto", how="inner")
df_total.show(20)

+-----------+----------+----+------------+---------+-----------------+----------+-------+--------+-------------------+
|codProducto|codCliente|Edad|Tipo Empresa|codPoliza|formaPagoVigencia|valorTotal| estado|Vigencia|            nomRamo|
+-----------+----------+----+------------+---------+-----------------+----------+-------+--------+-------------------+
|         34|     75602|  43|   P Natural|    32239|            Anual|   4948480|Vigente|  Actual|Seguro de Educacion|
|        601|     59722|  53|   P Natural|   188255|            Anual|    355500|Vigente|  Actual|               Soat|
|         32|    163281|  47|   P Natural|   163627|            Anual|   3729847|Vigente|  Actual|            Masvida|
|         32|    163281|  47|   P Natural|   163627|            Anual|   3222769|Vigente|Anterior|            Masvida|
|        165|    208072|  37|   P Natural|   183652|            Anual|   1116800|Vigente|  Actual|               Soat|
|        165|    208246|  72|   P Natural|   183

In [97]:
# Feature preparation for clustering: cast numeric columns, index and one-hot
# encode categorical columns, assemble everything into one vector and scale it.
numeric_cols = ["Edad", "valorTotal"]
categorical_cols = ["Tipo Empresa", "formaPagoVigencia", "estado", "Vigencia", "nomRamo"]

# The CSVs were read without a schema, so the numeric columns arrive as strings.
# valorTotal is cast to double rather than float: a 32-bit float cannot represent
# integers above 2**24 exactly (values such as 44336381 would be rounded), and
# MLlib vectors store doubles anyway.
df_total = (
    df_total
    .withColumn("valorTotal", F.col("valorTotal").cast("double"))
    .withColumn("Edad", F.col("Edad").cast("int"))
)

# Categorical handling, step 1: map each category to a numeric index.
indexers = [
    StringIndexer(
        inputCol=c,
        outputCol=f"{c}_idx",
        handleInvalid="keep"        # invalid/unseen labels go to an extra index
    ).setStringOrderType("frequencyDesc")  # most frequent label gets index 0
    for c in categorical_cols
]

# Categorical handling, step 2: one-hot encode each index column.
encoders = [
    OneHotEncoder(
        inputCol=f"{c}_idx",
        outputCol=f"{c}_oh"
    )
    for c in categorical_cols
]

# Assemble numeric columns and one-hot vectors into a single feature vector.
assembler = VectorAssembler(
    inputCols=numeric_cols + [f"{c}_oh" for c in categorical_cols],
    outputCol="features"
)

# Standardise features (zero mean, unit variance) so that no single column
# dominates the distance computation in K-Means.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=True,
    withStd=True
)

# Full preprocessing pipeline: index -> one-hot -> assemble -> scale.
pipeline = Pipeline(stages=indexers + encoders + [assembler, scaler])

model_prep = pipeline.fit(df_total)
df_transform = model_prep.transform(df_total)

In [98]:
# Preview the transformed frame, including the index, one-hot and feature-vector columns.
df_transform.show(n=5)

+-----------+----------+----+------------+---------+-----------------+----------+-------+--------+-------------------+----------------+---------------------+----------+------------+-----------+---------------+--------------------+-------------+-------------+---------------+--------------------+--------------------+
|codProducto|codCliente|Edad|Tipo Empresa|codPoliza|formaPagoVigencia|valorTotal| estado|Vigencia|            nomRamo|Tipo Empresa_idx|formaPagoVigencia_idx|estado_idx|Vigencia_idx|nomRamo_idx|Tipo Empresa_oh|formaPagoVigencia_oh|    estado_oh|  Vigencia_oh|     nomRamo_oh|            features|      scaledFeatures|
+-----------+----------+----+------------+---------+-----------------+----------+-------+--------+-------------------+----------------+---------------------+----------+------------+-----------+---------------+--------------------+-------------+-------------+---------------+--------------------+--------------------+
|         34|     75602|  43|   P Natural|    322

In [None]:
# Write df_total as Parquet to the "refined" prefix in S3, replacing any existing output.
df_total.write.parquet(
    "s3://s3-pi-perfilamiento-20252/refined/ml_data",
    mode="overwrite",
)

## 3. Modelo

In [101]:
# Number of clusters for K-Means (tunable).
k = 100

# Fit K-Means on the scaled feature vector with a fixed seed for reproducibility,
# then attach each row's cluster id in the "prediction" column.
kmeans = KMeans(
    k=k,
    seed=42,
    featuresCol="scaledFeatures",
    predictionCol="prediction",
)
kmeans_model = kmeans.fit(df_transform)
df_clusters = kmeans_model.transform(df_transform)

# NOTE(review): each row is a policy, not a client, so the same codCliente can
# be assigned to more than one cluster — confirm this is the intended grain.
cluster_preview = df_clusters.select("codCliente", "prediction")
cluster_preview.show(n=20, truncate=False)

+----------+----------+
|codCliente|prediction|
+----------+----------+
|75602     |87        |
|59722     |71        |
|163281    |59        |
|163281    |59        |
|208072    |71        |
|208246    |71        |
|59722     |17        |
|59722     |17        |
|89669     |59        |
|220725    |59        |
|89555     |59        |
|72472     |59        |
|193486    |59        |
|100553    |59        |
|100553    |59        |
|72472     |59        |
|193486    |59        |
|100553    |59        |
|112261    |59        |
|193685    |87        |
+----------+----------+
only showing top 20 rows



In [102]:
# Evaluate cluster quality with the silhouette metric (squared Euclidean distance)
# computed on the same scaled features the model was trained on.
evaluator = (
    ClusteringEvaluator()
    .setFeaturesCol("scaledFeatures")        # must match the model's feature column
    .setPredictionCol("prediction")          # cluster assignment column
    .setMetricName("silhouette")
    .setDistanceMeasure("squaredEuclidean")  # alternative: "cosine"
)

silhouette = evaluator.evaluate(df_clusters)
print(f"Silhouette score = {silhouette:.4f}")

Silhouette score = 0.3187
