# <center> Modelo de ML </center>

## Importar las librerías

In [None]:
from google.colab import drive
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Cargar los Datos

In [None]:
drive.mount('/content/drive')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- codcliente: long (nullable = true)
 |-- codpoliza: long (nullable = true)
 |-- ramo: string (nullable = true)
 |-- y: string (nullable = true)
 |-- salud: long (nullable = true)
 |-- vida: long (nullable = true)
 |-- autos: long (nullable = true)
 |-- cumplimiento: long (nullable = true)
 |-- patrimoniales: long (nullable = true)
 |-- otros: long (nullable = true)

In [None]:
spark = SparkSession.builder \
    .appName("model-spark") \
    .getOrCreate()

print("Spark iniciado:", spark)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
path_refined = "/content/drive/MyDrive/Dataset PI/refined/refined_data.csv"
df = spark.read.option("header", "true").option("inferSchema", "true").csv(path_refined)

In [10]:
df.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+---------+-------------+-------------+-----+----+-----+------------+-------------+-----+
|codcliente|codpoliza|         ramo|            y|salud|vida|autos|cumplimiento|patrimoniales|otros|
+----------+---------+-------------+-------------+-----+----+-----+------------+-------------+-----+
|     10004|    14919|        Autos|        Salud|    0|   0|    1|           0|            0|    0|
|     10031|   211649|        Autos|Patrimoniales|    0|   0|    1|           0|            0|    0|
|     10038|    15246|        Salud|        Autos|    1|   0|    0|           0|            0|    0|
|    100391|    63849|Patrimoniales|        Autos|    0|   0|    0|           0|            1|    0|
|    100442|    64753| Cumplimiento|        Autos|    0|   0|    0|           1|            0|    0|
+----------+---------+-------------+-------------+-----+----+-----+------------+-------------+-----+
only showing top 5 rows

## División de los datos

In [20]:
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
print("Train df size:", train_df.count())
print("Test df size:", test_df.count())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Train df size: 7506
Test df size: 1847

## Entrenamiento del modelo

In [24]:
feature_cols = ['salud', 'vida', 'autos', 'cumplimiento', 'patrimoniales', 'otros']

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
indexer = StringIndexer(inputCol="y", outputCol="label")

rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50, maxDepth=5, seed=42)

pipeline = Pipeline(stages=[indexer, assembler, rf])

rf_model = pipeline.fit(train_df)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Evaluación del modelo

In [None]:
predictions = rf_model.transform(test_df)

In [32]:
label_indexer_model = rf_model.stages[0]
labels = label_indexer_model.labels

to_pred = IndexToString(inputCol="prediction", outputCol="prediction_label", labels=labels)
to_true = IndexToString(inputCol="label", outputCol="y_label", labels=labels)

pred_named = to_pred.transform(to_true.transform(predictions))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [33]:
cm = (
    pred_named
    .groupBy("y_label")
    .pivot("prediction_label", labels)
    .count()
    .na.fill(0)
    .orderBy("y_label")
)
cm.show(truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+-----+-------------+----+-----+------------+-----+
|y_label      |Salud|Patrimoniales|Vida|Autos|Cumplimiento|Otros|
+-------------+-----+-------------+----+-----+------------+-----+
|Autos        |133  |6            |138 |48   |0           |0    |
|Cumplimiento |22   |1            |71  |1    |0           |0    |
|Otros        |1    |10           |3   |1    |0           |0    |
|Patrimoniales|148  |205          |66  |22   |0           |0    |
|Salud        |474  |12           |51  |2    |0           |0    |
|Vida         |143  |13           |269 |6    |1           |0    |
+-------------+-----+-------------+----+-----+------------+-----+

In [34]:
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy  = evaluator.setMetricName("accuracy").evaluate(predictions)
f1        = evaluator.setMetricName("f1").evaluate(predictions)
wprec     = evaluator.setMetricName("weightedPrecision").evaluate(predictions)
wrec      = evaluator.setMetricName("weightedRecall").evaluate(predictions)
print(f"Accuracy: {accuracy:.4f} | F1 (weighted): {f1:.4f} | "
      f"Precision (weighted): {wprec:.4f} | Recall (weighted): {wrec:.4f}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Accuracy: 0.5393 | F1 (weighted): 0.4957 | Precision (weighted): 0.5591 | Recall (weighted): 0.5393