# <center> <img src="img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Carrera: Ingeniería en Sistemas Computacionales** </center>
---
### <center> **Primavera 2025** </center>
---

**Lab 10**: Heart attack prediction with Logistic Regression

**Fecha**: 25/04/25

**Nombres de los Estudiantes**: Angel Ramirez, Roberto Osorno, Yochabel Cazares, Samuel Romero

**Profesor**: Pablo Camarillo Ramirez

In [8]:
# Initialise findspark before the pyspark imports in the following cells.
# NOTE(review): findspark.init() presumably locates the local Spark installation
# and makes `pyspark` importable -- confirm against the findspark documentation.
import findspark
findspark.init()

In [9]:
import os

from pyspark.sql import SparkSession

# Master URL of the Spark cluster this notebook connects to.
# It can be overridden through the SPARK_MASTER_URL environment variable (a
# notebook-specific name, not a Spark built-in); the fallback is the URL that
# used to be hard-coded here, so existing setups behave exactly as before.
# NOTE(review): "f04d2745dc57" looks like a container-generated hostname, which
# would change whenever the cluster is recreated -- confirm.
spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://f04d2745dc57:7077")

# Create the session for this lab, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("MLSpark-Logistic-Regression")
    .master(spark_master_url)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Handle on the underlying SparkContext.
sc = spark.sparkContext

# Set the number of partitions used by Spark SQL shuffles to 5.
spark.conf.set("spark.sql.shuffle.partitions", "5")

### Preparación de Datos

In [11]:
from team_name.spark_utils import SparkUtils

# Load the dataset.
# (column name, Spark type) pairs handed to the project's schema helper.
heart_columns = [
    ("male", "integer"),
    ("age", "integer"),
    ("education", "integer"),
    ("currentSmoker", "integer"),
    ("cigsPerDay", "integer"),
    ("BPMeds", "integer"),
    ("prevalentStroke", "integer"),
    ("prevalentHyp", "integer"),
    ("diabetes", "integer"),
    ("totChol", "integer"),
    ("sysBP", "double"),
    ("diaBP", "double"),
    ("BMI", "double"),
    ("heartRate", "integer"),
    ("glucose", "integer"),
    ("TenYearCHD", "integer"),
]
heart_schema = SparkUtils.generate_schema(heart_columns)

# Read the CSV with the explicit schema; the file is read with the header option on.
heart_reader = spark.read.schema(heart_schema).option("header", "true")
heart_df = heart_reader.csv("/home/jovyan/notebooks/data/framingham.csv")

# Quick check: print the schema and the first five rows without truncation.
heart_df.printSchema()
heart_df.show(5, truncate=False)

root
 |-- male: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- education: integer (nullable = true)
 |-- currentSmoker: integer (nullable = true)
 |-- cigsPerDay: integer (nullable = true)
 |-- BPMeds: integer (nullable = true)
 |-- prevalentStroke: integer (nullable = true)
 |-- prevalentHyp: integer (nullable = true)
 |-- diabetes: integer (nullable = true)
 |-- totChol: integer (nullable = true)
 |-- sysBP: double (nullable = true)
 |-- diaBP: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- heartRate: integer (nullable = true)
 |-- glucose: integer (nullable = true)
 |-- TenYearCHD: integer (nullable = true)



                                                                                

+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|male|age|education|currentSmoker|cigsPerDay|BPMeds|prevalentStroke|prevalentHyp|diabetes|totChol|sysBP|diaBP|BMI  |heartRate|glucose|TenYearCHD|
+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|1   |39 |4        |0            |0         |0     |0              |0           |0       |195    |106.0|70.0 |26.97|80       |77     |0         |
|0   |46 |2        |0            |0         |0     |0              |0           |0       |250    |121.0|81.0 |28.73|95       |76     |0         |
|1   |48 |1        |1            |20        |0     |0              |0           |0       |245    |127.5|80.0 |25.34|75       |70     |0         |
|0   |61 |3        |1            |30        |0     |0              |1           |0       |225    |150.0|95.0 |28.58|65      

In [18]:
# Remove null values: keep only the rows of heart_df that have no nulls.
heart_df_clean = heart_df.dropna()

### Ensamblar las características en una sola columna vectorial

In [19]:

from pyspark.ml.feature import VectorAssembler

# Columns packed into the feature vector ("TenYearCHD" is left out: it becomes the label).
input_features = [
    "male",
    "age",
    "education",
    "currentSmoker",
    "cigsPerDay",
    "BPMeds",
    "prevalentStroke",
    "prevalentHyp",
    "diabetes",
    "totChol",
    "sysBP",
    "diaBP",
    "BMI",
    "heartRate",
    "glucose",
]

assembler = VectorAssembler(inputCols=input_features, outputCol="features")

# Step by step: add the "features" vector column, rename the target column to
# "label", then keep only the two columns used in the cells that follow.
assembled_df = assembler.transform(heart_df_clean)
labelled_df = assembled_df.withColumnRenamed("TenYearCHD", "label")
data_with_features = labelled_df.select("label", "features")

### Dividir los datos en conjuntos de entrenamiento y prueba: 80 % de datos de entrenamiento y 20 % de datos de prueba.

In [20]:
# 80 % / 20 % train/test split; an explicit seed is passed to randomSplit.
split_weights = [0.8, 0.2]
train_df, test_df = data_with_features.randomSplit(split_weights, seed=57)

In [21]:
# Print a heading followed by a preview (DataFrame.show()) for the assembled
# dataset and then for the training split.
for heading, frame in (
    ("Original Dataset", data_with_features),
    ("train set", train_df),
):
    print(heading)
    frame.show()

Original Dataset


                                                                                

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[1.0,39.0,4.0,0.0...|
|    0|(15,[1,2,9,10,11,...|
|    0|[1.0,48.0,1.0,1.0...|
|    1|[0.0,61.0,3.0,1.0...|
|    0|[0.0,46.0,3.0,1.0...|
|    0|[0.0,43.0,2.0,0.0...|
|    1|(15,[1,2,9,10,11,...|
|    0|[0.0,45.0,2.0,1.0...|
|    0|[1.0,52.0,1.0,0.0...|
|    0|[1.0,43.0,1.0,1.0...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|[1.0,46.0,1.0,1.0...|
|    0|[0.0,41.0,3.0,0.0...|
|    1|[0.0,38.0,2.0,1.0...|
|    0|[1.0,48.0,3.0,1.0...|
|    1|[0.0,46.0,2.0,1.0...|
|    0|[0.0,38.0,2.0,1.0...|
|    0|[1.0,41.0,2.0,0.0...|
|    0|[0.0,42.0,2.0,1.0...|
+-----+--------------------+
only showing top 20 rows

train set


[Stage 3:>                                                          (0 + 1) / 1]

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
|    0|(15,[1,2,9,10,11,...|
+-----+--------------------+
only showing top 20 rows



                                                                                

### Crear el modelo de regresión logística

In [23]:
from pyspark.ml.classification import LogisticRegression

# Hyper-parameters of the estimator: iteration cap and regularisation parameter.
lr_params = {"maxIter": 10, "regParam": 0.01}
lr = LogisticRegression(**lr_params)

In [25]:
# Fit the logistic regression estimator on the training split.
lr_model = lr.fit(train_df)

# Print the coefficients of the fitted model.
print(f"Coefficients: {lr_model.coefficients!s}")

# Keep a handle on the fitted model's training summary.
training_summary = lr_model.summary

25/04/25 14:50:57 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/04/25 14:50:57 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


Coefficients: [0.5607363884781358,0.058016343645115036,-0.03862329485392394,0.08452119260032259,0.013677748121026475,0.2078776671534676,0.6011410129855924,0.23123946639244108,0.1554414037106486,0.0018853859790427148,0.01344076320004709,0.0004131080816762553,0.005988276385655795,-0.001638424476252627,0.007162714925878737]


### Predicciones

In [26]:
# Apply the trained model to the test split to obtain predictions.
predictions = lr_model.transform(test_df)

# Display the feature vector, the predicted class and the probability column.
shown_columns = ["features", "prediction", "probability"]
predictions.select(*shown_columns).show()

+--------------------+----------+--------------------+
|            features|prediction|         probability|
+--------------------+----------+--------------------+
|(15,[1,2,9,10,11,...|       0.0|[0.97710931709629...|
|(15,[1,2,9,10,11,...|       0.0|[0.97113718912072...|
|(15,[1,2,9,10,11,...|       0.0|[0.96821335312546...|
|(15,[1,2,9,10,11,...|       0.0|[0.98006272989298...|
|(15,[1,2,9,10,11,...|       0.0|[0.95368213125195...|
|(15,[1,2,9,10,11,...|       0.0|[0.96837933386689...|
|(15,[1,2,9,10,11,...|       0.0|[0.97360711539862...|
|(15,[1,2,9,10,11,...|       0.0|[0.96998684828337...|
|(15,[1,2,9,10,11,...|       0.0|[0.95453168452457...|
|(15,[1,2,9,10,11,...|       0.0|[0.96422850127588...|
|(15,[1,2,9,10,11,...|       0.0|[0.97588475774837...|
|(15,[1,2,9,10,11,...|       0.0|[0.95084460829496...|
|(15,[1,2,9,10,11,...|       0.0|[0.96009768991907...|
|(15,[1,2,9,10,11,...|       0.0|[0.96503935477800...|
|(15,[1,2,9,10,11,...|       0.0|[0.97349848460212...|
|(15,[1,2,

                                                                                

### Evaluar el modelo

In [28]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# Overall metrics on the test predictions.
# NOTE: "weightedPrecision", "weightedRecall" and "f1" are averages over both
# classes weighted by class frequency. Weighted recall is by construction equal
# to accuracy, so on their own these numbers do not show how well the positive
# (label 1) cases are detected.
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

# Per-class metrics for the positive class (label 1), reported in addition to
# the weighted averages above.
positive_label = 1.0
positive_precision = evaluator.evaluate(
    predictions,
    {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: positive_label},
)
positive_recall = evaluator.evaluate(
    predictions,
    {evaluator.metricName: "recallByLabel", evaluator.metricLabel: positive_label},
)

# The first four lines keep their original wording; "Precision" and "Recall"
# there are the weighted averages computed above.
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Precision (label=1): {positive_precision}")
print(f"Recall (label=1): {positive_recall}")

                                                                                

Accuracy: 0.8299866131191432
Precision: 0.8038821954484605
Recall: 0.8299866131191433
F1 Score: 0.7697118793492835


                                                                                