# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Heart attack prediction with Logistic Regression ** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

---
**Alumnos**: David Abraham Naranjo, Benjamin Zarate y Angel Cortes

In [2]:
import findspark
findspark.init()

#### Creacion de la conexión con el cluster de spark


In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Heart attack prediction with Logistic Regression") \
    .master("spark://spark-master:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/30 16:12:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Leer el csv

In [4]:
from team_name.spark_utils import SparkUtils

schema = SparkUtils.generate_schema(
    [
        ("male", "integer"),
        ("age", "integer"),
        ("education", "integer"),
        ("currentSmoker", "integer"),
        ("cigsPerDay", "integer"),
        ("BPMeds", "integer"),
        ("prevalentStroke", "integer"),
        ("prevalentHyp", "integer"),
        ("diabetes", "integer"),
        ("totChol", "float"),
        ("sysBP", "float"),
        ("diaBP", "float"),
        ("BMI", "float"),
        ("heartRate", "float"),
        ("glucose", "float"),
        ("TenYearCHD", "integer"),
    ]
)

data = (
    spark.read.schema(schema)
    .option("header", "true")
    .csv("/home/jovyan/notebooks/data/framingham.csv")
)

data.printSchema()
data.show(5)

root
 |-- male: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- education: integer (nullable = true)
 |-- currentSmoker: integer (nullable = true)
 |-- cigsPerDay: integer (nullable = true)
 |-- BPMeds: integer (nullable = true)
 |-- prevalentStroke: integer (nullable = true)
 |-- prevalentHyp: integer (nullable = true)
 |-- diabetes: integer (nullable = true)
 |-- totChol: float (nullable = true)
 |-- sysBP: float (nullable = true)
 |-- diaBP: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- heartRate: float (nullable = true)
 |-- glucose: float (nullable = true)
 |-- TenYearCHD: integer (nullable = true)



                                                                                

+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|male|age|education|currentSmoker|cigsPerDay|BPMeds|prevalentStroke|prevalentHyp|diabetes|totChol|sysBP|diaBP|  BMI|heartRate|glucose|TenYearCHD|
+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|   1| 39|        4|            0|         0|     0|              0|           0|       0|  195.0|106.0| 70.0|26.97|     80.0|   77.0|         0|
|   0| 46|        2|            0|         0|     0|              0|           0|       0|  250.0|121.0| 81.0|28.73|     95.0|   76.0|         0|
|   1| 48|        1|            1|        20|     0|              0|           0|       0|  245.0|127.5| 80.0|25.34|     75.0|   70.0|         0|
|   0| 61|        3|            1|        30|     0|              0|           1|       0|  225.0|150.0| 95.0|28.58|     65.

### Limpiar valores nulos

In [7]:
from pyspark.sql.functions import col

data.describe().show()

data.select([col(c).isNull().cast("int").alias(c) for c in data.columns]).groupBy().sum().show()

data = data.fillna(0)

data.select([col(c).isNull().cast("int").alias(c) for c in data.columns]).groupBy().sum().show()

+-------+-------------------+-----------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+--------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+
|summary|               male|              age|         education|      currentSmoker|        cigsPerDay|             BPMeds|     prevalentStroke|       prevalentHyp|            diabetes|           totChol|             sysBP|             diaBP|              BMI|         heartRate|           glucose|        TenYearCHD|
+-------+-------------------+-----------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+--------------------+------------------+------------------+------------------+-----------------+------------------+------------------+------------------+
|  count|               4238|           

### Assemble the features into a single vector column

In [8]:
from pyspark.ml.feature import VectorAssembler

feature_cols = [c for c in data.columns if c != "TenYearCHD"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(data)

### Split the data into training and test sets 80% training data and 20% testing data

In [9]:
train_df, test_df = data.randomSplit([0.8, 0.2], seed=42)

### Show the whole dataset

In [10]:
print("Original Dataset")
data.show()

# Print train dataset
print("train set")
train_df.show()

Original Dataset


                                                                                

+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+--------------------+
|male|age|education|currentSmoker|cigsPerDay|BPMeds|prevalentStroke|prevalentHyp|diabetes|totChol|sysBP|diaBP|  BMI|heartRate|glucose|TenYearCHD|            features|
+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+--------------------+
|   1| 39|        4|            0|         0|     0|              0|           0|       0|  195.0|106.0| 70.0|26.97|     80.0|   77.0|         0|[1.0,39.0,4.0,0.0...|
|   0| 46|        2|            0|         0|     0|              0|           0|       0|  250.0|121.0| 81.0|28.73|     95.0|   76.0|         0|(15,[1,2,9,10,11,...|
|   1| 48|        1|            1|        20|     0|              0|           0|       0|  245.0|127.5| 80.0|25.34|     75.0|   70.0|         0|[1.0,48.0,1.0,1.0...

### Create a logistic regression model

In [11]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol="features", labelCol="TenYearCHD", maxIter=100)

# TRAINNING

In [12]:
model = lr.fit(train_df)

# Print coefficients
print("Coefficients: " + str(model.coefficients))

# Display model summary
training_summary = model.summary

25/04/30 16:22:56 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/04/30 16:22:56 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


Coefficients: [0.49600779554261915,0.06095531253445182,-0.026815511962488,0.013034253998157474,0.021949928355587765,0.18445535726571075,0.912173654589193,0.3525868468486528,0.3581601636918068,0.0009391287081768145,0.013558974997571725,-0.0008623652672830381,-0.019601734725967844,-0.0009128447340946759,0.004237402302801891]


# PREDICTIONS

In [13]:
predictions = model.transform(test_df)

predictions.select("features", "prediction", "probability", "TenYearCHD").show(5)

+--------------------+----------+--------------------+----------+
|            features|prediction|         probability|TenYearCHD|
+--------------------+----------+--------------------+----------+
|[0.0,33.0,2.0,1.0...|       0.0|[0.97226999013496...|         0|
|[0.0,34.0,1.0,1.0...|       0.0|[0.95842432556558...|         0|
|(15,[1,2,9,10,11,...|       0.0|[0.97482319150701...|         0|
|[0.0,34.0,2.0,1.0...|       0.0|[0.95759650690736...|         0|
|(15,[1,2,9,10,11,...|       0.0|[0.97606521954991...|         0|
+--------------------+----------+--------------------+----------+
only showing top 5 rows



### Evaluar el modelo

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="TenYearCHD", predictionCol="prediction")

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.8605527638190955
Precision: 0.8404992188914064
Recall: 0.8605527638190954
F1 Score: 0.8132224456377826


In [15]:
sc.stop()