# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Carrera: Ing. en Sistemas Computacionales** </center>
---
### <center> **Primavera 2025** </center>
---

**Lab 10**: Heart attack prediction with Logistic Regression

**Fecha**: 25 de abril del 2025

**Nombre del Estudiante**: Luis, Benja, Sam

**Profesor**: Pablo Camarillo Ramirez

In [12]:
import findspark
findspark.init()

In [27]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MLSpark-Logistic-Regression") \
    .master("spark://b33dcc1265b4:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

25/04/29 23:13:00 WARN SparkContext: Another SparkContext is being constructed (or threw an exception in its constructor). This may indicate an error, since only one SparkContext should be running in this JVM (see SPARK-2243). The other SparkContext was created at:
org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:77)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:500)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:481)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
py4j.Gateway.invoke(Gateway.java:238)
py4j.command

#### Data processing

In [37]:
from gatubelxs.spark_utils import SparkUtils


def generate_schema(columns):
    return [(col, "integer") if col != "BMI" else  (col, "float") for col in columns]

In [36]:
columns = [
    "male", "age", "education", "currentSmoker", "cigsPerDay", "BPMeds",
    "prevalentStroke", "prevalentHyp", "diabetes", "totChol", "sysBP",
    "diaBP", "BMI", "heartRate", "glucose", "TenYearCHD"
]

schema = SparkUtils.generate_schema(generate_schema(columns))

ha_df = spark \
            .read \
            .schema(schema) \
            .option("header", "true") \
            .csv("/home/jovyan/notebooks/data/framingham.csv")

ha_df.show(40)

+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|male|age|education|currentSmoker|cigsPerDay|BPMeds|prevalentStroke|prevalentHyp|diabetes|totChol|sysBP|diaBP|  BMI|heartRate|glucose|TenYearCHD|
+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|   1| 39|        4|            0|         0|     0|              0|           0|       0|    195|  106|   70|26.97|       80|     77|         0|
|   0| 46|        2|            0|         0|     0|              0|           0|       0|    250|  121|   81|28.73|       95|     76|         0|
|   1| 48|        1|            1|        20|     0|              0|           0|       0|    245| NULL|   80|25.34|       75|     70|         0|
|   0| 61|        3|            1|        30|     0|              0|           1|       0|    225|  150|   95|28.58|       6

##### Replacing null elements with average

In [41]:
from pyspark.sql.functions import col, avg, isnan

#Consideramos tanto nA, como que en el csv vinieran valores vacíos

null_cols = [c for c in ha_df.columns if ha_df.filter(col(c).isNull() | isnan(col(c))).count() > 0]

averages = ha_df.select([avg(col(c)).alias(c) for c in null_cols]).first().asDict()

ha_df = ha_df.fillna(averages)

ha_df.show(40)

+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|male|age|education|currentSmoker|cigsPerDay|BPMeds|prevalentStroke|prevalentHyp|diabetes|totChol|sysBP|diaBP|  BMI|heartRate|glucose|TenYearCHD|
+----+---+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|   1| 39|        4|            0|         0|     0|              0|           0|       0|    195|  106|   70|26.97|       80|     77|         0|
|   0| 46|        2|            0|         0|     0|              0|           0|       0|    250|  121|   81|28.73|       95|     76|         0|
|   1| 48|        1|            1|        20|     0|              0|           0|       0|    245|  132|   80|25.34|       75|     70|         0|
|   0| 61|        3|            1|        30|     0|              0|           1|       0|    225|  150|   95|28.58|       6

##### Assembly into a single vector column

In [44]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=columns[:-1], outputCol="features")
data_with_features = assembler.transform(ha_df).select("TenYearCHD", "features")

##### Split the data into training and test sets

In [46]:
train_df, test_df = data_with_features.randomSplit([0.8, 0.2], seed=57)

data_with_features.show()
train_df.show()

+----------+--------------------+
|TenYearCHD|            features|
+----------+--------------------+
|         0|[1.0,39.0,4.0,0.0...|
|         0|(15,[1,2,9,10,11,...|
|         0|[1.0,48.0,1.0,1.0...|
|         1|[0.0,61.0,3.0,1.0...|
|         0|[0.0,46.0,3.0,1.0...|
|         0|[0.0,43.0,2.0,0.0...|
|         1|(15,[1,2,9,10,11,...|
|         0|[0.0,45.0,2.0,1.0...|
|         0|[1.0,52.0,1.0,0.0...|
|         0|[1.0,43.0,1.0,1.0...|
|         0|(15,[1,2,9,10,11,...|
|         0|(15,[1,2,9,10,11,...|
|         0|[1.0,46.0,1.0,1.0...|
|         0|[0.0,41.0,3.0,0.0...|
|         0|[0.0,39.0,2.0,1.0...|
|         1|[0.0,38.0,2.0,1.0...|
|         0|[1.0,48.0,3.0,1.0...|
|         1|[0.0,46.0,2.0,1.0...|
|         0|[0.0,38.0,2.0,1.0...|
|         0|[1.0,41.0,2.0,0.0...|
+----------+--------------------+
only showing top 20 rows

+----------+--------------------+
|TenYearCHD|            features|
+----------+--------------------+
|         0|(15,[1,2,9,10,11,...|
|         0|(15,[1,2,9

##### Logistic regression model 

In [58]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol="features", labelCol="TenYearCHD", maxIter=50, regParam=0.01)

In [59]:
lr_model = lr.fit(train_df)

print(f"Coefficients: {str(lr_model.coefficients)}")

training_summary = lr_model.summary

Coefficients: [0.6093336889562436,0.05686563855920877,-0.006811104913662874,0.05043862414111315,0.014680896132736732,0.15109953408809976,0.9618356797751035,0.34454921724887805,0.44311719368984537,0.002093890482430468,0.012032102806718303,0.002652074005620021,-0.004814430532123205,0.0025803732985755458,0.0037639109957434097]


In [60]:
predictions = lr_model.transform(test_df)
predictions.select("features", "prediction", "probability").show()

+--------------------+----------+--------------------+
|            features|prediction|         probability|
+--------------------+----------+--------------------+
|(15,[1,2,9,10,11,...|       0.0|[0.97204891037137...|
|(15,[1,2,9,10,11,...|       0.0|[0.97654476134385...|
|(15,[1,2,9,10,11,...|       0.0|[0.96606844595706...|
|(15,[1,2,9,10,11,...|       0.0|[0.96831352176818...|
|(15,[1,2,9,10,11,...|       0.0|[0.96756823548121...|
|(15,[1,2,9,10,11,...|       0.0|[0.96538272861746...|
|(15,[1,2,9,10,11,...|       0.0|[0.96248816198444...|
|(15,[1,2,9,10,11,...|       0.0|[0.96313737282447...|
|(15,[1,2,9,10,11,...|       0.0|[0.97389242112157...|
|(15,[1,2,9,10,11,...|       0.0|[0.97223671621664...|
|(15,[1,2,9,10,11,...|       0.0|[0.97255560566194...|
|(15,[1,2,9,10,11,...|       0.0|[0.97121528876029...|
|(15,[1,2,9,10,11,...|       0.0|[0.95236582284208...|
|(15,[1,2,9,10,11,...|       0.0|[0.97407197937777...|
|(15,[1,2,9,10,11,...|       0.0|[0.96153091018560...|
|(15,[1,2,

In [62]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="TenYearCHD", predictionCol="prediction")

f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print('f1: {}'.format(f1))

f1: 0.7950821936016481


##### Predictions

In [26]:
sc.stop()

25/04/29 23:12:23 WARN StandaloneAppClient$ClientEndpoint: Failed to connect to master b33dcc1265b4:7707
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.deploy.client.StandaloneAppClient$ClientEndpoint$$anon$1.run(StandaloneAppClient.scala:108)
	at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:539)
	at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at j