# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Lab10 Grupo Foraneos - Ejemplo de Machine Learning with Heart Attack Dataset** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [1]:
# Run this before any pyspark import: findspark.init() is expected to locate the
# Spark installation so the `pyspark` imports in the following cells resolve.
import findspark
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [2]:
import os

from pyspark.sql import SparkSession

# The default master URL embeds what looks like a container-ID hostname, which
# stops resolving once the cluster is recreated. Allow overriding it through the
# SPARK_MASTER_URL environment variable so the notebook can be re-run without
# editing this cell; the original URL is kept as the fallback.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://0638c7435d1d:7077")

# Build (or reuse) the session for this lab and pin the Spark UI to port 4040.
spark = (
    SparkSession.builder
    .appName("lab10_foraneos_Heart_Attacks")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Handle to the underlying SparkContext, used by the final cell to shut down.
sc = spark.sparkContext

# Use 5 shuffle partitions for Spark SQL stages in this session.
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/24 04:11:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Preparación de Datos

In [3]:
from foraneos.spark_utils import SparkUtils

# Schema for the heart-disease CSV: 15 predictor columns followed by the target
# column ("label"). Every column is read as a float, so the (name, type) pairs
# are generated from the ordered list of column names.
schema = SparkUtils.generate_schema([
    (column_name, "float")
    for column_name in (
        "male", "age", "education", "currentSmoker", "cigsPerDay",
        "BPMeds", "prevalentStroke", "prevalentHyp", "diabetes",
        "totChol", "sysBP", "diaBP", "BMI", "heartRate",
        "glucose", "label",
    )
])


In [4]:
# Load the Framingham CSV with the explicit schema built in the previous cell.
# The first line is treated as a header, and mode=dropMalformed asks Spark to
# discard rows that cannot be parsed against that schema.
df = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .option("mode", "dropMalformed")
    .csv("/home/jovyan/notebooks/data/heart_disease_dataset/framingham.csv")
)
                

### Assemble the features into a single vector column

In [5]:
from pyspark.ml.feature import VectorAssembler

# Pack the 15 predictor columns into a single vector column named "features",
# in the order listed here.
assembler = VectorAssembler(
    inputCols=[
        "male", "age", "education", "currentSmoker", "cigsPerDay",
        "BPMeds", "prevalentStroke", "prevalentHyp", "diabetes", "totChol",
        "sysBP", "diaBP", "BMI", "heartRate", "glucose",
    ],
    outputCol="features",
)

# Keep only the target and the assembled vector for modelling.
data_with_features = (
    assembler
    .transform(df)
    .select("label", "features")
)

### Split the data into training (80%) and test (20%) sets

In [6]:
# 80% of the rows go to training and 20% to testing; the seed is fixed at 57.
train_df, test_df = data_with_features.randomSplit(
    weights=[0.8, 0.2],
    seed=57,
)

### Preview the first rows of the full dataset and of the training set

In [7]:
# Preview the first 5 rows of the full assembled dataset.
print("Original Dataset")
data_with_features.show(n=5)

# Preview the first 5 rows of the training split.
print("train set")
train_df.show(n=5)

Original Dataset


25/04/24 04:12:09 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[1.0,39.0,4.0,0.0...|
|  0.0|(15,[1,2,9,10,11,...|
|  0.0|[1.0,48.0,1.0,1.0...|
|  1.0|[0.0,61.0,3.0,1.0...|
|  0.0|[0.0,46.0,3.0,1.0...|
+-----+--------------------+
only showing top 5 rows

train set


[Stage 1:>                                                          (0 + 1) / 1]

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(15,[1,2,9,10,11,...|
|  0.0|(15,[1,2,9,10,11,...|
|  0.0|(15,[1,2,9,10,11,...|
|  0.0|(15,[1,2,9,10,11,...|
|  0.0|(15,[1,2,9,10,11,...|
+-----+--------------------+
only showing top 5 rows



                                                                                

### Create a logistic regression model

In [8]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression estimator: at most 100 iterations, regularization
# parameter 0.01. All other parameters keep the library defaults.
lr = LogisticRegression(
    maxIter=100,
    regParam=0.01,
)


# TRAINING

In [9]:
# Fit the logistic regression on the training split.
lr_model = lr.fit(train_df)

# Print coefficients (one weight per entry of the assembled feature vector).
print(f"Coefficients: {lr_model.coefficients}")

# Display model summary. The summary object was previously assigned but never
# shown; report the training-set metrics it exposes. Area under ROC is printed
# next to accuracy because accuracy alone can look good when one class
# dominates the labels.
training_summary = lr_model.summary
print(f"Training accuracy: {training_summary.accuracy}")
print(f"Training area under ROC: {training_summary.areaUnderROC}")

25/04/24 04:12:27 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/04/24 04:12:27 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


Coefficients: [0.5608404873684583,0.05802837616043308,-0.038562598508306775,0.08503806682220667,0.013656021187291341,0.20835251289475304,0.6024765564909464,0.23142105145408695,0.15477113002466175,0.0018859659559791388,0.01343412459820667,0.00040377612585452315,0.006021704430583728,-0.0016371097819261652,0.007161938013114953]


# PREDICTIONS

In [10]:

# Score the held-out test split with the fitted model.
predictions = lr_model.transform(test_df)

# Show the feature vector, predicted class, true label and class probabilities
# side by side.
(
    predictions
    .select("features", "prediction", "label", "probability")
    .show()
)

+--------------------+----------+-----+--------------------+
|            features|prediction|label|         probability|
+--------------------+----------+-----+--------------------+
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.97710645470327...|
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.97115258375690...|
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.96822287680679...|
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.98006475236131...|
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.95370744128282...|
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.96838754086303...|
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.97361947558994...|
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.96999612145446...|
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.95454761603023...|
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.96424059345689...|
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.97588594262403...|
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.95086485263690...|
|(15,[1,2,9,10,11,...|       0.0|  0.0|[0.96010573057048...|
|(15,[1,2,9,10,11,...|  

In [11]:
# Shut down through the session handle: SparkSession.stop() stops the underlying
# SparkContext and also resets the cached session, so re-running the notebook
# creates a fresh session instead of reusing a stopped one.
spark.stop()