# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Machine Learning: Logistic Regression** </center>
---
**Profesor**: Pablo Camarillo Ramirez

**Estudiante**: Sergio Villa Rodriguez

## Recreate docker images and spark cluster:

Go to **spark** directory and run:
     
     sh build-images.sh
     docker compose up --scale spark-worker=1 -d

# Create SparkSession

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ML: Logistic Regression") \
    .master("spark://spark-master:7077") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("INFO")

# Optimization (reduce the number of shuffle partitions)
spark.conf.set("spark.sql.shuffle.partitions", "5")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/23 03:51:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Collect Data

In [4]:
from pcamarillor.spark_utils import SparkUtils
# Create a small dataset as a list of tuples
# Format: (label, feature_x1, feature_x2)
data = [
    (1.0, 2.0, 3.0),
    (0.0, 1.0, 2.5),
    (1.0, 3.0, 5.0),
    (0.0, 0.5, 1.0),
    (1.0, 4.0, 6.0)
]

# Define schema for the DataFrame
schema = SparkUtils.generate_schema([("label", "float"), 
                                     ("feature_x1", "float"),
                                     ("feature_x2", "float")])

# Convert list to a DataFrame
df = spark.createDataFrame(data, schema)
df.printSchema()

root
 |-- label: float (nullable = true)
 |-- feature_x1: float (nullable = true)
 |-- feature_x2: float (nullable = true)



### Assemble the features into a single vector column

In [6]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=["feature_x1", "feature_x2"], outputCol="features")
data_with_features = assembler.transform(df).select("label", "features")                 
data_with_features.printSchema()

root
 |-- label: float (nullable = true)
 |-- features: vector (nullable = true)



# Data splitting
#### 80% training data and 20% testing data

In [7]:
train_df, test_df = data_with_features.randomSplit([0.8, 0.2], seed=101)

### Show dataset (for debugging)

In [9]:
print("Original Dataset")
df.show()

# Print train dataset
print("train set")
train_df.show()

Original Dataset
+-----+----------+----------+
|label|feature_x1|feature_x2|
+-----+----------+----------+
|  1.0|       2.0|       3.0|
|  0.0|       1.0|       2.5|
|  1.0|       3.0|       5.0|
|  0.0|       0.5|       1.0|
|  1.0|       4.0|       6.0|
+-----+----------+----------+

train set
+-----+---------+
|label| features|
+-----+---------+
|  0.0|[1.0,2.5]|
|  1.0|[2.0,3.0]|
|  0.0|[0.5,1.0]|
|  1.0|[4.0,6.0]|
+-----+---------+



# Create ML Model

In [10]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=10, regParam=0.01)

# Train ML Model

In [11]:
lr_model = lr.fit(train_df)

# Print coefficients
print("Coefficients: " + str(lr_model.coefficients))

# Display model summary
training_summary = lr_model.summary

25/10/23 03:54:40 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


Coefficients: [2.346116998875653,0.7963873036415706]


## Predictions

In [12]:
# Use the trained model to make predictions on the test data
predictions = lr_model.transform(test_df)

# Show predictions
predictions.select("features", "prediction", "probability").show()

+---------+----------+--------------------+
| features|prediction|         probability|
+---------+----------+--------------------+
|[3.0,5.0]|       1.0|[0.00524886113385...|
+---------+----------+--------------------+



# Test ML Model

In [13]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label",
                            predictionCol="prediction")

accuracy = evaluator.evaluate(predictions, 
                  {evaluator.metricName: "accuracy"})
print(f"Accuracy: {accuracy}")
precision = evaluator.evaluate(predictions,
                  {evaluator.metricName: "weightedPrecision"})
print(f"Precision: {precision}")
recall = evaluator.evaluate(predictions,
                  {evaluator.metricName: "weightedRecall"})
print(f"Recall: {recall}")
f1 = evaluator.evaluate(predictions,
                {evaluator.metricName: "f1"})
print(f"F1 Score: {f1}")  

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0


# Lab 11: Logistic regression To predict heart disease

# Data collection

In [16]:
# Define schema for the DataFrame
heart_schema = SparkUtils.generate_schema([
    ("male", "int"), 
    ("age", "int"), 
    ("education", "int"), 
    ("currentSmoker", "int"), 
    ("cigsPerDay", "int"), 
    ("BPMeds", "int"), 
    ("prevalentStroke", "int"), 
    ("prevalentHyp", "int"), 
    ("diabetes", "int"), 
    ("totChol", "int"), 
    ("sysBP", "float"), 
    ("diaBP", "float"), 
    ("BMI", "float"), 
    ("heartRate", "int"), 
    ("glucose", "int"), 
    ("TenYearCHD", "int")])

# Source: https://www.kaggle.com/datasets/dileep070/heart-disease-prediction-using-logistic-regression?resource=download

heart_df = spark.read \
                .option("header", "true") \
                .schema(heart_schema) \
                .csv("/opt/spark/work-dir/data/ml/logistic_regression/framingham.csv")
heart_df.printSchema()

root
 |-- male: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- education: integer (nullable = true)
 |-- currentSmoker: integer (nullable = true)
 |-- cigsPerDay: integer (nullable = true)
 |-- BPMeds: integer (nullable = true)
 |-- prevalentStroke: integer (nullable = true)
 |-- prevalentHyp: integer (nullable = true)
 |-- diabetes: integer (nullable = true)
 |-- totChol: integer (nullable = true)
 |-- sysBP: float (nullable = true)
 |-- diaBP: float (nullable = true)
 |-- BMI: float (nullable = true)
 |-- heartRate: integer (nullable = true)
 |-- glucose: integer (nullable = true)
 |-- TenYearCHD: integer (nullable = true)



## Cleaning and transforming

In [28]:
print("Rows before cleaning")
print(heart_df.count())
print("Rows after cleaning")
clean_heart_df = heart_df.dropna()
print(clean_heart_df.count())

labeled_heart_df = clean_heart_df.withColumnRenamed("TenYearCHD", "label")

Rows before cleaning
4238
Rows after cleaning
3656


## Labeling

In [29]:
# Assembling the features
assembler = VectorAssembler(inputCols=["male", "age", "education", "currentSmoker", "cigsPerDay", "BPMeds", "prevalentStroke", "prevalentHyp",
                                      "diabetes", "totChol", "sysBP", "diaBP", "BMI", "heartRate", "glucose"], outputCol="features")
data_with_features = assembler.transform(labeled_heart_df).select("label", "features")                 
data_with_features.printSchema()

root
 |-- label: integer (nullable = true)
 |-- features: vector (nullable = true)



In [30]:
data_with_features.show(2)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0|[1.0,39.0,4.0,0.0...|
|    0|(15,[1,2,9,10,11,...|
+-----+--------------------+
only showing top 2 rows


# Data Splitting

In [31]:
train_df, test_df = data_with_features.randomSplit([0.8, 0.2], seed=101)

# Create ML Model

In [32]:
lr = LogisticRegression(maxIter=10, regParam=0.01)

# Train ML Model

In [33]:
lr_model = lr.fit(train_df)

# Print coefficients
print("Coefficients: " + str(lr_model.coefficients))

# Display model summary
training_summary = lr_model.summary

Coefficients: [0.4539178927142701,0.054300104165541485,-0.06509799386997998,0.16169985901458214,0.013998751621208247,0.11930562706105892,0.82210014964367,0.16138844067857558,0.27515577551933373,0.0019459182951618275,0.016044620986062384,0.0011611303697878152,0.00024057297349552227,-0.0030622870172235857,0.00580469636987578]


## Predictions

In [34]:
# Use the trained model to make predictions on the test data
predictions = lr_model.transform(test_df)

# Show predictions
predictions.select("features", "prediction", "probability").show()

+--------------------+----------+--------------------+
|            features|prediction|         probability|
+--------------------+----------+--------------------+
|(15,[1,2,9,10,11,...|       0.0|[0.97795597024893...|
|(15,[1,2,9,10,11,...|       0.0|[0.97573692001421...|
|(15,[1,2,9,10,11,...|       0.0|[0.97184744681278...|
|(15,[1,2,9,10,11,...|       0.0|[0.97385723753212...|
|(15,[1,2,9,10,11,...|       0.0|[0.97611352571857...|
|(15,[1,2,9,10,11,...|       0.0|[0.95632151413242...|
|(15,[1,2,9,10,11,...|       0.0|[0.97803057875254...|
|(15,[1,2,9,10,11,...|       0.0|[0.95632835689801...|
|(15,[1,2,9,10,11,...|       0.0|[0.97356786708270...|
|(15,[1,2,9,10,11,...|       0.0|[0.95742934030667...|
|(15,[1,2,9,10,11,...|       0.0|[0.96085157676045...|
|(15,[1,2,9,10,11,...|       0.0|[0.97558501393939...|
|(15,[1,2,9,10,11,...|       0.0|[0.97811790678061...|
|(15,[1,2,9,10,11,...|       0.0|[0.97675474168392...|
|(15,[1,2,9,10,11,...|       0.0|[0.96756338437975...|
|(15,[1,2,

# Test ML Model

In [35]:
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                            predictionCol="prediction")

accuracy = evaluator.evaluate(predictions, 
                  {evaluator.metricName: "accuracy"})
print(f"Accuracy: {accuracy}")
precision = evaluator.evaluate(predictions,
                  {evaluator.metricName: "weightedPrecision"})
print(f"Precision: {precision}")
recall = evaluator.evaluate(predictions,
                  {evaluator.metricName: "weightedRecall"})
print(f"Recall: {recall}")
f1 = evaluator.evaluate(predictions,
                {evaluator.metricName: "f1"})
print(f"F1 Score: {f1}")  

Accuracy: 0.8494623655913979
Precision: 0.8336178600003903
Recall: 0.8494623655913979
F1 Score: 0.8003190708656394


In [36]:
sc.stop()