# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Ejemplos de Aprendizaje Automático (Machine Learning): Logistic Regression** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [1]:
import findspark
findspark.init()

#### Creacion de la conexión con el cluster de spark


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MLSpark-Logistic-Regression") \
    .master("spark://e3b046ba856a:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/26 15:39:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Preparación de Datos

In [5]:
from team_name.spark_utils import SparkUtils

csv_path = "/home/jovyan/notebooks/data/heartDisease/framingham.csv"

# Define schema for the DataFrame
schema = SparkUtils.generate_schema([
    ("male", "float"),
    ("age", "float"),
    ("education", "float"),
    ("currentSmoker", "float"),
    ("cigsPerDay", "float"),
    ("BPMeds", "float"),
    ("prevalentStroke", "float"),
    ("prevalentHyp", "float"),
    ("diabetes", "float"),
    ("totChol", "float"),
    ("sysBP", "float"),
    ("diaBP", "float"),
    ("BMI", "float"),
    ("heartRate", "float"),
    ("glucose", "float"),
    ("TenYearCHD", "float")
])


# Convert list to a DataFrame
df = spark.read.csv(csv_path, header=True, schema=schema)
#df = spark.createDataFrame(data, schema)

df.show()

                                                                                

+----+----+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
|male| age|education|currentSmoker|cigsPerDay|BPMeds|prevalentStroke|prevalentHyp|diabetes|totChol|sysBP|diaBP|  BMI|heartRate|glucose|TenYearCHD|
+----+----+---------+-------------+----------+------+---------------+------------+--------+-------+-----+-----+-----+---------+-------+----------+
| 1.0|39.0|      4.0|          0.0|       0.0|   0.0|            0.0|         0.0|     0.0|  195.0|106.0| 70.0|26.97|     80.0|   77.0|       0.0|
| 0.0|46.0|      2.0|          0.0|       0.0|   0.0|            0.0|         0.0|     0.0|  250.0|121.0| 81.0|28.73|     95.0|   76.0|       0.0|
| 1.0|48.0|      1.0|          1.0|      20.0|   0.0|            0.0|         0.0|     0.0|  245.0|127.5| 80.0|25.34|     75.0|   70.0|       0.0|
| 0.0|61.0|      3.0|          1.0|      30.0|   0.0|            0.0|         1.0|     0.0|  225.0|150.0| 95.0|28.58| 

### Assemble the features into a single vector column

In [5]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=["feature_x1", "feature_x2"], outputCol="features")
data_with_features = assembler.transform(df).select("label", "features")

### Split the data into training and test sets 80% training data and 20% testing data

In [8]:
train_df, test_df = data_with_features.randomSplit([0.8, 0.2], seed=57)

### Show the whole dataset

In [9]:
print("Original Dataset")
data_with_features.show()

# Print train dataset
print("train set")
train_df.show()

Original Dataset
+-----+---------+
|label| features|
+-----+---------+
|  1.0|[2.0,3.0]|
|  0.0|[1.0,2.5]|
|  1.0|[3.0,5.0]|
|  0.0|[0.5,1.0]|
|  1.0|[4.0,6.0]|
+-----+---------+

train set


                                                                                

+-----+---------+
|label| features|
+-----+---------+
|  0.0|[1.0,2.5]|
|  1.0|[2.0,3.0]|
|  0.0|[0.5,1.0]|
|  1.0|[4.0,6.0]|
+-----+---------+



### Create a logistic regression model

In [10]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=10, regParam=0.01)


# TRAINNING

In [11]:
lr_model = lr.fit(train_df)

# Print coefficients
print("Coefficients: " + str(lr_model.coefficients))

# Display model summary
training_summary = lr_model.summary

25/04/25 14:32:17 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/04/25 14:32:17 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

Coefficients: [2.346116998875653,0.7963873036415706]


# PREDICTIONS

In [12]:
# Use the trained model to make predictions on the test data
predictions = lr_model.transform(test_df)

# Show predictions
predictions.select("features", "prediction", "probability").show()

+---------+----------+--------------------+
| features|prediction|         probability|
+---------+----------+--------------------+
|[3.0,5.0]|       1.0|[0.00524886113385...|
+---------+----------+--------------------+



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print(f"f1:{f1}")


In [13]:
sc.stop()