# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Logistic Regression, Decision Trees and Vector Machines** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

**Integrantes**:
- Lorena Ruelas Gaytán
- Yael Alejandro Rodríguez Barreto
- Ximena Isaac Horta
- Alberto Renteria Camacho

In [30]:
import findspark
findspark.init()

In [None]:
SPARK_ID = "7e9458d7d568"

#### Spark Conexion


In [32]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MLSpark-Supervised") \
    .master(f"spark://{SPARK_ID}:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

#### Extraction of data

In [33]:
data_path = "/home/jovyan/notebooks/data/parquet/"

raw_df = spark.read \
    .parquet(data_path)

print("Rows:", raw_df.count(), "Cols:", len(raw_df.columns))
raw_df.show(10)

                                                                                

Rows: 1580 Cols: 8


[Stage 4:>                                                          (0 + 1) / 1]

+----+--------------------+-------+---------+------+--------------------+-------------+--------------------+
| key|               value|  topic|partition|offset|           timestamp|timestampType|           value_str|
+----+--------------------+-------+---------+------+--------------------+-------------+--------------------+
|NULL|[7B 22 74 77 65 6...|tweet-2|        0|     4|2025-05-07 18:35:...|            0|{"tweet_id": "5db...|
|NULL|[7B 22 74 77 65 6...|tweet-2|        0|     5|2025-05-07 18:35:...|            0|{"tweet_id": "d7b...|
|NULL|[7B 22 74 77 65 6...|tweet-2|        0|     6|2025-05-07 18:35:...|            0|{"tweet_id": "621...|
|NULL|[7B 22 74 77 65 6...|tweet-2|        0|     7|2025-05-07 18:35:...|            0|{"tweet_id": "707...|
|NULL|[7B 22 74 77 65 6...|tweet-2|        0|     8|2025-05-07 18:35:...|            0|{"tweet_id": "bc8...|
|NULL|[7B 22 74 77 65 6...|tweet-2|        0|     9|2025-05-07 18:35:...|            0|{"tweet_id": "738...|
|NULL|[7B 22 74 77 

                                                                                

#### Data preparation

In [34]:
from equipo.spark_utils import SparkUtils
from pyspark.sql.functions import from_json

headers = [
        ("tweet_id", "string"),
        ("user_id", "integer"),
        ("timestamp", "string"),
        ("text", "string"),
        ("hashtags", "string"),
        ("mentions", "string"),
        ("retweet_count", "integer"),
        ("favorite_count", "integer"),
        ("reply_count", "integer"),
        ("quote_count", "integer"),
        ("views", "integer")
]

schema = SparkUtils.generate_schema([(head[0], head[1]) for head in headers])

tweets_df = raw_df.select(from_json(raw_df.value_str, schema).alias("data")).select("data.*")

print("Rows:", tweets_df.count(), "Cols:", len(tweets_df.columns))
tweets_df.printSchema()
tweets_df.show(10)

                                                                                

Rows: 1580 Cols: 11
root
 |-- tweet_id: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- text: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- mentions: string (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- favorite_count: integer (nullable = true)
 |-- reply_count: integer (nullable = true)
 |-- quote_count: integer (nullable = true)
 |-- views: integer (nullable = true)



[Stage 8:>                                                          (0 + 1) / 1]

+--------------------+-------+--------------------+--------------------+--------------------+--------------------+-------------+--------------+-----------+-----------+-----+
|            tweet_id|user_id|           timestamp|                text|            hashtags|            mentions|retweet_count|favorite_count|reply_count|quote_count|views|
+--------------------+-------+--------------------+--------------------+--------------------+--------------------+-------------+--------------+-----------+-----------+-----+
|5dbb99f0-e6c9-4f1...|      2|2024-11-04T17:08:...|Distinctio ration...|                  []|["camila48","sote...|          412|           259|          1|         25|10413|
|d7b832b2-88a8-423...|      2|2025-02-09T22:14:...|Minima labore rep...|["magnam","vel","...|        ["kmaestas"]|          438|           165|         46|         36|15013|
|6219028b-becf-4eb...|      2|2025-01-04T16:59:...|Est asperiores qu...|["consequatur","s...| ["esquiveljoaquin"]|           53|  

                                                                                

#### Assemble the features into a single vector column

In [35]:
from pyspark.ml.feature import VectorAssembler

numeric_cols = [
    "retweet_count", "favorite_count", "reply_count", "quote_count", "views"
]

tweets_df = tweets_df.withColumnRenamed("user_id", "label")

assembler = VectorAssembler(inputCols=numeric_cols, outputCol="features")
data_with_features = assembler.transform(tweets_df).select("label", "features")

#### Split the data into training and test sets 80% training data and 20% testing data

In [None]:
train_df, test_df = data_with_features.randomSplit([0.8, 0.2], seed=57)

#### Show the whole dataset

In [37]:
print("Original Dataset")
data_with_features.show(6)

print("Train set")
train_df.show(6)

Original Dataset


                                                                                

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    2|[412.0,259.0,1.0,...|
|    2|[438.0,165.0,46.0...|
|    2|[53.0,718.0,88.0,...|
|    2|[388.0,660.0,47.0...|
|    2|[38.0,657.0,60.0,...|
|    2|[115.0,614.0,96.0...|
+-----+--------------------+
only showing top 6 rows

Train set


[Stage 10:>                                                         (0 + 1) / 1]

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    1|[2.0,560.0,3.0,50...|
|    1|[12.0,199.0,4.0,4...|
|    1|[33.0,158.0,9.0,3...|
|    1|[33.0,331.0,60.0,...|
|    1|[44.0,72.0,31.0,1...|
|    1|[60.0,465.0,32.0,...|
+-----+--------------------+
only showing top 6 rows



                                                                                

---
# Logistic Regression

#### Create a Logistic Regression model and OnevsAll

In [39]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import OneVsRest

lr = LogisticRegression(maxIter=10, regParam=0.01)
ovr_lr = OneVsRest(classifier=lr)

#### Training

In [40]:
lr_model = ovr_lr.fit(train_df)

25/05/08 00:40:38 WARN Instrumentation: [c701812c] All labels are the same value and fitIntercept=true, so the coefficients will be zeros. Training is not needed.
                                                                                

#### Predictions

In [42]:
# Use the trained model to make predictions on the test data
predictions_lr = lr_model.transform(test_df)

# Show predictions
predictions_lr.select("features", "prediction").show()

[Stage 90:>                                                         (0 + 1) / 1]

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[246.0,948.0,3.0,...|       3.0|
|[250.0,332.0,38.0...|       4.0|
|[280.0,834.0,17.0...|       3.0|
|[314.0,435.0,82.0...|       2.0|
|[363.0,998.0,42.0...|       3.0|
|[57.0,506.0,96.0,...|       2.0|
|[94.0,712.0,41.0,...|       3.0|
|[97.0,68.0,11.0,1...|       4.0|
|[218.0,982.0,91.0...|       2.0|
|[223.0,205.0,36.0...|       1.0|
|[264.0,208.0,29.0...|       4.0|
|[289.0,412.0,67.0...|       1.0|
|[365.0,1000.0,39....|       3.0|
|[403.0,491.0,2.0,...|       1.0|
|[445.0,573.0,75.0...|       2.0|
|[464.0,654.0,51.0...|       3.0|
|[471.0,494.0,30.0...|       1.0|
|[473.0,25.0,41.0,...|       1.0|
|[1.0,151.0,72.0,3...|       1.0|
|[76.0,587.0,2.0,3...|       3.0|
+--------------------+----------+
only showing top 20 rows



                                                                                

---
# Decision Tree Model

#### Create a Decision Tree model and OnevsAll

In [43]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import OneVsRest

# Initialize and train the Decision Tree model
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
ovr_tree = OneVsRest(classifier=dt)


### Trainning

In [44]:
# Training, jeje
dt_model = ovr_tree.fit(train_df)

                                                                                

#### Predictions

In [46]:
# Use the trained model to make predictions on the test data
predictions_dt = dt_model.transform(test_df)

# Show predictions
predictions_dt.select("features", "prediction").show()

25/05/08 00:53:42 WARN TaskSetManager: Lost task 0.0 in stage 173.0 (TID 1291) (172.18.0.3 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/python/pyspark/ml/classification.py", line 3818, in <lambda>
    lambda predictions, prediction: predictions + [prediction.tolist()[1]],
                                                   ~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:94)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Ite

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/python/pyspark/ml/classification.py", line 3818, in <lambda>
    lambda predictions, prediction: predictions + [prediction.tolist()[1]],
                                                   ~~~~~~~~~~~~~~~~~~~^^^
IndexError: list index out of range


---
# Support Vector Machines 

#### Create Support Vector Machines and OnevsAll

In [47]:
from pyspark.ml.classification import LinearSVC

# Initialize and train the Decision SVM
lsvc = LinearSVC(maxIter=10, regParam=0.01)
ovr_vector = OneVsRest(classifier=lsvc)

#### Training

In [48]:
svm_model = ovr_vector.fit(data_with_features)  

25/05/08 00:56:05 ERROR Instrumentation: java.lang.IllegalArgumentException: requirement failed: LinearSVC only supports binary classification. 1 classes detected in LinearSVC_3fbd64c1a758__labelCol
	at scala.Predef$.require(Predef.scala:337)
	at org.apache.spark.ml.classification.LinearSVC.$anonfun$train$1(LinearSVC.scala:217)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:210)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LinearSVC.train(LinearSVC.scala:172)
	at org.apache.spark.ml.classification.LinearSVC.train(LinearSVC.scala:77)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodA

IllegalArgumentException: requirement failed: LinearSVC only supports binary classification. 1 classes detected in LinearSVC_3fbd64c1a758__labelCol

#### Predictions

In [None]:
predictions_svm = svm_model.transform(test_df)
predictions_dt.select("features", "prediction").show(20)

---
# Models Testing

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

### Logistic
print("\n *** LOGISTIC ***")
accuracy = evaluator.evaluate(predictions_lr, {evaluator.metricName: "accuracy"})
print(f"Accuracy: {accuracy}")
precision = evaluator.evaluate(predictions_lr, {evaluator.metricName: "weightedPrecision"})
print(f"Precision: {precision}")
recall = evaluator.evaluate(predictions_lr, {evaluator.metricName: "weightedRecall"})
print(f"Recall: {recall}")
f1 = evaluator.evaluate(predictions_lr, {evaluator.metricName: "f1"})
print(f"F1 Score: {f1}")

### Tree
print("\n *** TREE ***")
accuracy = evaluator.evaluate(predictions_dt, {evaluator.metricName: "accuracy"})
print(f"Accuracy: {accuracy}")
precision = evaluator.evaluate(predictions_dt, {evaluator.metricName: "weightedPrecision"})
print(f"Precision: {precision}")
recall = evaluator.evaluate(predictions_dt, {evaluator.metricName: "weightedRecall"})
print(f"Recall: {recall}")
f1 = evaluator.evaluate(predictions_dt, {evaluator.metricName: "f1"})
print(f"F1 Score: {f1}")  

### Vector
print("\n *** VECTOR ***")
accuracy = evaluator.evaluate(predictions_svm, {evaluator.metricName: "accuracy"})
print(f"Accuracy: {accuracy}")
precision = evaluator.evaluate(predictions_svm, {evaluator.metricName: "weightedPrecision"})
print(f"Precision: {precision}")
recall = evaluator.evaluate(predictions_svm, {evaluator.metricName: "weightedRecall"})
print(f"Recall: {recall}")
f1 = evaluator.evaluate(predictions_svm, {evaluator.metricName: "f1"})
print(f"F1 Score: {f1}") 

In [49]:
sc.stop()