# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Ejemplos de Operaciones Map-Reduce** </center>
---
**Profesor**: Pablo Camarillo Ramirez

#### Encontrar la instalación de PySpark

In [31]:
import findspark
# Locate the Spark installation and make PySpark importable from this kernel.
# This runs before the `pyspark` import in the next cell so that it can be resolved.
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [32]:
import os

from pyspark.sql import SparkSession

# The default master URL uses what looks like a Docker container hostname,
# which changes whenever that container is recreated. Set the SPARK_MASTER_URL
# environment variable to point the notebook at a different master without
# editing this cell; when it is unset the original URL is used.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://f04d2745dc57:7077")

# Build (or reuse) the session for these examples.
spark = (
    SparkSession.builder
    .appName("Map-Reduce-Examples")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Create SparkContext: the RDD API used by every example below is reached
# through `sc`.
sc = spark.sparkContext
# Only show ERROR-level log messages in the cell outputs.
sc.setLogLevel("ERROR")

### Example 1: Convert Temperatures from Celsius to Fahrenheit
This example demonstrates how to use `map()` to transform data.

In [33]:
# Create an RDD of temperatures in Celsius
# Sample input: five temperatures, in degrees Celsius.
temperatures_celsius = [0, 10, 20, 30, 40]
# Distribute the list as an RDD; the Map Phase cell below reads it as `rdd`.
rdd = sc.parallelize(temperatures_celsius)

#### Map Phase

In [36]:
# Define a function to convert Celsius to Fahrenheit
def celsius_to_fahrenheit(celsius):
    """Convert a temperature from degrees Celsius to degrees Fahrenheit.

    Args:
        celsius: Temperature in degrees Celsius (a number).

    Returns:
        The equivalent temperature in degrees Fahrenheit, computed as
        ``celsius * 9 / 5 + 32``.
    """
    scaled = celsius * 9 / 5
    return scaled + 32

In [37]:
# Apply the map transformation: celsius_to_fahrenheit is called on each
# element of `rdd`, producing a new RDD that holds the converted values.
temperatures_fahrenheit = rdd.map(celsius_to_fahrenheit)

#### Reduce Phase

In [38]:
# Collect and print the results.
# collect() returns the RDD's elements as a Python list. For the input
# [0, 10, 20, 30, 40] the converted values are
# [32.0, 50.0, 68.0, 86.0, 104.0].
print("Temperatures in Fahrenheit:", temperatures_fahrenheit.collect())

ERROR:root:KeyboardInterrupt while sending command.][Stage 2:>    (0 + 0) / 2]
Traceback (most recent call last):
  File "/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

[Stage 0:>    (0 + 0) / 2][Stage 1:>    (0 + 0) / 2][Stage 2:>    (0 + 0) / 2]

### Example 2: Parse Log Files to Extract Timestamps

In [18]:
# Create an RDD of log entries.
# Each entry has the form "<date> <time> <LEVEL>: <message>", with a single
# space between the fields.
logs = [
    "2023-10-01 12:00:00 INFO: System started",
    "2023-10-01 12:05:00 ERROR: Disk full",
    "2023-10-01 12:10:00 INFO: Backup completed"
]
# Note: this rebinds `rdd`, replacing the temperatures RDD from Example 1.
rdd = sc.parallelize(logs)

#### Map Phase

In [19]:
# Map each log entry to its time-of-day field. Splitting on single spaces
# yields [date, time, level, ...], so index 1 is the time (e.g. "12:00:00");
# the date at index 0 is not part of the result.
timestamps = rdd.map(lambda log : log.split(" ")[1]) # Extract the time part

#### Reduce Phase

In [None]:
# Collect and print the results.
# For the three sample entries the extracted values are
# ['12:00:00', '12:05:00', '12:10:00'].
print("Extracted Timestamps:", timestamps.collect())

### Example 3: Split a string
This example demonstrates how to use `flatMap()` to map each element to an iterable of values and flatten all of those iterables into a single RDD.

In [7]:
# Create an RDD with sentences: each element is one whole sentence string.
sentences = ["Hello world", "Big Data is fun", "PySpark is powerful"]
sentences_rdd = sc.parallelize(sentences)

In [8]:
def split_into_words(sentence):
    """Split a sentence into its words.

    Args:
        sentence: The string to split.

    Returns:
        A list of the whitespace-separated words in ``sentence``; an empty
        list when the sentence contains no words.
    """
    words = sentence.split()
    return words

# Use flatMap to apply the function and flatten the results: each sentence
# yields a list of words, and the elements of those lists become the elements
# of `words_rdd` (one word per element, rather than one list per sentence).
words_rdd = sentences_rdd.flatMap(split_into_words)

In [None]:
# Collect and print the results.
# The three sample sentences contain nine words in total:
# Hello, world, Big, Data, is, fun, PySpark, is, powerful.
print("Extracted Words:", words_rdd.collect())

### Example 4: Adding up items by key

In [None]:
# (key, value) pairs: a fruit name and a quantity. Each key appears twice.
data = [
    ('apple', 1),
    ('banana', 2),
    ('apple', 3),
    ('orange', 4),
    ('banana', 1),
    ('orange', 2)
]
rdd = sc.parallelize(data)

# Merge the values of each key with the given function; here the values are
# added, giving one (key, total) pair per distinct key.
reduced_rdd = rdd.reduceByKey(lambda a, b: a + b)
# For the data above the totals are apple=4, banana=3, orange=6.
# NOTE(review): the order of the returned pairs is presumably not guaranteed
# to follow the input order - confirm before relying on it.
reduced_rdd.collect()

### Example 5: Grouping Strings

In [14]:
# (category, item) pairs; the category string is the grouping key.
data = [
       ('fruit', 'apple'),
       ('fruit', 'banana'),
       ('vegetable', 'carrot'),
       ('fruit', 'orange'),
       ('vegetable', 'broccoli')
]
rdd = sc.parallelize(data)

# Gather all values that share a key into one (key, values) pair.
grouped_rdd = rdd.groupByKey()
result = grouped_rdd.collect()
for key, values in result:
    # `values` is converted with list() so that the items themselves are
    # printed. The keys are not necessarily printed in input order (the saved
    # output lists 'vegetable' before 'fruit').
    print(f"{key}: {list(values)}")

vegetable: ['carrot', 'broccoli']
fruit: ['apple', 'banana', 'orange']


### Example 6: Counting words

In [15]:
# Input words, with repetitions, whose occurrences will be counted.
words = ["apple", "banana", "apple", "orange",
         "banana", "apple", "orange", "orange"]
# Note: this rebinds `words_rdd`, replacing the RDD created in Example 3.
words_rdd = sc.parallelize(words)

# countByValue() yields a mapping from each distinct element to the number of
# times it occurs; it is iterated below with .items().
word_counts = words_rdd.countByValue()
for word, count in word_counts.items():
    print(f"{word}: {count}")

apple: 3
banana: 2
orange: 3


### Example 7: Filter even numbers

In [16]:
# Integers 1 through 10 as the input dataset.
numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
numbers_rdd = sc.parallelize(numbers)

# Keep only the elements for which the predicate is true, i.e. the numbers
# that leave no remainder when divided by 2.
even_numbers_rdd = numbers_rdd.filter(lambda x: x % 2 == 0)
even_numbers_rdd.collect()

[2, 4, 6, 8, 10]

In [18]:
# Stop the SparkSession created at the top of the notebook. Stopping the
# session also stops its underlying SparkContext (`sc`), so no separate
# sc.stop() call is needed. Run this cell last: the examples above cannot be
# re-run afterwards without re-creating the session.
spark.stop()