# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos (Big Data)** </center>
---
### <center> **Spring 2025** </center>
---
### <center> **Laboratorio 02** </center>
---
**Professor**: Pablo Camarillo Ramirez<br>
**Alumno**: Francisco Aarón Ortega Anguiano

In [65]:
# Initialize findspark to get acces to de PySpark installation
import findspark
findspark.init()

In [66]:
from pyspark.sql import SparkSession

# Create connection to the spark cluster
spark = SparkSession.builder \
    .appName("Lab 03 - Aaron") \
    .master("spark://4956a989a50c:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()

# Create SparkContext
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [67]:
# Create a RDD with the input
posts = [
	"Excited to start learning #MachineLearning and #AI! #DataScience",
	"Just finished a great book on #BigData and #DataEngineering. #AI",
	"Attending a workshop on #PySpark and #DataScience. #BigData",
	"Exploring the world of #DeepLearning and #NeuralNetworks. #AI",
	"Working on a project using #PySpark and #Hadoop. #BigData",
	"Reading about #NaturalLanguageProcessing and #AI. #DataScience",
	"Just completed a course on #DataVisualization. #DataScience",
	"Excited about the future of #AI and #MachineLearning! #BigData",
	"Learning #DataEngineering with #PySpark. #DataScience",
	"Exploring #CloudComputing and #BigData. #AI"
]
posts_rdd = sc.parallelize(posts)

In [68]:
def extract_hashtags(post):
	# Use a list comprehension to split the post and return the ones that start with a '#'
	return [htag for htag in post.split() if htag.startswith('#')]

# Use flatMap to send the post one by one and to flatten the result.
hashtags_rdd = posts_rdd.flatMap(extract_hashtags)
hashtags_rdd.collect()

                                                                                

['#MachineLearning',
 '#AI!',
 '#DataScience',
 '#BigData',
 '#DataEngineering.',
 '#AI',
 '#PySpark',
 '#DataScience.',
 '#BigData',
 '#DeepLearning',
 '#NeuralNetworks.',
 '#AI',
 '#PySpark',
 '#Hadoop.',
 '#BigData',
 '#NaturalLanguageProcessing',
 '#AI.',
 '#DataScience',
 '#DataVisualization.',
 '#DataScience',
 '#AI',
 '#MachineLearning!',
 '#BigData',
 '#DataEngineering',
 '#PySpark.',
 '#DataScience',
 '#CloudComputing',
 '#BigData.',
 '#AI']

In [69]:
# With the list of htags that we got we can count each by the value we retrieved in the previous part
count_htags_rdd = hashtags_rdd.countByValue()
for htag, occ in count_htags_rdd.items():
	print(f"{htag}: {occ}")

#MachineLearning: 1
#AI!: 1
#DataScience: 4
#BigData: 4
#DataEngineering.: 1
#AI: 4
#PySpark: 2
#DataScience.: 1
#DeepLearning: 1
#NeuralNetworks.: 1
#Hadoop.: 1
#NaturalLanguageProcessing: 1
#AI.: 1
#DataVisualization.: 1
#MachineLearning!: 1
#DataEngineering: 1
#PySpark.: 1
#CloudComputing: 1
#BigData.: 1


In [70]:
# Map the htags_rdd so it becomes a list of tuples
mapped_htags_rdd = hashtags_rdd.map(lambda x : (x, 1))
mapped_htags_rdd.collect()

[('#MachineLearning', 1),
 ('#AI!', 1),
 ('#DataScience', 1),
 ('#BigData', 1),
 ('#DataEngineering.', 1),
 ('#AI', 1),
 ('#PySpark', 1),
 ('#DataScience.', 1),
 ('#BigData', 1),
 ('#DeepLearning', 1),
 ('#NeuralNetworks.', 1),
 ('#AI', 1),
 ('#PySpark', 1),
 ('#Hadoop.', 1),
 ('#BigData', 1),
 ('#NaturalLanguageProcessing', 1),
 ('#AI.', 1),
 ('#DataScience', 1),
 ('#DataVisualization.', 1),
 ('#DataScience', 1),
 ('#AI', 1),
 ('#MachineLearning!', 1),
 ('#BigData', 1),
 ('#DataEngineering', 1),
 ('#PySpark.', 1),
 ('#DataScience', 1),
 ('#CloudComputing', 1),
 ('#BigData.', 1),
 ('#AI', 1)]

In [71]:
# Use the list of tuples to reduce the list by each "key" and add the occurence
reduced_htag_rdd = mapped_htags_rdd.reduceByKey(lambda x, y: x + y)
reduced_htag_rdd.collect()

[('#AI!', 1),
 ('#DataScience', 4),
 ('#BigData', 4),
 ('#AI', 4),
 ('#Hadoop.', 1),
 ('#DataEngineering', 1),
 ('#CloudComputing', 1),
 ('#BigData.', 1),
 ('#MachineLearning', 1),
 ('#DataEngineering.', 1),
 ('#PySpark', 2),
 ('#DataScience.', 1),
 ('#DeepLearning', 1),
 ('#NeuralNetworks.', 1),
 ('#NaturalLanguageProcessing', 1),
 ('#AI.', 1),
 ('#DataVisualization.', 1),
 ('#MachineLearning!', 1),
 ('#PySpark.', 1)]

In [72]:
# Use map to transform the tuples into (len(key), tuple), then us groupByKey to group all the ones that have the same len
same_len_rdd = reduced_htag_rdd.map(lambda x: (len(x[0]), x)).groupByKey()

# Transform into list to see it better
# list(v) turn the iterator of values into a list for a better output
result = [(k, list(v)) for k, v in same_len_rdd.collect()]
print(result)

[(4, [('#AI!', 1), ('#AI.', 1)]), (12, [('#DataScience', 4)]), (8, [('#BigData', 4), ('#Hadoop.', 1), ('#PySpark', 2)]), (16, [('#DataEngineering', 1), ('#MachineLearning', 1), ('#NeuralNetworks.', 1)]), (26, [('#NaturalLanguageProcessing', 1)]), (3, [('#AI', 4)]), (15, [('#CloudComputing', 1)]), (9, [('#BigData.', 1), ('#PySpark.', 1)]), (17, [('#DataEngineering.', 1), ('#MachineLearning!', 1)]), (13, [('#DataScience.', 1), ('#DeepLearning', 1)]), (19, [('#DataVisualization.', 1)])]


In [73]:
# Stop the SparkContext
sc.stop()