In [1]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Map-Reduce-Examples") \
    .master("spark://3945b932edc5:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()

spark.conf.set("spark.sql.shuffle.partitions", "1")

# Create SparkContext
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [5]:
post = [
    "Excited to start learning #MachineLearning and #AI! #DataScience",
    "Just finished a great book on #BigData and #DataEngineering. #AI",
    "Attending a workshop on #PySpark and #DataScience. #BigData",
    "Exploring the world of #DeepLearning and #NeuralNetworks. #AI",
    "Working on a project using #PySpark and #Hadoop. #BigData",
    "Reading about #NaturalLanguageProcessing and #AI. #DataScience",
    "Just completed a course on #DataVisualization. #DataScience",
    "Excited about the future of #AI and #MachineLearning! #BigData",
    "Learning #DataEngineering with #PySpark. #DataScience",
    "Exploring #CloudComputing and #BigData. #AI"
]
posts_rdd = sc.parallelize(post)

In [6]:
def extract_hashtags(post):
    return [htag for htag in post.split() if htag.startswith('#')]

hashtags_rdd = posts_rdd.flatMap(extract_hashtags)
hashtags_rdd.collect()

                                                                                

['#MachineLearning',
 '#AI!',
 '#DataScience',
 '#BigData',
 '#DataEngineering.',
 '#AI',
 '#PySpark',
 '#DataScience.',
 '#BigData',
 '#DeepLearning',
 '#NeuralNetworks.',
 '#AI',
 '#PySpark',
 '#Hadoop.',
 '#BigData',
 '#NaturalLanguageProcessing',
 '#AI.',
 '#DataScience',
 '#DataVisualization.',
 '#DataScience',
 '#AI',
 '#MachineLearning!',
 '#BigData',
 '#DataEngineering',
 '#PySpark.',
 '#DataScience',
 '#CloudComputing',
 '#BigData.',
 '#AI']

In [7]:
mapped_hashtags_rdd = hashtags_rdd.map(lambda x: (x, 1))
mapped_hashtags_rdd.collect()

[('#MachineLearning', 1),
 ('#AI!', 1),
 ('#DataScience', 1),
 ('#BigData', 1),
 ('#DataEngineering.', 1),
 ('#AI', 1),
 ('#PySpark', 1),
 ('#DataScience.', 1),
 ('#BigData', 1),
 ('#DeepLearning', 1),
 ('#NeuralNetworks.', 1),
 ('#AI', 1),
 ('#PySpark', 1),
 ('#Hadoop.', 1),
 ('#BigData', 1),
 ('#NaturalLanguageProcessing', 1),
 ('#AI.', 1),
 ('#DataScience', 1),
 ('#DataVisualization.', 1),
 ('#DataScience', 1),
 ('#AI', 1),
 ('#MachineLearning!', 1),
 ('#BigData', 1),
 ('#DataEngineering', 1),
 ('#PySpark.', 1),
 ('#DataScience', 1),
 ('#CloudComputing', 1),
 ('#BigData.', 1),
 ('#AI', 1)]

In [20]:
counted_hashtags_rdd = hashtags_rdd.countByValue()
for htag, count in counted_hashtags_rdd.items():
    print(f"{htag}: {count}")

#MachineLearning: 1
#AI!: 1
#DataScience: 4
#BigData: 4
#DataEngineering.: 1
#AI: 4
#PySpark: 2
#DataScience.: 1
#DeepLearning: 1
#NeuralNetworks.: 1
#Hadoop.: 1
#NaturalLanguageProcessing: 1
#AI.: 1
#DataVisualization.: 1
#MachineLearning!: 1
#DataEngineering: 1
#PySpark.: 1
#CloudComputing: 1
#BigData.: 1


                                                                                

In [35]:
# Convertir el diccionario en un RDD de tuplas (recuento, hashtag)
counted_hashtags_rdd = sc.parallelize([(count, htag) for htag, count in counted_hashtags_rdd.items()])

# Agrupar los hashtags por su recuento
grouped_hashtags_rdd = counted_hashtags_rdd.groupByKey()

# Mostrar los resultados
for count, hashtags in grouped_hashtags_rdd.collect():
    print(f"{count}: {list(hashtags)}")


4: ['#DataScience', '#BigData', '#AI']
2: ['#PySpark']
1: ['#MachineLearning', '#AI!', '#DataEngineering.', '#DataScience.', '#DeepLearning', '#NeuralNetworks.', '#Hadoop.', '#NaturalLanguageProcessing', '#AI.', '#DataVisualization.', '#MachineLearning!', '#DataEngineering', '#PySpark.', '#CloudComputing', '#BigData.']
