In [28]:
import findspark
findspark.init()

In [29]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Lab 03 - Map Reducer") \
    .master("spark://f5db43ce3d38:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()

# Create SparkContext
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [30]:
posts = [
    "Excited to start learning #MachineLearning and #AI! #DataScience",
    "Just finished a great book on #BigData and #DataEngineering. #AI",
    "Attending a workshop on #PySpark and #DataScience. #BigData",
    "Exploring the world of #DeepLearning and #NeuralNetworks. #AI",
    "Working on a project using #PySpark and #Hadoop. #BigData",
    "Reading about #NaturalLanguageProcessing and #AI. #DataScience",
    "Just completed a course on #DataVisualization. #DataScience",
    "Excited about the future of #AI and #MachineLearning! #BigData",
    "Learning #DataEngineering with #PySpark. #DataScience",
    "Exploring #CloudComputing and #BigData. #AI"
]

posts_rdd = sc.parallelize(posts)

In [31]:
def extract_hashtags(posts):
    return [htag for htag in posts.split() if htag.startswith('#')]

hashtags_rdd = posts_rdd.flatMap(extract_hashtags)
hashtags_rdd.collect()

['#MachineLearning',
 '#AI!',
 '#DataScience',
 '#BigData',
 '#DataEngineering.',
 '#AI',
 '#PySpark',
 '#DataScience.',
 '#BigData',
 '#DeepLearning',
 '#NeuralNetworks.',
 '#AI',
 '#PySpark',
 '#Hadoop.',
 '#BigData',
 '#NaturalLanguageProcessing',
 '#AI.',
 '#DataScience',
 '#DataVisualization.',
 '#DataScience',
 '#AI',
 '#MachineLearning!',
 '#BigData',
 '#DataEngineering',
 '#PySpark.',
 '#DataScience',
 '#CloudComputing',
 '#BigData.',
 '#AI']

In [32]:
mapped_htags_rdd = hashtags_rdd.map(lambda x: {x,1})
mapped_htags_rdd.collect()

[{'#MachineLearning', 1},
 {'#AI!', 1},
 {'#DataScience', 1},
 {'#BigData', 1},
 {'#DataEngineering.', 1},
 {'#AI', 1},
 {'#PySpark', 1},
 {'#DataScience.', 1},
 {'#BigData', 1},
 {'#DeepLearning', 1},
 {'#NeuralNetworks.', 1},
 {'#AI', 1},
 {'#PySpark', 1},
 {'#Hadoop.', 1},
 {'#BigData', 1},
 {'#NaturalLanguageProcessing', 1},
 {'#AI.', 1},
 {'#DataScience', 1},
 {'#DataVisualization.', 1},
 {'#DataScience', 1},
 {'#AI', 1},
 {'#MachineLearning!', 1},
 {'#BigData', 1},
 {'#DataEngineering', 1},
 {'#PySpark.', 1},
 {'#DataScience', 1},
 {'#CloudComputing', 1},
 {'#BigData.', 1},
 {'#AI', 1}]

In [33]:
htag_count = hashtags_rdd.countByValue()
for htag, count in htag_count.items():
    print(f"{htag}: {count}")

#MachineLearning: 1
#AI!: 1
#DataScience: 4
#BigData: 4
#DataEngineering.: 1
#AI: 4
#PySpark: 2
#DataScience.: 1
#DeepLearning: 1
#NeuralNetworks.: 1
#Hadoop.: 1
#NaturalLanguageProcessing: 1
#AI.: 1
#DataVisualization.: 1
#MachineLearning!: 1
#DataEngineering: 1
#PySpark.: 1
#CloudComputing: 1
#BigData.: 1


In [None]:
# Stop the SparkContext
sc.stop()