In [1]:
# findspark must be initialised before pyspark is imported in the next cell:
# init() locates the local Spark installation and makes pyspark importable.
import findspark
findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

# Only one SparkContext may be active per JVM. Constructing it directly makes
# this cell fail with "Cannot run multiple SparkContexts at once" when it is
# re-run in a live kernel; getOrCreate returns the existing context instead,
# and creates a new one ("local" master, app name "WordCount") otherwise.
conf = SparkConf().setMaster("local").setAppName("WordCount")
sc = SparkContext.getOrCreate(conf)

22/03/01 22:47:03 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.80.128 instead (on interface ens33)
22/03/01 22:47:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/03/01 22:47:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# textFile is lazy: it only records where the data lives. Nothing is read
# from HDFS until an action (count, collect, ...) is run on the RDD.
wordsPath = "hdfs://localhost:9000/words.txt"
fileRdd = sc.textFile(wordsPath)


In [5]:
# count is an action: it triggers a job. The executors read the file from
# HDFS, load its lines into partitions, and the number of lines is returned.
fileRdd.count()

6

In [8]:
# collect is an action too: it creates another job, reads the data from HDFS
# again and returns every line as a Python list.
fileRdd.collect()

['   spark kafka  ',
 ' kafka   spark pyspark ',
 '                ',
 'spark',
 '',
 'APACHE Kafka APache SParK ']

In [6]:
# RDD lineage, first step: normalise every line.
# map is a lazy transformation, so defining it does not create a job.
def normalizeLine(line):
    """Return the line with surrounding whitespace removed, in lower case."""
    return line.strip().lower()


lowerCaseRdd = fileRdd.map(normalizeLine)


In [7]:
# collect is an action: it creates a job that reads the file, applies the
# strip/lower map to each line and returns the resulting lines.
lowerCaseRdd.collect()

['spark kafka',
 'kafka   spark pyspark',
 '',
 'spark',
 '',
 'apache kafka apache spark']

In [8]:
# Split every line on a single space character, giving one list of tokens per
# line. Runs of spaces and blank lines produce empty-string tokens, which are
# filtered out in a later step.
wordListRdd = lowerCaseRdd.map(lambda text: text.split(" "))

In [9]:
# Each record is now a list of tokens; note the '' entries that come from
# repeated spaces and from blank lines.
wordListRdd.collect()

[['spark', 'kafka'],
 ['kafka', '', '', 'spark', 'pyspark'],
 [''],
 ['spark'],
 [''],
 ['apache', 'kafka', 'apache', 'spark']]

In [10]:
# flatMap unwraps the per-line lists: every element of each list becomes a
# record of its own in the resulting RDD.
wordRdd = wordListRdd.flatMap(lambda wordList: wordList)


In [11]:
# One token per record now; the empty strings are still present at this point.
wordRdd.collect()

['spark',
 'kafka',
 'kafka',
 '',
 '',
 'spark',
 'pyspark',
 '',
 'spark',
 '',
 'apache',
 'kafka',
 'apache',
 'spark']

In [12]:
# Drop the empty-string tokens. wordRdd is rebound to the filtered RDD, so
# every later cell that reads wordRdd sees only real words.
def isWord(token):
    """Return True for any token other than the empty string."""
    return token != ""


wordRdd = wordRdd.filter(isWord)


In [13]:
# Same tokens as before, with the empty strings removed.
wordRdd.collect()

['spark',
 'kafka',
 'kafka',
 'spark',
 'pyspark',
 'spark',
 'apache',
 'kafka',
 'apache',
 'spark']

In [14]:
# Turn every word into a (key, value) pair such as ('spark', 1); reduceByKey
# needs this shape so the 1s can be summed per word.
pairRdd = wordRdd.map(lambda token: (token, 1))


In [22]:
# Show the lineage of wordRdd (the chain of RDDs it is derived from).
# The value comes back as bytes, hence the b'...' with literal \n below;
# print(wordRdd.toDebugString().decode()) would render it line by line.
wordRdd.toDebugString()

b'(1) PythonRDD[6] at collect at /tmp/ipykernel_95905/3746809788.py:1 []\n |  hdfs://localhost:9000/words.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []\n |  hdfs://localhost:9000/words.txt HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []'

In [15]:
# Every word is paired with the count 1; duplicates are not merged yet.
pairRdd.collect()

[('spark', 1),
 ('kafka', 1),
 ('kafka', 1),
 ('spark', 1),
 ('pyspark', 1),
 ('spark', 1),
 ('apache', 1),
 ('kafka', 1),
 ('apache', 1),
 ('spark', 1)]

In [16]:
# Word count: reduceByKey adds up the values of all pairs that share a key.
# It is a transformation, so nothing runs until an action is called.
def addCounts(runningTotal, count):
    """Combine two counts belonging to the same word."""
    return runningTotal + count


wordCountRdd = pairRdd.reduceByKey(addCounts)


In [17]:
# Action: runs the whole lineage and returns one (word, total) pair per word.
wordCountRdd.collect()

[('spark', 4), ('kafka', 3), ('pyspark', 1), ('apache', 2)]

In [18]:
# Number of partitions backing the result RDD. saveAsTextFile writes one
# part-NNNNN file per partition, so this is also the number of output files.
wordCountRdd.getNumPartitions()

1

In [21]:
# Write the result as text files into HDFS.
# saveAsTextFile is an ACTION method.
# word-count-results3 is a folder; inside it we shall see the partition files.
# NOTE: the save fails if the target folder already exists, so pick a new
# folder name (or delete the old folder) before re-running this cell.

wordCountRdd.saveAsTextFile ("hdfs://localhost:9000/word-count-results3")

In [19]:
# hdfs dfs -ls /word-count-results3
#  _SUCCESS, 0 bytes: marker file stating that the last operation was stored successfully
# part-00000: partition file
# note the partition file name (part-00000 or another name) before running cat
# hdfs dfs -cat /word-count-results3/part-00000

# or use the HDFS web UI: http://localhost:50070/

In [20]:
# Redistribute the result over two partitions before saving, so the output
# folder contains two part files instead of one.
(
    wordCountRdd
    .repartition(2)
    .saveAsTextFile("hdfs://localhost:9000/word-count-results2")
)

In [None]:
# hdfs dfs -ls /word-count-results2
# hdfs dfs -cat /word-count-results2/part-00000
# hdfs dfs -cat /word-count-results2/part-00001