In [22]:
# Configure the environment variables PySpark consults when it starts up.
# They are applied in one call so the whole configuration reads as a unit.
import os

os.environ.update({
    # Location of the Spark installation.
    'SPARK_HOME': "/usr/local/spark",
    # Driver-side Python launcher and the option passed to it.
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    # Python executable name used for PySpark itself.
    'PYSPARK_PYTHON': 'python',
})

In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

In [24]:
# Build the SparkSession shared by every cell below; getOrCreate() hands back
# an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("DataFrame-Demo")
    .getOrCreate()
)

### Using RDDs

In [25]:
# Word count with the RDD API: read the file as lines, break each line into
# tokens, pair every token with 1, add the 1s up per token, and order the
# (token, total) pairs from most to least frequent.
#
# Note: splitting on a single space keeps empty strings (from consecutive
# spaces) and tab-prefixed tokens such as '\tat' as "words".
rdd = spark.sparkContext.textFile("file:///home/bms/data.txt")
result_rdd = (
    rdd.flatMap(lambda text: text.split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1], ascending=False)
)

In [26]:
# Fetch the first 10 (word, count) pairs of the count-descending RDD.
result_rdd.take(10)

[('', 353),
 ('\tat', 27),
 ('=', 12),
 ('in', 8),
 ('->', 8),
 ('File', 7),
 ('return', 5),
 ('"""', 4),
 ('partitionFunc)', 3),
 ('+\\', 3)]

### Using DataFrames

In [27]:
# Word count with the DataFrame API: each input line arrives in the `value`
# column, is split on single spaces and exploded into one row per token,
# then tokens are grouped, counted, and ordered by descending count.
#
# NOTE(review): an earlier revision read "./data/data.txt" instead of the
# absolute path below -- confirm which location is the intended one.
df = spark.read.text("file:///home/bms/data.txt")
result_df = (
    df.selectExpr("explode(split(value, ' ')) as word")
    .groupBy("word")
    .count()
    .orderBy(desc("count"))
)

In [28]:
# Fetch the first 10 Row(word, count) entries of the count-descending DataFrame.
# NOTE(review): words with equal counts appear in a different order here than
# in the RDD result above (see the count=3 entries) -- tie order does not look
# guaranteed; confirm if exact agreement between the two outputs matters.
result_df.take(10)

[Row(word='', count=353),
 Row(word='\tat', count=27),
 Row(word='=', count=12),
 Row(word='in', count=8),
 Row(word='->', count=8),
 Row(word='File', count=7),
 Row(word='return', count=5),
 Row(word='"""', count=4),
 Row(word='func,', count=3),
 Row(word='partitionFunc)', count=3)]

In [29]:
# Shut down the SparkSession now that both word-count examples have run.
spark.stop()