In [1]:
from pyspark.sql import SparkSession

# Build the Spark session used by every cell below; getOrCreate() hands back
# an already-running session when there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("Aula 2.4")
    .getOrCreate()
)

# Last expression of the cell: shows the Spark version in the output.
spark.version

'3.2.1'

In [2]:
# Read README.md (path relative to the working directory) as an RDD of text lines.
linesRdd = spark.sparkContext.textFile("README.md")

# count() is an action: it returns the number of elements (lines) in the RDD.
linesRdd.count()

125

In [3]:
# map: one output element per input element -- here each line of the file
# becomes a (line, number_of_characters) tuple.
mapRdd = linesRdd.map(lambda text: (text, len(text)))

# Bring every pair back to the driver so the notebook can display them.
mapRdd.collect()


[('# Apache Spark', 14),
 ('', 0),
 ('Spark is a unified analytics engine for large-scale data processing. It provides',
  80),
 ('high-level APIs in Scala, Java, Python, and R, and an optimized engine that',
  75),
 ('supports general computation graphs for data analysis. It also supports a',
  73),
 ('rich set of higher-level tools including Spark SQL for SQL and DataFrames,',
  74),
 ('pandas API on Spark for pandas workloads, MLlib for machine learning, GraphX for graph processing,',
  98),
 ('and Structured Streaming for stream processing.', 47),
 ('', 0),
 ('<https://spark.apache.org/>', 27),
 ('', 0),
 ('[![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_main.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_main.yml)',
  167),
 ('[![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark)',
  189),


In [4]:
# flatMap: split every line on whitespace and flatten the pieces into a single
# RDD of words (each input line may yield zero, one or many output elements).
# The result gets its own name so the (line, length) pairs held by mapRdd are
# not overwritten with data of a different shape.
flatMapRdd = linesRdd.flatMap(lambda line: line.split())

# Bring every word back to the driver so the notebook can display them.
flatMapRdd.collect()


['#',
 'Apache',
 'Spark',
 'Spark',
 'is',
 'a',
 'unified',
 'analytics',
 'engine',
 'for',
 'large-scale',
 'data',
 'processing.',
 'It',
 'provides',
 'high-level',
 'APIs',
 'in',
 'Scala,',
 'Java,',
 'Python,',
 'and',
 'R,',
 'and',
 'an',
 'optimized',
 'engine',
 'that',
 'supports',
 'general',
 'computation',
 'graphs',
 'for',
 'data',
 'analysis.',
 'It',
 'also',
 'supports',
 'a',
 'rich',
 'set',
 'of',
 'higher-level',
 'tools',
 'including',
 'Spark',
 'SQL',
 'for',
 'SQL',
 'and',
 'DataFrames,',
 'pandas',
 'API',
 'on',
 'Spark',
 'for',
 'pandas',
 'workloads,',
 'MLlib',
 'for',
 'machine',
 'learning,',
 'GraphX',
 'for',
 'graph',
 'processing,',
 'and',
 'Structured',
 'Streaming',
 'for',
 'stream',
 'processing.',
 '<https://spark.apache.org/>',
 '[![GitHub',
 'Actions',
 'Build](https://github.com/apache/spark/actions/workflows/build_main.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_main.yml)',
 '[![AppVeyor',
 'Build](https:/

In [5]:
# filter: break the lines into words with flatMap, then keep only the words
# that start with a lowercase "a" (startswith is case-sensitive).
filterRdd = (
    linesRdd.flatMap(lambda text: text.split())
            .filter(lambda token: token.startswith("a"))
)
filterRdd.collect()


['a',
 'analytics',
 'and',
 'and',
 'an',
 'analysis.',
 'also',
 'a',
 'and',
 'and',
 'a',
 'and',
 'a',
 'available',
 'at',
 'an',
 'also',
 'also',
 'a',
 'a',
 'and',
 'also',
 'an',
 'abbreviated',
 'are',
 'a',
 'also',
 'a',
 'and',
 'against',
 'at',
 'and',
 'a',
 'and',
 'an']

In [6]:
# reduceByKey: count how many times each word occurs in the list.
lista = ["um", "um", "dois", "tres"]
rdd = spark.sparkContext.parallelize(lista)

# Emit a (word, 1) pair per element, then add up the ones that share a key.
rdd2 = (
    rdd.map(lambda word: (word, 1))
       .reduceByKey(lambda total, extra: total + extra)
)

rdd2.collect()


[('tres', 1), ('um', 2), ('dois', 1)]

In [7]:
# sortByKey: word count as before, with the resulting pairs ordered by key (the word).
lista = ["um", "um", "dois", "tres"]

rdd = spark.sparkContext.parallelize(lista)

# `ascending` is a boolean flag. The previous call passed the string "asc",
# which only sorted ascending because any non-empty string is truthy -- the
# string "desc" would have sorted ascending as well. Use ascending=False for
# descending order.
rdd2 = rdd.map(lambda w: (w, 1)) \
          .reduceByKey(lambda a, b: a + b) \
          .sortByKey(ascending=True)

rdd2.collect()

[('dois', 1), ('tres', 1), ('um', 2)]

In [8]:
# sortBy: word count as before, with the pairs ordered by a computed sort key.
lista = ["um", "um", "dois", "tres"]
rdd = spark.sparkContext.parallelize(lista)

# Each element is a (word, count) tuple; index 1 selects the count as the sort key.
rdd2 = (
    rdd.map(lambda word: (word, 1))
       .reduceByKey(lambda total, extra: total + extra)
       .sortBy(lambda pair: pair[1])
)

rdd2.collect()

[('tres', 1), ('dois', 1), ('um', 2)]

In [9]:
# union: combine the elements of two RDDs into one; repeated values are kept.
lista1 = ["um", "um", "dois", "tres"]
rdd1 = spark.sparkContext.parallelize(lista1)

lista2 = ["quatro", "cinco"]
rdd2 = spark.sparkContext.parallelize(lista2)

rddUnion = rdd1.union(rdd2)
rddUnion.collect()

['um', 'um', 'dois', 'tres', 'quatro', 'cinco']

In [10]:
# intersection: keep only the values present in both RDDs; a shared value
# appears once in the result even when it is repeated in an input.

lista1 = ["um", "um", "dois", "tres"]
lista2 = ["um", "quatro", "cinco"]

rdd1 = spark.sparkContext.parallelize(lista1)
rdd2 = spark.sparkContext.parallelize(lista2)

# Named for what it holds: storing the intersection in a variable called
# rddUnion was misleading and replaced the union built by the union cell.
rddIntersection = rdd1.intersection(rdd2)
rddIntersection.collect()


['um']

In [11]:
# distinct: keep a single copy of every repeated element.
lista1 = ["um", "um", "dois", "tres"]
rdd1 = spark.sparkContext.parallelize(lista1)

rddDistinct = rdd1.distinct()
rddDistinct.collect()

['tres', 'um', 'dois']

In [12]:
# join: match two pair RDDs on their key (the person's name).
# Left side: (name, age).
lista1 = [("Pedro", 39), ("Maria", 30)]
rdd1 = spark.sparkContext.parallelize(lista1)

# Right side: (name, city). "João" has no entry on the left side, so the
# join result does not include him.
lista2 = [("Pedro", "BH"), ("Maria", "SP"), ("João", "RJ")]
rdd2 = spark.sparkContext.parallelize(lista2)

# Every matched key yields (name, (age, city)).
rddJoin = rdd1.join(rdd2)

rddJoin.collect()

[('Maria', (30, 'SP')), ('Pedro', (39, 'BH'))]

In [13]:
# Shall we talk about actions?

# Same join as in the previous cell, but this time no action is called on it.
lista1 = [("Pedro", 39), ("Maria", 30)]
rdd1 = spark.sparkContext.parallelize(lista1)

lista2 = [("Pedro", "BH"), ("Maria", "SP"), ("João", "RJ")]
rdd2 = spark.sparkContext.parallelize(lista2)

rddJoin = rdd1.join(rdd2)

# Evaluating the RDD itself only shows its repr, not the joined data;
# an action such as collect() is needed to see the elements.
rddJoin

PythonRDD[74] at RDD at PythonRDD.scala:53

In [14]:
# collect: action that returns every element of the joined RDD to the driver as a list.
rddJoin.collect()

[('Maria', (30, 'SP')), ('Pedro', (39, 'BH'))]

In [15]:
# count: action that returns the number of elements in the joined RDD.
rddJoin.count()

2

In [16]:
# take: action that returns only the first n elements (here n = 1) as a list.
rddJoin.take(1)

[('Maria', (30, 'SP'))]

In [17]:
# top: action that returns the n largest elements, in descending order.
# The elements here are strings, so they are compared alphabetically.

lista1 = ["um", "um", "dois", "tres"]
lista2 = ["um", "quatro", "cinco"]

rdd1 = spark.sparkContext.parallelize(lista1)
rdd2 = spark.sparkContext.parallelize(lista2)

rddUnion = rdd1.union(rdd2)

# The intermediate rddUnion.collect() was dropped: its result was discarded
# (a notebook cell only displays its last expression), so it just ran an
# extra Spark job.
rddUnion.top(3)


['um', 'um', 'um']

In [3]:
# countByValue: action that returns, as a dictionary on the driver, how many
# times each distinct value occurs in the RDD.

lista1 = ["um", "um", "dois", "tres"]
lista2 = ["um", "quatro", "cinco"]

rdd1 = spark.sparkContext.parallelize(lista1)
rdd2 = spark.sparkContext.parallelize(lista2)

rddUnion = rdd1.union(rdd2)

# The intermediate rddUnion.collect() was dropped: its result was discarded
# (a notebook cell only displays its last expression), so it just ran an
# extra Spark job.
rddUnion.countByValue()

defaultdict(int, {'um': 3, 'dois': 1, 'tres': 1, 'quatro': 1, 'cinco': 1})

In [2]:
# saveAsTextFile: action that writes the elements of rddUnion out as text under 'saida3.txt'.
# Depends on rddUnion from the countByValue cell above. The recorded NameError and the
# out-of-order execution count show this cell was last run on a kernel where that cell
# had not been executed -- re-run the notebook top to bottom before relying on it.
# NOTE(review): Spark normally creates a directory of part files at this path and refuses
# to overwrite an existing one -- confirm, and clear old output before re-running.
rddUnion.saveAsTextFile('saida3.txt')

NameError: name 'rddUnion' is not defined