In [7]:
from pyspark.sql import SparkSession

# Get the active Spark session, or start a new one named "ExAula".
spark = (
    SparkSession.builder
    .appName("ExAula")
    .getOrCreate()
)
# SparkContext of that session; the RDD examples below are built from it.
sc = spark.sparkContext

In [8]:
# Display the SparkContext as this cell's output.
sc

In [9]:
# Distribute a small list of numeric values as an RDD.
hdd = sc.parallelize(
    [200, 75, 120, 350, 500]
)

In [10]:
# Peek at the first two elements of the RDD.
hdd.take(2)

[200, 75]

In [11]:
# Apply a 10% discount to every value.
(
    hdd
    .map(lambda valor: valor * 0.9)  # keep 90% of each value
    .take(100)  # returns at most 100 elements
)

[180.0, 67.5, 108.0, 315.0, 450.0]

In [12]:
def aplicar_desconto(valor, desconto=0.1):
    """Return ``valor`` reduced by a fractional discount.

    Args:
        valor: Numeric amount to discount.
        desconto: Discount expressed as a fraction of ``valor``. Defaults
            to 0.1 (10%), which reproduces the previously hard-coded
            ``valor * 0.9``.

    Returns:
        ``valor * (1 - desconto)``.
    """
    return valor * (1 - desconto)

# Apply the 10% discount to every value, this time with the named function.
(
    hdd
    .map(aplicar_desconto)
    .take(100)
)

[180.0, 67.5, 108.0, 315.0, 450.0]

In [13]:
# Compute the total of the discounted values.
(
    hdd
    .map(lambda valor: valor * 0.9)
    .reduce(lambda esquerda, direita: esquerda + direita)
)

1120.5

In [14]:
# Short phrases used as sample text data, distributed as an RDD.
rdd_w = sc.parallelize(
    ["bom dia", "boa tarde", "boa noite", "ola", "oi"]
)

In [15]:
# Peek at the first three phrases.
rdd_w.take(3)

['bom dia', 'boa tarde', 'boa noite']

In [16]:
# Keep only the phrases that start with "b".
# str.startswith is used instead of x[0] so that an empty string is simply
# filtered out rather than raising IndexError inside the Spark task.
rdd_w.filter(lambda frase: frase.startswith("b")).take(100)

['bom dia', 'boa tarde', 'boa noite']

In [17]:
def comeca_com_b(palavra):
    """Return True if ``palavra`` starts with a lowercase "b".

    Uses ``str.startswith`` rather than indexing ``palavra[0]`` so that an
    empty string yields False instead of raising IndexError.
    """
    return palavra.startswith("b")

# Apply the filter using the named predicate.
(
    rdd_w
    .filter(comeca_com_b)
    .take(100)
)

['bom dia', 'boa tarde', 'boa noite']

In [None]:
# Show up to ten phrases from the RDD.
rdd_w.take(10)

In [18]:
# Sample phrases RDD, re-created in this cell.
rdd_w = sc.parallelize(
    ["bom dia", "boa tarde", "boa noite", "ola", "oi"]
)

# Split each phrase on whitespace; map yields one list of words per phrase.
(
    rdd_w
    .map(lambda frase: frase.split())
    .take(100)
)

[['bom', 'dia'], ['boa', 'tarde'], ['boa', 'noite'], ['ola'], ['oi']]

In [19]:
# map vs flatMap: flatMap flattens the per-phrase word lists into a single
# sequence of words.
(
    rdd_w
    .flatMap(lambda frase: frase.split())
    .take(100)
)

['bom', 'dia', 'boa', 'tarde', 'boa', 'noite', 'ola', 'oi']

In [20]:
# Pair every word with its first letter as key and a one-element list as value.
# split() with no arguments never yields empty strings, so palavra[0] is safe.
(
    rdd_w
    .flatMap(lambda frase: frase.split())
    .map(lambda palavra: (palavra[0], [palavra]))
    .take(100)
)

[('b', ['bom']),
 ('d', ['dia']),
 ('b', ['boa']),
 ('t', ['tarde']),
 ('b', ['boa']),
 ('n', ['noite']),
 ('o', ['ola']),
 ('o', ['oi'])]

In [21]:
# Example
# Key each word by its first letter, then merge the word lists per key with reduceByKey
rdd_w.flatMap(lambda x: x.split()).map(lambda x: (x[0], [x])).reduceByKey(lambda x, y: x + y).take(100)

[('o', ['ola', 'oi']),
 ('n', ['noite']),
 ('b', ['bom', 'boa', 'boa']),
 ('d', ['dia']),
 ('t', ['tarde'])]

In [22]:
# Group the words by their first letter. Wrapping the chain in parentheses
# lets it span several lines without backslash continuations.
(
    rdd_w
    .flatMap(lambda frase: frase.split())
    .map(lambda palavra: (palavra[0], [palavra]))
    .reduceByKey(lambda esquerda, direita: esquerda + direita)
    .take(100)
)

[('o', ['ola', 'oi']),
 ('n', ['noite']),
 ('b', ['bom', 'boa', 'boa']),
 ('d', ['dia']),
 ('t', ['tarde'])]