# Instalação do PySpark no Colab

In [1]:
!pip install pyspark


Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.8/310.8 MB 3.6 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285387 sha256=7ba3e094695094b140916587eaacd1c5e236be8b5d4037249df489f339095269
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [2]:
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

In [4]:
# Obtain the SparkContext (reusing an active one if present, otherwise creating it).
# `sc` is the handle every RDD example in this notebook uses via sc.parallelize(...).
sc = SparkContext.getOrCreate()

In [5]:
# Obtain the SparkSession, requesting 'PySpark DataFrame' as the application name.
# NOTE(review): a SparkContext was already created in the previous cell, so this
# builder presumably reuses it and the app name may not be applied to that
# running context — confirm; creating the session first would avoid the doubt.
spark = SparkSession.builder.appName('PySpark DataFrame').getOrCreate()

# Transformations

## map()

In [6]:
# Sample values used as the data set for this example.
data = [1, 2, 3, 4, 5]
# Turn the local list into an RDD.
myRDD = sc.parallelize(data)
# map() produces a new RDD in which every element of the first RDD is doubled.
newRDD = myRDD.map(lambda value: value * 2)
# collect() returns the RDD's elements as a Python list so they can be printed.
print(newRDD.collect())

[2, 4, 6, 8, 10]


## flatMap()

In [8]:
data = [1, 2, 3]
myRDD = sc.parallelize(data)

# For each x, generate the sequence 1..x. The upper bound of range() is
# exclusive, hence x + 1 (range(1, x) would yield an empty sequence for x == 1).

# map() keeps exactly one output element per input element, so each generated
# sequence stays nested. list(...) materializes the range so the collected
# result is a list of lists: [[1], [1, 2], [1, 2, 3]]
mapRDD = myRDD.map(lambda x: list(range(1, x + 1)))

# flatMap() flattens the generated sequences into a single RDD of values:
# [1, 1, 2, 1, 2, 3]
flatMapRDD = myRDD.flatMap(lambda x: range(1, x + 1))

# Transformations alone display nothing; collect both results to compare them.
print(mapRDD.collect())
print(flatMapRDD.collect())

## filter()

In [7]:
data = [1, 2, 3, 4, 5, 6]
myRDD = sc.parallelize(data)
# filter() keeps only the elements for which the predicate holds: here, the
# values divisible by 2 (remainder zero, i.e. the even numbers).
newRDD = myRDD.filter(lambda number: number % 2 == 0)
print(newRDD.collect())

[2, 4, 6]


## distinct()

In [10]:
data = [1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6]
myRDD = sc.parallelize(data)
# distinct() keeps a single copy of each value present in the RDD.
# Open question from the author: why do the even values come out before the
# odd ones?
# NOTE(review): presumably distinct() redistributes values across partitions
# by hash, so the collected order reflects partition placement rather than
# input order — confirm against the Spark docs; do not rely on this ordering.
newRDD = myRDD.distinct()
print(newRDD.collect())
print(newRDD.count())

[2, 4, 6, 1, 3, 5]
6


## groupByKey()

In [11]:

myRDD = sc.parallelize([('a', 1), ('a', 2), ('a', 3), ('b', 1)])
# groupByKey() gathers the values that share a key; mapValues(list) then turns
# each group into a plain list so the result can be displayed.
resultList = (
    myRDD
    .groupByKey()
    .mapValues(list)
)
resultList.collect()

[('b', [1]), ('a', [1, 2, 3])]

## reduceByKey()

In [12]:
from operator import add

myRDD = sc.parallelize([('a', 1), ('a', 2), ('a', 3), ('b', 1)])
# reduceByKey() folds all values of a key into one value using `add`:
# the three 'a' entries collapse into a single pair holding their sum.
newRDD = myRDD.reduceByKey(add)
newRDD.collect()

[('b', 1), ('a', 6)]

## sortByKey()

In [13]:
myRDD = sc.parallelize([('c', 1), ('d', 2), ('a', 3), ('b', 4)])
# sortByKey() orders the (key, value) pairs by their key.
newRDD = myRDD.sortByKey()
newRDD.collect()

[('a', 3), ('b', 4), ('c', 1), ('d', 2)]

## union()

In [14]:

myRDD1 = sc.parallelize([1, 2, 3, 4])
myRDD2 = sc.parallelize([3, 4, 5, 6, 7])
# union() concatenates the elements of both RDDs into one RDD; values present
# in both inputs (3 and 4 here) appear twice in the result.
newRDD = myRDD1.union(myRDD2)
newRDD.collect()

[1, 2, 3, 4, 3, 4, 5, 6, 7]

# Actions

## count()

In [15]:
data = ['Scala', 'Python', 'Java', 'R']
myRDD = sc.parallelize(data)
# count() returns the number of elements in the RDD.
myRDD.count()

4

## reduce()

In [16]:

data = [1, 2, 3, 4, 5]
myRDD = sc.parallelize(data)
# reduce() combines the elements pairwise; multiplying them yields the product
# of all the elements.
myRDD.reduce(lambda left, right: left * right)

120

In [17]:
data = ['Scala', 'Python', 'Java', 'R']
myRDD = sc.parallelize(data)
# reduce() with string concatenation joins all the elements into one string.
myRDD.reduce(lambda left, right: left + right)

'ScalaPythonJavaR'

## foreach()

In [18]:
def fun(x):
    """Print a single RDD element."""
    print(x)


data = ['Scala', 'Python', 'Java', 'R']
myRDD = sc.parallelize(data)
# foreach() applies `fun` to every element, purely for its side effect.
# NOTE(review): no output was recorded for this cell; presumably the prints
# happen in the Spark worker processes rather than in the notebook — confirm.
myRDD.foreach(fun)

## countByValue()

In [19]:
data = ['Python', 'Scala', 'Python', 'R', 'Python', 'Java', 'R']
myRDD = sc.parallelize(data)
# countByValue() counts how many times each distinct value occurs and returns
# the counts as a dict-like mapping; .items() exposes its (value, count) pairs.
value_counts = myRDD.countByValue()
value_counts.items()

dict_items([('Python', 3), ('Scala', 1), ('R', 2), ('Java', 1)])

## countByKey()

In [20]:
data = [('a', 1), ('b', 1), ('c', 1), ('a', 1)]
myRDD = sc.parallelize(data)
# countByKey() counts how many pairs exist for each key and returns the counts
# as a dict-like mapping; .items() exposes its (key, count) pairs.
key_counts = myRDD.countByKey()
key_counts.items()

dict_items([('a', 2), ('b', 1), ('c', 1)])

## take(n)

In [21]:
data = [2, 5, 3, 8, 4]
myRDD = sc.parallelize(data)
# take(3) returns the first 3 elements of the RDD; it caps the result the same
# way SELECT TOP / LIMIT does in SQL.
myRDD.take(3)

[2, 5, 3]

## top(n)

In [22]:
data = [2, 5, 3, 8, 4]
myRDD = sc.parallelize(data)
# top(3) returns the 3 largest elements of the RDD, largest first
# (unlike take(3), which returns the first 3 in RDD order).
myRDD.top(3)

[8, 5, 4]