# Introdução ao PySpark

### importação de biblioteca

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

### Criação/Inicio de sessão

inicialmente cria-se a sessão spark como mostrado na proxima parte, abaixo segue a explicação sobre algumas variaveis da construção.
- SparkSession.builder é o construtor da sessão spark
- master é o cluster
- appName é o nome da sessão spark
- getOrCreate se vicê tiver criado uma sessão anterior de mesmo nome, essa função vai resgata-la, caso não, ela criará uma nova

In [2]:
spark = (
    SparkSession.builder
    .master('local')
    .appName('aula01')
    .getOrCreate()
)

### Criação de dataframe

a leitura de dataframe é parecida com a do pandas


In [3]:
df = spark.read.csv('../datasets/netflix_titles.csv', sep=",", header=True, inferSchema=True)

para exibir no pandas se usa o .head(), no spark se usa o .show()

In [4]:
df.show()

+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson|                NULL|       United States|September 25, 2021|        2020| PG-13|   90 min|       Documentaries|As her father nea...|
|     s2|TV Show|       Blood & Water|                NULL|Ama Qamata, Khosi...|        South Africa|September 24, 2021|        2021| TV-MA|2 Seasons|International TV ...|After crossing pa...|
|     s3|TV Show|           Ganglan

In [5]:
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



### verificar dados nulos

In [6]:
for column in df.columns:
    print(column, df.filter(df[column].isNull()).count())

show_id 0
type 1
title 2
director 2636
cast 826
country 832
date_added 13
release_year 2
rating 6
duration 5
listed_in 3
description 3


In [7]:
df = df.dropna()

In [8]:
for column in df.columns:
    print(column, df.filter(df[column].isNull()).count())

show_id 0
type 0
title 0
director 0
cast 0
country 0
date_added 0
release_year 0
rating 0
duration 0
listed_in 0
description 0


In [9]:
df.show(5)

+-------+-------+--------------------+-------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|           director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+-------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s8|  Movie|             Sankofa|       Haile Gerima|Kofi Ghanaba, Oya...|United States, Gh...|September 24, 2021|        1993| TV-MA|  125 min|Dramas, Independe...|On a photo shoot ...|
|     s9|TV Show|The Great British...|    Andy Devonshire|Mel Giedroyc, Sue...|      United Kingdom|September 24, 2021|        2021| TV-14|9 Seasons|British TV Shows,...|A talented batch ...|
|    s10|  Movie|        The Starling|  

### Seleção de colunas

In [10]:
#forma mais rapida
df.select('title', 'director', 'rating').show(10)

+--------------------+-------------------+------+
|               title|           director|rating|
+--------------------+-------------------+------+
|             Sankofa|       Haile Gerima| TV-MA|
|The Great British...|    Andy Devonshire| TV-14|
|        The Starling|     Theodore Melfi| PG-13|
|        Je Suis Karl|Christian Schwochow| TV-MA|
|               Jeans|         S. Shankar| TV-14|
|           Grown Ups|       Dennis Dugan| PG-13|
|          Dark Skies|      Scott Stewart| PG-13|
|            Paranoia|     Robert Luketic| PG-13|
| Birth of the Dragon|       George Nolfi| PG-13|
|                Jaws|   Steven Spielberg|    PG|
+--------------------+-------------------+------+
only showing top 10 rows



In [11]:
df.select(col('title'), col('release_year')).show(5)

+--------------------+------------+
|               title|release_year|
+--------------------+------------+
|             Sankofa|        1993|
|The Great British...|        2021|
|        The Starling|        2021|
|        Je Suis Karl|        2021|
|               Jeans|        1998|
+--------------------+------------+
only showing top 5 rows



In [12]:
df.select(df['title']).show(5)

+--------------------+
|               title|
+--------------------+
|             Sankofa|
|The Great British...|
|        The Starling|
|        Je Suis Karl|
|               Jeans|
+--------------------+
only showing top 5 rows



### Seleção com alias

In [13]:
df.select(col('rating').alias('censura')).show(5)

+-------+
|censura|
+-------+
|  TV-MA|
|  TV-14|
|  PG-13|
|  TV-MA|
|  TV-14|
+-------+
only showing top 5 rows



### Organizar seleção

In [14]:
#pode chamar na ordem que quiser
df.select('release_year','director','title','rating','type').show(5)

+------------+-------------------+--------------------+------+-------+
|release_year|           director|               title|rating|   type|
+------------+-------------------+--------------------+------+-------+
|        1993|       Haile Gerima|             Sankofa| TV-MA|  Movie|
|        2021|    Andy Devonshire|The Great British...| TV-14|TV Show|
|        2021|     Theodore Melfi|        The Starling| PG-13|  Movie|
|        2021|Christian Schwochow|        Je Suis Karl| TV-MA|  Movie|
|        1998|         S. Shankar|               Jeans| TV-14|  Movie|
+------------+-------------------+--------------------+------+-------+
only showing top 5 rows



In [15]:
df.show(5)

+-------+-------+--------------------+-------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|           director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+-------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s8|  Movie|             Sankofa|       Haile Gerima|Kofi Ghanaba, Oya...|United States, Gh...|September 24, 2021|        1993| TV-MA|  125 min|Dramas, Independe...|On a photo shoot ...|
|     s9|TV Show|The Great British...|    Andy Devonshire|Mel Giedroyc, Sue...|      United Kingdom|September 24, 2021|        2021| TV-14|9 Seasons|British TV Shows,...|A talented batch ...|
|    s10|  Movie|        The Starling|  

### Filtrar Dataframe

In [16]:
df.filter("type != 'TV Show'").show()

+-------+-----+--------------------+-------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+
|show_id| type|               title|           director|                cast|             country|        date_added|release_year|rating|duration|           listed_in|         description|
+-------+-----+--------------------+-------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+
|     s8|Movie|             Sankofa|       Haile Gerima|Kofi Ghanaba, Oya...|United States, Gh...|September 24, 2021|        1993| TV-MA| 125 min|Dramas, Independe...|On a photo shoot ...|
|    s10|Movie|        The Starling|     Theodore Melfi|Melissa McCarthy,...|       United States|September 24, 2021|        2021| PG-13| 104 min|    Comedies, Dramas|A woman adjusting...|
|    s13|Movie|        Je Suis Karl|Christian Schwochow

In [17]:
df.filter("country = 'Japan'").show()

+-------+-------+--------------------+--------------------+----------------------+-------+------------------+------------+------+--------+--------------------+--------------------+
|show_id|   type|               title|            director|                  cast|country|        date_added|release_year|rating|duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+----------------------+-------+------------------+------------+------+--------+--------------------+--------------------+
|    s52|  Movie|InuYasha the Movi...|   Toshiya Shinohara|  Kappei Yamaguchi,...|  Japan|September 15, 2021|        2002| TV-14|  99 min|Action & Adventur...|With their bigges...|
|    s53|  Movie|InuYasha the Movi...|   Toshiya Shinohara|  Kappei Yamaguchi,...|  Japan|September 15, 2021|        2003| TV-14|  99 min|Action & Adventur...|The Great Dog Dem...|
|    s54|  Movie|InuYasha the Movi...|   Toshiya Shinohara|  Kappei Yamaguchi,...|  Japan|Septe

In [18]:
df.filter(col('director') == 'Hideaki Anno').show()

+-------+-----+--------------------+------------+--------------------+-------+-------------+------------+------+--------+--------------------+--------------------+
|show_id| type|               title|    director|                cast|country|   date_added|release_year|rating|duration|           listed_in|         description|
+-------+-----+--------------------+------------+--------------------+-------+-------------+------------+------+--------+--------------------+--------------------+
|  s3721|Movie|EVANGELION: DEATH...|Hideaki Anno|Megumi Ogata, Kot...|  Japan|June 21, 2019|        1998| TV-MA|  69 min|Action & Adventur...|Fifteen years aft...|
+-------+-----+--------------------+------------+--------------------+-------+-------------+------------+------+--------+--------------------+--------------------+



In [19]:
df.filter(col('cast').contains('Sky Ferreira')).show()

+-------+-----+-----------------+--------------------+--------------------+--------------------+-----------------+------------+------+--------+--------------------+--------------------+
|show_id| type|            title|            director|                cast|             country|       date_added|release_year|rating|duration|           listed_in|         description|
+-------+-----+-----------------+--------------------+--------------------+--------------------+-----------------+------------+------+--------+--------------------+--------------------+
|  s3263|Movie|The Green Inferno|            Eli Roth|Ignacia Allamand,...|United States, Chile|November 16, 2019|        2013|     R| 101 min|Horror Movies, In...|Determined to sav...|
|  s8538|Movie|        The Trust|Ben Brewer, Alex ...|Nicolas Cage, Eli...|       United States|    April 1, 2019|        2015|     R|  91 min|           Thrillers|Discovering the l...|
+-------+-----+-----------------+--------------------+----------------

filtrar por parte do valor

In [20]:
df.filter(col('country').contains("United States")).show()

+-------+-----+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+
|show_id| type|               title|            director|                cast|             country|        date_added|release_year|rating|duration|           listed_in|         description|
+-------+-----+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+
|     s8|Movie|             Sankofa|        Haile Gerima|Kofi Ghanaba, Oya...|United States, Gh...|September 24, 2021|        1993| TV-MA| 125 min|Dramas, Independe...|On a photo shoot ...|
|    s10|Movie|        The Starling|      Theodore Melfi|Melissa McCarthy,...|       United States|September 24, 2021|        2021| PG-13| 104 min|    Comedies, Dramas|A woman adjusting...|
|    s28|Movie|           Grown Ups|        Dennis

### Filtragem em duas condições

In [21]:
df.filter((col('country') == 'Japan') & (col('type') == 'TV Show')).show()

+-------+-------+--------------------+--------------------+---------------------+-------+-----------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|            director|                 cast|country|       date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+--------------------+---------------------+-------+-----------------+------------+------+---------+--------------------+--------------------+
|   s804|TV Show|Ouran High School...|     Takuya Igarashi| Maaya Sakamoto, M...|  Japan|     June 2, 2021|        2006| TV-PG| 1 Season|Anime Series, Rom...|New student Haruh...|
|  s2603|TV Show|The Forest of Lov...|           Sion Sono| Kippei Shiina, Sh...|  Japan|   April 30, 2020|        2020| TV-MA| 1 Season|Crime TV Shows, I...|Nothing's as it s...|
|  s3138|TV Show|    Girls und Panzer|   Tsutomu Mizushima| Mai Fuchigami, Ai...|  Japan|December 15

In [22]:
df.filter('type = "Movie"').filter(col('cast').contains('Mary Elizabeth Winstead')).show()

+-------+-----+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+
|show_id| type|               title|            director|                cast|             country|        date_added|release_year|rating|duration|           listed_in|         description|
+-------+-----+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+
|    s82|Movie|                Kate|Cedric Nicolas-Tr...|Mary Elizabeth Wi...|       United States|September 10, 2021|        2021|     R| 106 min|  Action & Adventure|Slipped a fatal p...|
|  s4292|Movie|  Kill the Messenger|      Michael Cuesta|Jeremy Renner, Ro...|       United States| December 16, 2018|        2014|     R| 112 min|   Dramas, Thrillers|In the 1980s, a c...|
|  s6037|Movie|A Glimpse Inside ...|       Roman C

### Filtrar com duas condições (OR / |)

In [23]:
df.filter((col('country') == 'Germany') | (col('country') == 'Sweden')).show()

+-------+-----+--------------------+--------------------+--------------------+-------+-----------------+------------+------+--------+--------------------+--------------------+
|show_id| type|               title|            director|                cast|country|       date_added|release_year|rating|duration|           listed_in|         description|
+-------+-----+--------------------+--------------------+--------------------+-------+-----------------+------------+------+--------+--------------------+--------------------+
|   s706|Movie|      Biking Borders|            Max Jabs|Max Jabs, Nono Ko...|Germany|    June 16, 2021|        2019| TV-14|  89 min|Documentaries, In...|Best friends Max ...|
|   s769|Movie|      Dancing Queens|    Helena Bergström|Molly Nutley, Fre...| Sweden|     June 3, 2021|        2021| TV-MA| 111 min|Comedies, Dramas,...|A dancer who gets...|
|  s1032|Movie|       Into the Beat|  Stefan Westerwelle|Alexandra Pfeifer...|Germany|   April 16, 2021|        2020| TV

### Combinando & e | (And e OR)

In [25]:
df.filter(((col("type") == "Movie") & (col('director') == 'Hideaki Anno'))| (col("release_year") == 2000 )).show()

+-------+-----+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+
|show_id| type|               title|            director|                cast|             country|        date_added|release_year|rating|duration|           listed_in|         description|
+-------+-----+--------------------+--------------------+--------------------+--------------------+------------------+------------+------+--------+--------------------+--------------------+
|   s351|Movie|       Space Cowboys|      Clint Eastwood|Clint Eastwood, T...|       United States|    August 1, 2021|        2000| PG-13| 130 min|Action & Adventur...|A retired enginee...|
|   s360|Movie|The Original King...|           Spike Lee|Steve Harvey, D.L...|       United States|    August 1, 2021|        2000|     R| 111 min|     Stand-Up Comedy|Comedians Steve H...|
|   s567|Movie|    Charlie's Angels|              

### Criar coluna condicional utilizando substring

In [26]:
df.withColumn('Sub', substring('type', 1, 3)).show(5)

+-------+-------+--------------------+-------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+---+
|show_id|   type|               title|           director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|Sub|
+-------+-------+--------------------+-------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+---+
|     s8|  Movie|             Sankofa|       Haile Gerima|Kofi Ghanaba, Oya...|United States, Gh...|September 24, 2021|        1993| TV-MA|  125 min|Dramas, Independe...|On a photo shoot ...|Mov|
|     s9|TV Show|The Great British...|    Andy Devonshire|Mel Giedroyc, Sue...|      United Kingdom|September 24, 2021|        2021| TV-14|9 Seasons|British TV Shows,...|A talented batch ...|TV |
|    s10|  Movie|   

In [27]:
df.show()

+-------+-------+--------------------+-------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|           director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+-------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s8|  Movie|             Sankofa|       Haile Gerima|Kofi Ghanaba, Oya...|United States, Gh...|September 24, 2021|        1993| TV-MA|  125 min|Dramas, Independe...|On a photo shoot ...|
|     s9|TV Show|The Great British...|    Andy Devonshire|Mel Giedroyc, Sue...|      United Kingdom|September 24, 2021|        2021| TV-14|9 Seasons|British TV Shows,...|A talented batch ...|
|    s10|  Movie|        The Starling|  

### alteração de tipo de coluna

In [28]:
df.show(5)

+-------+-------+--------------------+-------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|show_id|   type|               title|           director|                cast|             country|        date_added|release_year|rating| duration|           listed_in|         description|
+-------+-------+--------------------+-------------------+--------------------+--------------------+------------------+------------+------+---------+--------------------+--------------------+
|     s8|  Movie|             Sankofa|       Haile Gerima|Kofi Ghanaba, Oya...|United States, Gh...|September 24, 2021|        1993| TV-MA|  125 min|Dramas, Independe...|On a photo shoot ...|
|     s9|TV Show|The Great British...|    Andy Devonshire|Mel Giedroyc, Sue...|      United Kingdom|September 24, 2021|        2021| TV-14|9 Seasons|British TV Shows,...|A talented batch ...|
|    s10|  Movie|        The Starling|  

In [29]:
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)



In [33]:
df = df.withColumn('release_year', col('release_year').cast(IntegerType()))

In [34]:
df.printSchema()

root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)
 |-- description: string (nullable = true)

