<a href="https://colab.research.google.com/github/nortonvanz/PySpark-Basics/blob/main/notebooks/Case_4_Exerc%C3%ADcios_PySpark_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop2.tgz
!tar xf spark-3.3.1-bin-hadoop2.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.1-bin-hadoop2"

In [3]:
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder \
    .master('local[*]') \
    .appName('Iniciando com Spark') \
    .config('spark.ui.port', '4050') \
    .getOrCreate()

In [6]:
import pyspark.pandas as ps



## Load datasets do Colab

In [None]:
ps.re

In [5]:
!wget --quiet --show-progress https://raw.githubusercontent.com/hadley/data-baby-names/master/births.csv
!wget --quiet --show-progress https://raw.githubusercontent.com/hadley/data-baby-names/master/baby-names-by-state.csv



## Load with Spark Pandas

In [22]:
#usando o pandas:
dfbirth = ps.read_csv('births.csv')
dfbaby = ps.read_csv('baby-names-by-state.csv')

In [27]:
dfbirth.head(3)

Unnamed: 0,year,state,sex,births
0,1981,AK,boy,4721
1,1982,AK,boy,5139
2,1983,AK,boy,5662


In [29]:
dfbaby.head(3)

Unnamed: 0,state,year,name,number,sex
0,AK,1960,David,151,boy
1,AK,1960,Michael,139,boy
2,AK,1960,Robert,135,boy


In [None]:
#O problama é não conseguir usar SPARK SQL a partir do dataset do pandas. Vamos importar com o PySpark, pra seguir usando o SQPARK SQL.

## Load with PySpark

In [31]:
df_birth = spark.read.csv("births.csv", sep=",", inferSchema=True, header=True)
df_baby = spark.read.csv("baby-names-by-state.csv", sep=",", inferSchema=True, header=True)

In [32]:
#SPARK SQL
df_birth.createOrReplaceTempView("BIRTH")
df_baby.createOrReplaceTempView("BABY")

In [37]:
spark.sql('''
          SELECT * FROM BIRTH
        ''').show(3)

+----+-----+---+------+
|year|state|sex|births|
+----+-----+---+------+
|1981|   AK|boy|  4721|
|1982|   AK|boy|  5139|
|1983|   AK|boy|  5662|
+----+-----+---+------+
only showing top 3 rows



In [38]:
spark.sql('''
          SELECT * FROM BABY
        ''').show(3)

+-----+----+-------+------+---+
|state|year|   name|number|sex|
+-----+----+-------+------+---+
|   AK|1960|  David|   151|boy|
|   AK|1960|Michael|   139|boy|
|   AK|1960| Robert|   135|boy|
+-----+----+-------+------+---+
only showing top 3 rows



# Questões de Negócio

## 1. Quantidade de bebes (por sexo) nascidos por estado

In [None]:
spark.sql('''
          SELECT * FROM BIRTH

        ''').show(5)

In [34]:
spark.sql('''
          SELECT * FROM BIRTH

        ''').show(5)

+----+-----+---+------+
|year|state|sex|births|
+----+-----+---+------+
|1981|   AK|boy|  4721|
|1982|   AK|boy|  5139|
|1983|   AK|boy|  5662|
|1984|   AK|boy|  5909|
|1985|   AK|boy|  6413|
+----+-----+---+------+
only showing top 5 rows



In [47]:
spark.sql('''
          SELECT
            STATE,
            SEX,
            SUM(NUMBER) AS NUM_BEBES
           FROM BABY
           GROUP BY STATE, SEX
           ORDER BY 1,2
        ''').show(6)

+-----+----+---------+
|STATE| SEX|NUM_BEBES|
+-----+----+---------+
|   AK| boy| 148537.0|
|   AK|girl| 104848.0|
|   AL| boy| 974091.0|
|   AL|girl| 733784.0|
|   AR| boy| 568482.0|
|   AR|girl| 428509.0|
+-----+----+---------+
only showing top 6 rows



## 2. Rankei o nome mais comum da década por sexo

In [67]:
#ver range de anos
spark.sql('''
          SELECT
            MIN(YEAR),
            MAX(YEAR)
          FROM BABY
        ''').show(3)

+---------+---------+
|min(YEAR)|max(YEAR)|
+---------+---------+
|     1960|     2008|
+---------+---------+



In [99]:
spark.sql('''
          WITH FULL_BABY AS (
            SELECT
              YEAR, NAME, NUMBER, SEX,
              CASE
                WHEN year >= 1910 AND year < 1920 THEN 10
                WHEN year >= 1930 AND year < 1940 THEN 30
                WHEN year >= 1940 AND year < 1950 THEN 40
                WHEN year >= 1950 AND year < 1960 THEN 50
                WHEN year >= 1960 AND year < 1970 THEN 60
                WHEN year >= 1970 AND year < 1980 THEN 70
                WHEN year >= 1980 AND year < 1990 THEN 80
                WHEN year >= 1990 AND year < 2000 THEN 90
                WHEN year >= 2000 AND year < 2010 THEN 2000
                WHEN year >= 2010 AND year < 2020 THEN 2010
                WHEN year >= 2020 AND year < 2030 THEN 2020
                ELSE NULL
              END AS DECADE
          FROM BABY
          WHERE NAME IS NOT NULL
        )
        --SELECT NAME, NUMBER, SEX, DECADE FROM FULL_BABY


        SELECT
          DECADE, SEX, NAME
        FROM (
            SELECT DECADE, SEX, NAME,
                  ROW_NUMBER() OVER (PARTITION BY DECADE, SEX ORDER BY SUM(NUMBER) DESC) AS rank
            FROM FULL_BABY
            GROUP BY DECADE, SEX, NAME
        ) ranked
        WHERE rank = 1;

        ''').show(15)

# Nesta consulta, a janela é particionada por década (DECADE) e sexo (SEX).
# A soma das contagens (NUMBER) é calculada para cada nome dentro de cada década e sexo.
# Em seguida, a função ROW_NUMBER() é usada para atribuir um número de linha a cada linha classificada pela soma em ordem decrescente dentro de cada partição.
# A consulta externa filtra as linhas onde o número de linha é igual a 1, o que corresponde ao nome mais comum para cada década e sexo.

+------+----+--------+
|DECADE| SEX|    NAME|
+------+----+--------+
|    60| boy| Michael|
|    60|girl|    Lisa|
|    70| boy| Michael|
|    70|girl|Jennifer|
|    80| boy| Michael|
|    80|girl| Jessica|
|    90| boy| Michael|
|    90|girl| Jessica|
|  2000| boy|   Jacob|
|  2000|girl|   Emily|
+------+----+--------+



## 3. Rankei o nome mais comum da década por sexo em cada estado

In [202]:
spark.sql('''
          WITH FULL_BABY AS (
            SELECT
              YEAR, NAME, NUMBER, SEX, STATE,
              CASE
                WHEN year >= 1910 AND year < 1920 THEN 10
                WHEN year >= 1930 AND year < 1940 THEN 30
                WHEN year >= 1940 AND year < 1950 THEN 40
                WHEN year >= 1950 AND year < 1960 THEN 50
                WHEN year >= 1960 AND year < 1970 THEN 60
                WHEN year >= 1970 AND year < 1980 THEN 70
                WHEN year >= 1980 AND year < 1990 THEN 80
                WHEN year >= 1990 AND year < 2000 THEN 90
                WHEN year >= 2000 AND year < 2010 THEN 2000
                WHEN year >= 2010 AND year < 2020 THEN 2010
                WHEN year >= 2020 AND year < 2030 THEN 2020
                ELSE NULL
              END AS DECADE
          FROM BABY
          WHERE NAME IS NOT NULL
        )
        --SELECT NAME, NUMBER, SEX, DECADE FROM FULL_BABY


        SELECT
          DECADE, STATE, SEX, NAME
        FROM (
            SELECT DECADE, STATE, SEX, NAME,
                  ROW_NUMBER() OVER (PARTITION BY DECADE, STATE, SEX ORDER BY SUM(NUMBER) DESC) AS rank
            FROM FULL_BABY
            GROUP BY DECADE, STATE, SEX, NAME
        ) ranked
        WHERE rank = 1;

        ''').show(15)

+------+-----+----+-------+
|DECADE|STATE| SEX|   NAME|
+------+-----+----+-------+
|    60|   AK| boy|Michael|
|    60|   AK|girl|   Mary|
|    60|   AL| boy|  James|
|    60|   AL|girl|   Lisa|
|    60|   AR| boy|  James|
|    60|   AR|girl|   Lisa|
|    60|   AZ| boy|  David|
|    60|   AZ|girl|   Lisa|
|    60|   CA| boy|Michael|
|    60|   CA|girl|   Lisa|
|    60|   CO| boy|Michael|
|    60|   CO|girl|   Lisa|
|    60|   CT| boy|Michael|
|    60|   CT|girl|   Lisa|
|    60|   DC| boy|Michael|
+------+-----+----+-------+
only showing top 15 rows



## 4. Gerar um dataframe com as seguintes informações a artir dos dataframes existentes: ano de nascimento, nome, porcentagem que ele aparece por ano e salvar em parquet


In [201]:
df_perc = spark.sql('''
        WITH FIRST AS (
          SELECT
            YEAR,
            TRIM(NAME) AS NAME,
            COUNT(TRIM(NAME)) AS NUM_NAMES

          FROM BABY
          WHERE YEAR = 2008
          --AND NAME IN ('Henry', 'John')

          AND NAME IS NOT NULL
          AND YEAR IS NOT NULL
          AND LENGTH(TRIM(NAME)) > 0
          AND LENGTH(TRIM(YEAR)) > 0
          AND TRIM(NAME) <> ''
          AND TRIM(YEAR) <> ''

          GROUP BY YEAR, NAME
          )

          SELECT
              YEAR,
              NAME,
             CASE
                  WHEN SUM(NUM_NAMES) OVER (PARTITION BY YEAR) > 0
                  THEN ROUND((NUM_NAMES / CAST(SUM(NUM_NAMES) OVER (PARTITION BY YEAR) AS FLOAT)) * 100, 2)
                  ELSE 0  -- ou qualquer valor padrão desejado
              END AS PERCENTAGE
          FROM FIRST
          ORDER BY PERCENTAGE DESC;

        ''')

df_perc.show(3)
#Apesar das tratativas nas condicionais, a para não fazer a divisão por zero, e casts, segue trazendo nome em branco.
#Idealmente é necessário uma tratativa inicial nos dados, para remover nulos, caracteres em branco, e afins.

+----+------+----------+
|YEAR|  NAME|PERCENTAGE|
+----+------+----------+
|2008|      |      1.45|
|2008| Riley|      0.74|
|2008|Jordan|      0.61|
+----+------+----------+
only showing top 3 rows



In [196]:
#Exportar em Parquet
df_perc.write.mode("overwrite").parquet("df_perc.parquet")

## 5. Mostrar o nome mais escolhido para os bebes de todos os tempos

In [198]:
df_perc_saved = spark.read.parquet("df_perc.parquet")
df_perc_saved.show(5)

+----+------+----------+
|YEAR|  NAME|PERCENTAGE|
+----+------+----------+
|2008|      |      1.45|
|2008| Riley|      0.74|
|2008|Jordan|      0.61|
|2008|Peyton|      0.59|
|2008|Jayden|      0.58|
+----+------+----------+
only showing top 5 rows



In [236]:
spark.sql('''

          SELECT
            NAME,
            COUNT(NAME) AS NUM_NAMES
          FROM BABY
          GROUP BY 1
          ORDER BY 2 DESC

        ''').show(10)

#James

+-----------+---------+
|       NAME|NUM_NAMES|
+-----------+---------+
|           |     4264|
|      James|     2451|
|    William|     2451|
|    Michael|     2451|
|     Joseph|     2451|
|  Elizabeth|     2451|
|     Thomas|     2450|
|     Daniel|     2450|
|Christopher|     2450|
|      David|     2450|
+-----------+---------+
only showing top 10 rows

