<a href="https://colab.research.google.com/github/louisecastrof/Projects_and_Studies/blob/main/How_Bootcamps_Spark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Inicialização - Instalação e Contexto

In [1]:
# Instalar pyspark

%pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=1543c02a96a3d975710b45c67cee60ba768777cbdc747038325720576175375a
  Stored in directory: /root/.cache/pip/wheels/9f/34/a4/159aa12d0a510d5ff7c8f0220abbea42e5d81ecf588c4fd884
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


In [2]:
# Importar SparkSession - possibilitando criar uma interface para interagir com o Spark e suas funcionalidades
from pyspark.sql import SparkSession

# Iniciando a sessão Spark
#https://spark.apache.org/docs/latest/sql-getting-started.html
spark = SparkSession.builder \
        .appName("Spark - How Bootcamps") \
        .getOrCreate()

In [3]:
# Testando (printando) a sessão
spark

# Criando o DataFrame e visualizando dados

In [4]:
# Criando uma função para ler csv e retornar um schema
def ler_csv(filename):
  df = spark.read.format('csv').options(header='true', inferSchema='true').load(filename)
  return df

ler_csv('Matches.csv')

DataFrame[Match_ID: int, Div: string, Season: int, Date: date, HomeTeam: string, AwayTeam: string, FTHG: int, FTAG: int, FTR: string]

In [5]:
# Atribuindo à um dataframe e mostrando as 5 primeiras linhas
df = ler_csv('Matches.csv')
df.limit(5).show()

+--------+---+------+----------+-------------+--------------+----+----+---+
|Match_ID|Div|Season|      Date|     HomeTeam|      AwayTeam|FTHG|FTAG|FTR|
+--------+---+------+----------+-------------+--------------+----+----+---+
|       1| D2|  2009|2010-04-04|   Oberhausen|Kaiserslautern|   2|   1|  H|
|       2| D2|  2009|2009-11-01|  Munich 1860|Kaiserslautern|   0|   1|  A|
|       3| D2|  2009|2009-10-04|Frankfurt FSV|Kaiserslautern|   1|   1|  D|
|       4| D2|  2009|2010-02-21|Frankfurt FSV|     Karlsruhe|   2|   1|  H|
|       5| D2|  2009|2009-12-06|        Ahlen|     Karlsruhe|   1|   3|  A|
+--------+---+------+----------+-------------+--------------+----+----+---+



In [6]:
# Contando o total de linhas do dataframe
df.count()

24625

In [7]:
# Checando os tipos dos dados de cada coluna
df.dtypes

[('Match_ID', 'int'),
 ('Div', 'string'),
 ('Season', 'int'),
 ('Date', 'date'),
 ('HomeTeam', 'string'),
 ('AwayTeam', 'string'),
 ('FTHG', 'int'),
 ('FTAG', 'int'),
 ('FTR', 'string')]

In [8]:
# Uma outra possibilidade de visualização para as 5 primeiras linhas
df.take(5)

[Row(Match_ID=1, Div='D2', Season=2009, Date=datetime.date(2010, 4, 4), HomeTeam='Oberhausen', AwayTeam='Kaiserslautern', FTHG=2, FTAG=1, FTR='H'),
 Row(Match_ID=2, Div='D2', Season=2009, Date=datetime.date(2009, 11, 1), HomeTeam='Munich 1860', AwayTeam='Kaiserslautern', FTHG=0, FTAG=1, FTR='A'),
 Row(Match_ID=3, Div='D2', Season=2009, Date=datetime.date(2009, 10, 4), HomeTeam='Frankfurt FSV', AwayTeam='Kaiserslautern', FTHG=1, FTAG=1, FTR='D'),
 Row(Match_ID=4, Div='D2', Season=2009, Date=datetime.date(2010, 2, 21), HomeTeam='Frankfurt FSV', AwayTeam='Karlsruhe', FTHG=2, FTAG=1, FTR='H'),
 Row(Match_ID=5, Div='D2', Season=2009, Date=datetime.date(2009, 12, 6), HomeTeam='Ahlen', AwayTeam='Karlsruhe', FTHG=1, FTAG=3, FTR='A')]

In [9]:
# Visualizando o dataframe com Pandas
df.limit(5).toPandas()

Unnamed: 0,Match_ID,Div,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR
0,1,D2,2009,2010-04-04,Oberhausen,Kaiserslautern,2,1,H
1,2,D2,2009,2009-11-01,Munich 1860,Kaiserslautern,0,1,A
2,3,D2,2009,2009-10-04,Frankfurt FSV,Kaiserslautern,1,1,D
3,4,D2,2009,2010-02-21,Frankfurt FSV,Karlsruhe,2,1,H
4,5,D2,2009,2009-12-06,Ahlen,Karlsruhe,1,3,A


# Transformando dados

In [11]:
# Traduzindo as colunas
# FTHG = Full Time Home Team Goals
# FTAG = Full Time Away Team Goals
# FTR and Res = Full Time Result (H=Home Win, D=Draw, A=Away Win)
df.columns

['Match_ID',
 'Div',
 'Season',
 'Date',
 'HomeTeam',
 'AwayTeam',
 'FTHG',
 'FTAG',
 'FTR']

In [13]:
# Possível forma de renomear uma coluna
df_traduzido1 = df.withColumnRenamed('Match_ID', 'IDPartida') \
.withColumnRenamed('Div','Divisao') \
.withColumnRenamed('Season','Temporada') \
.withColumnRenamed('Date','Data') \
.withColumnRenamed('HomeTeam','TimeDaCasa') \
.withColumnRenamed('AwayTeam', 'TimeDeFora') \
.withColumnRenamed('FTGH', 'ResultadoTimeDaCasa') \
.withColumnRenamed('FTAG', 'ResultadoTimeDeFora') \
.withColumnRenamed('FTR', 'ResultadoFinal') \

df_traduzido1.limit(5).show()

+---------+-------+---------+----------+-------------+--------------+----+-------------------+--------------+
|IDPartida|Divisao|Temporada|      Data|   TimeDaCasa|    TimeDeFora|FTHG|ResultadoTimeDeFora|ResultadoFinal|
+---------+-------+---------+----------+-------------+--------------+----+-------------------+--------------+
|        1|     D2|     2009|2010-04-04|   Oberhausen|Kaiserslautern|   2|                  1|             H|
|        2|     D2|     2009|2009-11-01|  Munich 1860|Kaiserslautern|   0|                  1|             A|
|        3|     D2|     2009|2009-10-04|Frankfurt FSV|Kaiserslautern|   1|                  1|             D|
|        4|     D2|     2009|2010-02-21|Frankfurt FSV|     Karlsruhe|   2|                  1|             H|
|        5|     D2|     2009|2009-12-06|        Ahlen|     Karlsruhe|   1|                  3|             A|
+---------+-------+---------+----------+-------------+--------------+----+-------------------+--------------+



In [14]:
# Possível forma de renomear várias colunas
df_traduzido2 = df
nomes_antigos = df_traduzido2.columns
nomes_novos = ["IDPartida", "Divisao", "Temporada", "Data", "TimeDaCasa", "TimeDeFora", "ResultadoTimeDaCasa", "ResultadoTimeDeFora", "ResultadoFinal"]
df_traduzido = [*zip(nomes_antigos, nomes_novos)]
for antigo, novo in df_traduzido:
    df_traduzido2 = df_traduzido2.withColumnRenamed(antigo, novo)

df_traduzido2.limit(5).show()

+---------+-------+---------+----------+-------------+--------------+-------------------+-------------------+--------------+
|IDPartida|Divisao|Temporada|      Data|   TimeDaCasa|    TimeDeFora|ResultadoTimeDaCasa|ResultadoTimeDeFora|ResultadoFinal|
+---------+-------+---------+----------+-------------+--------------+-------------------+-------------------+--------------+
|        1|     D2|     2009|2010-04-04|   Oberhausen|Kaiserslautern|                  2|                  1|             H|
|        2|     D2|     2009|2009-11-01|  Munich 1860|Kaiserslautern|                  0|                  1|             A|
|        3|     D2|     2009|2009-10-04|Frankfurt FSV|Kaiserslautern|                  1|                  1|             D|
|        4|     D2|     2009|2010-02-21|Frankfurt FSV|     Karlsruhe|                  2|                  1|             H|
|        5|     D2|     2009|2009-12-06|        Ahlen|     Karlsruhe|                  1|                  3|             A|


# Usando SQL com Spark

In [16]:
# Criando uma view para poder usar o SQL
df_traduzido2.createOrReplaceTempView("PartidasDeFutebol")

In [17]:
# Selecionado Divisão e datas
spark.sql("SELECT Divisao, Data FROM PartidasDeFutebol") \
     .show(5)

+-------+----------+
|Divisao|      Data|
+-------+----------+
|     D2|2010-04-04|
|     D2|2009-11-01|
|     D2|2009-10-04|
|     D2|2010-02-21|
|     D2|2009-12-06|
+-------+----------+
only showing top 5 rows



In [38]:
# Selecionado e transformando data
spark.sql("SELECT Divisao, TimeDaCasa, TimeDeFora, Data FROM PartidasDeFutebol WHERE Data > '2010-01-01' ORDER BY Data DESC")  \
     .show()

+-------+--------------+------------------+----------+
|Divisao|    TimeDaCasa|        TimeDeFora|      Data|
+-------+--------------+------------------+----------+
|     D2|    Ingolstadt|    Kaiserslautern|2018-05-13|
|     D2|      Duisburg|          St Pauli|2018-05-13|
|     D2|    Heidenheim|    Greuther Furth|2018-05-13|
|     D2|       Dresden|      Union Berlin|2018-05-13|
|     D2|          Kiel|      Braunschweig|2018-05-13|
|     D2|     Darmstadt|    Erzgebirge Aue|2018-05-13|
|     D2|        Bochum|        Regensburg|2018-05-13|
|     D2|      Nurnberg|Fortuna Dusseldorf|2018-05-13|
|     D2|     Bielefeld|        Sandhausen|2018-05-13|
|     D1|    Hoffenheim|          Dortmund|2018-05-12|
|     D1|    Schalke 04|     Ein Frankfurt|2018-05-12|
|     D1|     Wolfsburg|           FC Koln|2018-05-12|
|     D1| Bayern Munich|         Stuttgart|2018-05-12|
|     D1|         Mainz|     Werder Bremen|2018-05-12|
|     D1|      Freiburg|          Augsburg|2018-05-12|
|     D1| 