# Nível PySpark

**Importação das bibliotecas necessárias:**

In [28]:
!pip install PySpark



In [29]:
import pyspark.sql.functions as F
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, FloatType, DateType
from pyspark.sql.types import ArrayType, DoubleType, BooleanType
from pyspark.sql.functions import col,array_contains
from pyspark.sql.window import Window
from pyspark import SparkConf, SparkContext

**Extração de dados para o DataFrame:**

In [30]:
spark = (
    SparkSession.builder
      .master('local')
      .appName('projeto-deloitte')
      .config('spark.ui.port', '4050')
      .getOrCreate()
)

In [31]:
Schema = (StructType([
                        StructField('AnoCalendario', DateType()),  
                        StructField('DataArquivamento', DateType()),  
                        StructField('DataAbertura', DateType()),  
                        StructField('Regiao', StringType()), 
                        StructField('UF', StringType()),  
                        StructField('RazaoSocial', StringType()),  
                        StructField('NomeFantasia', StringType()), 
                        StructField('AtividadePrincipal', StringType()),  
                        StructField('StatusAtendimento', StringType()),  
                        StructField('DescricaoAssunto', StringType()),
                        StructField('DescricaoProblema', StringType()), 
                        StructField('SexoConsumidor', StringType()), 
                        StructField('FaixaEtariaConsumidor',  StringType()),        
                    ]))
local_arquivo = "gs://deloitte_g10/saída/Pandas/PROCON/Procon_normalizado.csv"

df = spark.read.load(local_arquivo, format="csv", header="true", sep=',', schema=Schema)

**Realizando consultas:**

In [32]:
df.select('UF', 'StatusAtendimento').groupBy('UF', 'StatusAtendimento') \
                                              .count() \
                                              .sort("count", ascending=False).show()

[Stage 44:>                                                         (0 + 1) / 1]

+---+-----------------+-----+
| UF|StatusAtendimento|count|
+---+-----------------+-----+
| SP|                N|12819|
| MG|                S| 8927|
| SP|                S| 8920|
| PE|                S| 7180|
| MS|                S| 5257|
| CE|                S| 5129|
| ES|                S| 4811|
| ES|                N| 4708|
| PE|                N| 4173|
| SC|                S| 3761|
| PB|                S| 3450|
| RJ|                N| 3334|
| MG|                N| 3298|
| RJ|                S| 2955|
| AL|                S| 2950|
| MS|                N| 2840|
| BA|                N| 2818|
| GO|                S| 2677|
| CE|                N| 2561|
| TO|                N| 2460|
+---+-----------------+-----+
only showing top 20 rows



                                                                                

In [33]:
df.select('Regiao', 'StatusAtendimento').groupBy('Regiao', 'StatusAtendimento') \
                                              .count() \
                                              .sort("count", ascending=False).show()

[Stage 47:>                                                         (0 + 1) / 1]

+------------+-----------------+-----+
|      Regiao|StatusAtendimento|count|
+------------+-----------------+-----+
|     Sudeste|                S|25613|
|    Nordeste|                S|24488|
|     Sudeste|                N|24159|
|    Nordeste|                N|17072|
|Centro-oeste|                S|10875|
|Centro-oeste|                N| 6648|
|       Norte|                S| 4954|
|         Sul|                S| 4902|
|       Norte|                N| 4312|
|         Sul|                N| 3536|
+------------+-----------------+-----+



                                                                                

In [34]:
df.select('RazaoSocial', 'StatusAtendimento').groupBy('RazaoSocial', 'StatusAtendimento') \
                                              .count() \
                                              .sort("count", ascending=False).show()

[Stage 50:>                                                         (0 + 1) / 1]

+--------------------+-----------------+-----+
|         RazaoSocial|StatusAtendimento|count|
+--------------------+-----------------+-----+
|CAIXA ECONOMICA F...|                S| 6756|
|CAIXA ECONOMICA F...|                N| 5691|
|   BANCO BRADESCO SA|                S| 5288|
|        BANCO BMG SA|                S| 3788|
|   BANCO BRADESCO SA|                N| 3663|
|  BANCO DO BRASIL SA|                N| 3475|
|  BANCO SANTANDER SA|                S| 3291|
|   BANCO ITAUCARD SA|                S| 3040|
|  BANCO SANTANDER SA|                N| 3032|
|  BANCO DO BRASIL SA|                S| 2966|
|   BANCO ITAUCARD SA|                N| 2524|
|        BANCO BMG SA|                N| 2449|
|    ITAU UNIBANCO SA|                N| 1467|
|BANCO CRUZEIRO DO...|                N| 1240|
|    ITAU UNIBANCO SA|                S|  980|
|  BANCO DAYCOVAL S/A|                S|  924|
|BV FINANCEIRA SA ...|                S|  922|
|BANCO VOTORANTIM ...|                S|  861|
|       BANCO

                                                                                

In [35]:
df.select('SexoConsumidor', 'StatusAtendimento').groupBy('SexoConsumidor', 'StatusAtendimento') \
                                              .count() \
                                              .sort("count", ascending=False).show()

+--------------+-----------------+-----+
|SexoConsumidor|StatusAtendimento|count|
+--------------+-----------------+-----+
|             F|                S|38307|
|             M|                S|32230|
|             F|                N|28634|
|             M|                N|26006|
|             N|                N| 1087|
|             N|                S|  295|
+--------------+-----------------+-----+



In [36]:
#Contando Atendimento de acordo com o Status de Atendimento
df.groupBy(F.col('StatusAtendimento')).count().show()

+-----------------+-----+
|StatusAtendimento|count|
+-----------------+-----+
|                N|55727|
|                S|70832|
+-----------------+-----+



In [37]:
#Contando Consumidores de acordo com o Sexo
df.groupBy(F.col('SexoConsumidor')).count().show()

+--------------+-----+
|SexoConsumidor|count|
+--------------+-----+
|             F|66941|
|             M|58236|
|             N| 1382|
+--------------+-----+



**Exportando para Bucket:**

In [38]:
#Salvando em formato parquet
(df.write.format("parquet")
.option("header", "true")
.option("inferschema", "true")
.save("gs://deloitte_g10/saída/Pyspark/Procon/Procon.parquet")
)

                                                                                