In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Get (or create) the Spark session used by the cells below, connecting to the
# master at the URL given here and registering the application as 'ETL_LAB03'.
spark = SparkSession.builder.master('spark://172.19.0.4:7777').appName('ETL_LAB03').getOrCreate()

In [33]:
# Load the bank listing: a tab-separated file whose first row holds the column names.
bank_df = spark.read.csv('../data/bronze/banks/EnquadramentoInicia_v2.tsv', header=True, sep='\t')

# Data transformation for the bank dataset.
# Step 1: lower-case every column name.
for column in bank_df.columns:
    bank_df = bank_df.withColumnRenamed(existing=column, new=column.lower())

# Step 2: clean the "nome" column. Each entry is (source column, regex, replacement);
# the rules run in order, each one operating on the result of the previous rule.
# Regex patterns are raw strings so the backslashes reach the regex engine without
# triggering Python's invalid-escape-sequence warning.
for source_column, pattern, replacement in [
    ("nome", r"- PRUDENCIAL", ""),    # drop the "- PRUDENCIAL" marker
    ("nome", r"(\.+|\/+|\-+)", ""),   # drop runs of '.', '/' and '-'
]:
    bank_df = bank_df.withColumn(
        "nome",
        regexp_replace(source_column, pattern, replacement),
    )

# Row count before null/duplicate removal.
print(bank_df.count())

# Step 3: drop rows containing any null value, then drop exact duplicate rows.
bank_df = bank_df.na.drop()
bank_df = bank_df.dropDuplicates()

bank_df.show(5, truncate=False)
print(f"Number of rows x columns - Bank Data: {bank_df.count()} x {len(bank_df.columns)}")

# Persist the cleaned data as ';'-separated CSV with a header, replacing any previous output.
bank_df.write.mode("overwrite").csv("../data/silver/banks", header=True, sep=";")

1474
+--------+--------+---------------------------------------------------------------------------------------+
|segmento|cnpj    |nome                                                                                   |
+--------+--------+---------------------------------------------------------------------------------------+
|S4      |14388334|JMALUCELLI                                                                             |
|S5      |22753982|COOPERATIVA DE CRDITO DE LIVRE ADMISSO DE SETE LAGOAS LTDA  SICOOB CREDISETE           |
|S5      |20833976|COOPERATIVA DE CRDITO DOS SERVIDORES PBLICOS MUNICIPAIS DE TIMTEO LTDA  COOPERTIM      |
|S5      |10453077|COOPERATIVA DE CRDITO RURAL COM INTERAO SOLIDRIA DE POUSO REDONDO  CRESOL POUSO REDONDO|
|S4      |53518684|HSBC                                                                                   |
+--------+--------+---------------------------------------------------------------------------------------+
only showing top 5 rows

In [34]:
# The two employee extracts are read separately because their structures differ.
# Both are '|'-delimited CSV files with a header row.
employee_df_1 = (
    spark.read.format("csv")
    .option("header", "true")
    .option('delimiter', '|')
    .load("../data/bronze/employees/glassdoor_consolidado_join_match_less_v2.csv")
)
employee_df_2 = (
    spark.read.format("csv")
    .option("header", "true")
    .option('delimiter', '|')
    .load("../data/bronze/employees/glassdoor_consolidado_join_match_v2.csv")
)

# Create columns: an empty-string 'Segmento' on the first extract and an empty-string
# 'CNPJ' on the second, so both frames can be projected onto the same column list.
# NOTE(review): presumably these are the columns each file lacks; withColumn would
# overwrite an existing column of the same name - confirm against the raw files.
employee_df_1 = employee_df_1.withColumn('Segmento', lit(''))
employee_df_2 = employee_df_2.withColumn('CNPJ', lit(''))

# Common column order for both extracts.
columns = [
    "employer_name",
    "reviews_count",
    "culture_count",
    "salaries_count",
    "benefits_count",
    "employer-website",
    "employer-headquarters",
    "employer-founded",
    "employer-industry",
    "employer-revenue",
    "url",
    "Geral",
    "Cultura e valores",
    "Diversidade e inclusão",
    "Qualidade de vida",
    "Alta liderança",
    "Remuneração e benefícios",
    "Oportunidades de carreira",
    "Recomendam para outras pessoas(%)",
    "Perspectiva positiva da empresa(%)",
    "CNPJ",
    "Segmento",
    "Nome",
    "match_percent",
]

employee_df_1 = employee_df_1.select(columns)
employee_df_2 = employee_df_2.select(columns)

# Combine the data. union() resolves columns by position, which is why both frames
# were projected onto the identical ordered column list just above.
employee_df = employee_df_1.union(employee_df_2)

# Data transformation for the employee dataset.
# Step 1: normalise column names - '-' and ' ' become '_', everything lower-cased.
for column in employee_df.columns:
    employee_df = employee_df.withColumnRenamed(
        column,
        column.replace("-", "_").replace(" ", "_").lower(),
    )

employee_df.show(truncate=False)
# Mark the renamed frame for caching; the frames derived below build on it.
employee_df.cache()

# Step 2: clean the "nome" column. Each entry is (source column, regex, replacement);
# the rules run in order, each one operating on the result of the previous rule.
# Regex patterns are raw strings so the backslashes reach the regex engine without
# triggering Python's invalid-escape-sequence warning.
for source_column, pattern, replacement in [
    ("nome", r"- PRUDENCIAL", ""),    # drop the "- PRUDENCIAL" marker
    ("nome", r"(\.+|\/+|\-+)", ""),   # drop runs of '.', '/' and '-'
]:
    employee_df = employee_df.withColumn(
        "nome",
        regexp_replace(source_column, pattern, replacement),
    )

# Row count before null/duplicate removal.
print(employee_df.count())

# Step 3: drop rows containing any null value, then drop exact duplicate rows.
# The empty-string placeholders added above are not null, so they do not by
# themselves cause a row to be dropped.
employee_df = employee_df.na.drop()
employee_df = employee_df.dropDuplicates()

employee_df.show(5, truncate=False)
print(f"Number of rows x columns - Employee Data: {employee_df.count()} x {len(employee_df.columns)}")

# Persist the cleaned data as ';'-separated CSV with a header, replacing any previous output.
employee_df.write.mode("overwrite").csv("../data/silver/employee", header=True, sep=";")

+-------------------------------+-------------+-------------+--------------+--------------+---------------------------------------------------+---------------------+----------------+-----------------------------------------------------------------------------------+---------------------------+----------------------------------------------------------------------------------------------------------------+-----+-----------------+----------------------+-----------------+--------------+------------------------+-------------------------+---------------------------------+----------------------------------+--------+--------+--------------------------------------------------+-------------+
|employer_name                  |reviews_count|culture_count|salaries_count|benefits_count|employer_website                                   |employer_headquarters|employer_founded|employer_industry                                                                  |employer_revenue           |url            

In [59]:
# Claims: read comma-delimited CSV data with a header row from ../data/bronze/claims.
# NOTE(review): the path has no file extension, so it is presumably a directory of CSV files - confirm.
claims_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option('delimiter', ',')
    .load("../data/bronze/claims")
)

# Note carried over from the original cell: claims_df.count() was recorded as 918
# at this point (not re-verified here).

# Data transformation for the claims dataset.
# Step 1: normalise column names - '-' and ' ' become '_', everything lower-cased.
for column in claims_df.columns:
    claims_df = claims_df.withColumnRenamed(
        column,
        column.replace("-", "_").replace(" ", "_").lower(),
    )

# Step 2: rename the institution id/name columns to 'cnpj' and 'nome', the names
# the cleaning rules below operate on.
claims_df = (
    claims_df
    .withColumnRenamed('cnpj_if', 'cnpj')
    .withColumnRenamed('instituição_financeira', 'nome')
)

# Step 3: clean the "nome" column. Each entry is (source column, regex, replacement);
# the rules run in order, each one operating on the result of the previous rule.
# Regex patterns are raw strings so the backslashes reach the regex engine without
# triggering Python's invalid-escape-sequence warning.
for source_column, pattern, replacement in [
    ("nome", r"- PRUDENCIAL", ""),        # drop the "- PRUDENCIAL" marker
    ("nome", r"(\.+|\/+|\-+)", ""),       # drop runs of '.', '/' and '-'
    ("nome", r" \(conglomerado\)", ""),   # drop the literal " (conglomerado)" suffix
]:
    claims_df = claims_df.withColumn(
        "nome",
        regexp_replace(source_column, pattern, replacement),
    )

# Step 4: drop rows containing any null value, then drop exact duplicate rows.
claims_df = claims_df.na.drop()
claims_df = claims_df.dropDuplicates()

claims_df.show(5, truncate=False)
# Report the shape of the claims frame itself. The original line printed employee_df's
# shape under an "Employee Data" label, which was wrong for this cell and made it
# depend on the employee cell having been run first.
print(f"Number of rows x columns - Claims Data: {claims_df.count()} x {len(claims_df.columns)}")

# Persist the cleaned data as ';'-separated CSV with a header, replacing any previous output.
claims_df.write.mode("overwrite").csv("../data/silver/claims", header=True, sep=";")

+----+---------+------------------------------------------------------+----------------+--------+---------------------------------------------------------+------+-----------------------------------------------+--------------------------------------------+---------------------------------------+-------------------------------+----------------------------------------+----------------------------+----------------------------+
|ano |trimestre|categoria                                             |tipo            |cnpj    |nome                                                     |índice|quantidade_de_reclamações_reguladas_procedentes|quantidade_de_reclamações_reguladas___outras|quantidade_de_reclamações_não_reguladas|quantidade_total_de_reclamações|quantidade_total_de_clientes_–_ccs_e_scr|quantidade_de_clientes_–_ccs|quantidade_de_clientes_–_scr|
+----+---------+------------------------------------------------------+----------------+--------+-------------------------------------------------