In [1]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Obtain the active SparkContext, creating one if none exists yet.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext and take the SparkSession from it;
# `spark` is the session handed to ExtractData in the cells below.
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Glue Job handle bound to this context.
# NOTE(review): it is not referenced again in the cells visible here.
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel

For more information on available magic commands, please type %help in any new cell.



Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html

Installed kernel version: 0.38.1 

Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::805766217211:role/LabRole

Trying to create a Glue session for the kernel.

Worker Type: G.1X

Number of Workers: 5

Session ID: c67cd576-3818-4a8e-82af-9404e3081bdd

Job Type: glueetl

Applying the following default arguments:

--glue_kernel_version 0.38.1

--enable-glue-datacatalog true

Waiting for session c67cd576-3818-4a8e-82af-9404e3081bdd to get into ready status...

Session c67cd576-3818-4a8e-82af-9404e3081bdd has been created.




In [2]:
import os

class DataWriter:
    """
    Persists DataFrames as Parquet.
    """
    def write_parquet(self, df, output_directory, mode="overwrite", repartition=True):
        """
        Write ``df`` to ``output_directory`` in Parquet format.

        Args:
            df: DataFrame to persist.
            output_directory: Destination path handed to the Parquet writer.
            mode: Save mode passed to ``write.mode`` (default "overwrite").
            repartition: When truthy, ``df.repartition(1)`` is applied before
                writing; otherwise ``df`` is written with its current partitioning.
        """
        # The two paths differ only in the optional repartition step.
        frame_to_write = df.repartition(1) if repartition else df
        frame_to_write.write.mode(mode).parquet(output_directory)
            





In [3]:
from pyspark.sql import DataFrame

class UnsupportedFileType(Exception):
    """
    Error raised for a file type that is not supported.
    """
    def __init__(self, file_type):
        """
        Build the error message from ``file_type`` and keep both on the instance
        (``file_type`` and ``message``).
        """
        description = f"File(s) of type {file_type} not supported"
        super().__init__(description)
        self.file_type = file_type
        self.message = description

class ExtractData:
    """
    Reads delimited-text ('csv'/'tsv') or Parquet data through a Spark session.
    """
    def __init__(self, spark, file_directory: list, file_type: str, separator: str = ';', header: bool = True, encoding='utf-8'):
        """
        Store the read configuration; nothing is read until extract() is called.

        Args:
            spark: Session object whose ``read`` attribute is used to load the data.
            file_directory: Location(s) handed unchanged to the reader. Annotated
                as a list, but the calls visible in this notebook pass a single
                string path.
            file_type: 'csv', 'tsv' or 'parquet'.
            separator: Delimiter for the csv/tsv reader. The default ';' also
                applies to 'tsv' unless overridden.
            header: Forwarded as the reader's ``header`` option.
            encoding: Forwarded as the reader's ``encoding`` option.
        """
        self.spark = spark
        self.file_directory = file_directory
        self.file_type = file_type
        self.separator = separator
        self.header = header
        self.encoding = encoding

    def extract(self) -> DataFrame:
        """
        Load ``file_directory`` according to ``file_type`` and return the result.

        Raises:
            UnsupportedFileType: if ``file_type`` is not 'csv', 'tsv' or 'parquet'.
        """
        if self.file_type == 'parquet':
            return self.spark.read.parquet(self.file_directory)

        if self.file_type not in ('csv', 'tsv'):
            raise UnsupportedFileType(self.file_type)

        # csv and tsv share the same reader; only the configured options differ.
        text_reader = self.spark.read.options(
            delimiter=self.separator,
            header=self.header,
            encoding=self.encoding,
        )
        return text_reader.csv(self.file_directory)




In [4]:
# Column-rename lookup table. Each top-level key is the name of a transformation
# class; its value maps an original column header to the snake_case name that
# TransformData.rename_columns applies. Keys must match the source headers exactly.
json_column_rename = {
    # Looked up by BanksTransformation.
    "BanksTransformation": {
        "Segmento" : "segmento",
        "CNPJ": "cnpj",
        "Nome": "nome"
    },
    # Looked up by EmployeesTransformation.
    "EmployeesTransformation": {
        "employer-website" : "employer_website",
        "employer-headquarters": "employer_headquarters",
        "employer-founded": "employer_founded",
        "employer-industry": "employer_industry",
        "employer-revenue": "employer_revenue",
        "Geral": "geral",
        "Cultura e valores": "cultura_valores",
        "Diversidade e inclusão": "diversidade_inclusao",
        "Qualidade de vida": "qualidade_vida",
        "Alta liderança": "alta_lideranca",
        "Remuneração e benefícios": "remuneracao_beneficios",
        "Oportunidades de carreira": "oportunidades_carreira",
        "Recomendam para outras pessoas(%)": "percentual_recomendam_para_outras_pessoas",
        "Perspectiva positiva da empresa(%)": "percentual_perspectiva_positiva_empresa",
        "CNPJ": "cnpj",
        "Nome": "nome",
        "Segmento": "segmento"
    },
    # Looked up by ComplaintsTransformation.
    "ComplaintsTransformation": {
        "Ano" : "ano",
        "Trimestre": "trimestre",
        "Categoria": "categoria",
        "Tipo": "tipo",
        "CNPJ IF": "cnpj_if",
        "Instituição financeira": "instituicao_financeira",
        "Índice": "indice",
        "Quantidade de reclamações reguladas procedentes": "qtd_reclamacoes_reguladas_procedentes",
        "Quantidade de reclamações reguladas - outras": "qtd_reclamacoes_reguladas_outras",
        "Quantidade de reclamações não reguladas": "qtd_reclamacoes_nao_reguladas",
        "Quantidade total de reclamações": "qtd_total_reclamacoes",
        "Quantidade total de clientes - CCS e SCR": "qtd_total_clientes_ccs_scr",
        "Quantidade de clientes - CCS": "qtd_clientes_ccs",
        "Quantidade de clientes - SCR": "qtd_clientes_scr"
    }
}




In [5]:
from abc import ABC, abstractmethod
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.sql import DataFrame
import os


class TransformData(ABC):
    """
    Abstract base class with the helpers shared by all transformations.
    """
    def format_cnpj(self, value):
        """
        Build a column expression that left-pads ``value`` with '0' to length 8.
        Values equal to '' or ' ' are passed through unchanged.
        """
        is_blank = (value == ' ') | (value == '')
        zero_padded = F.lpad(value, 8, '0')
        return F.when(is_blank, value).otherwise(zero_padded)


    def load_column_rename_mappings(self, transformation_name):
        """
        Return the rename mapping stored under ``transformation_name`` in the
        module-level ``json_column_rename`` dict, or an empty dict if absent.
        """
        return json_column_rename.get(transformation_name, {})


    def rename_columns(self, df: DataFrame, column_rename) -> DataFrame:
        """
        Apply each ``old name -> new name`` pair of ``column_rename`` to ``df``
        with ``withColumnRenamed`` and return the resulting DataFrame.
        """
        renamed = df
        for source_name, target_name in column_rename.items():
            renamed = renamed.withColumnRenamed(source_name, target_name)
        return renamed


    @abstractmethod
    def transform(self) -> DataFrame:
        """
        Produce the transformed DataFrame; concrete subclasses must implement this.
        """


class BanksTransformation(TransformData):
    """
    Transformation of the banks Spark DataFrame.
    """
    def __init__(self, df: DataFrame):
        """
        Keep the input dataframe and fetch the rename mapping stored under
        'BanksTransformation'.
        """
        self.df = df
        self.column_rename = self.load_column_rename_mappings('BanksTransformation')


    def transform(self) -> DataFrame:
        """
        Rename the columns to snake_case, run 'cnpj' through format_cnpj and
        remove every ' - PRUDENCIAL' occurrence from 'nome' (cast to string).
        Returns the resulting dataframe.
        """
        renamed_df = self.rename_columns(self.df, self.column_rename)
        nome_as_text = F.col("nome").cast(StringType())
        return (
            renamed_df
            .withColumn("cnpj", self.format_cnpj(F.col("cnpj")))
            .withColumn("nome", F.regexp_replace(nome_as_text, ' - PRUDENCIAL', ''))
        )


class EmployeesTransformation(TransformData):
    """
    Transformation and aggregation of the employees Spark DataFrame.
    """
    def __init__(self, df: DataFrame):
        """
        Keep the input dataframe and fetch the rename mapping stored under
        'EmployeesTransformation'.
        """
        self.df = df
        self.column_rename = self.load_column_rename_mappings('EmployeesTransformation')


    def transform(self) -> DataFrame:
        """
        Rename the columns to snake_case, then cast each listed column to its
        target type. Returns the resulting dataframe.
        """
        typed_df = self.rename_columns(self.df, self.column_rename)

        score_type = DecimalType(20, 2)
        # (column, target type) pairs, applied in exactly this order.
        # 'cnpj' is not reformatted or cast here.
        casts = [
            ("employer_name", StringType()),
            ("reviews_count", IntegerType()),
            ("culture_count", IntegerType()),
            ("salaries_count", IntegerType()),
            ("benefits_count", IntegerType()),
            ("employer_website", StringType()),
            ("employer_headquarters", StringType()),
            ("employer_founded", IntegerType()),
            ("employer_industry", StringType()),
            ("employer_revenue", StringType()),
            ("url", StringType()),
            ("geral", score_type),
            ("cultura_valores", score_type),
            ("diversidade_inclusao", score_type),
            ("qualidade_vida", score_type),
            ("alta_lideranca", score_type),
            ("remuneracao_beneficios", score_type),
            ("oportunidades_carreira", score_type),
            ("percentual_recomendam_para_outras_pessoas", score_type),
            ("percentual_perspectiva_positiva_empresa", score_type),
            ("nome", StringType()),
            ("segmento", StringType()),
            ("match_percent", FloatType()),
        ]
        for column_name, target_type in casts:
            typed_df = typed_df.withColumn(column_name, F.col(column_name).cast(target_type))

        return typed_df


    def calculate_aggregates(self, df) -> DataFrame:
        """
        Group ``df`` by 'nome' and return, per group, the mean of 'geral' and of
        'remuneracao_beneficios', each rounded to 2 decimal places.
        """
        averaged_columns = ('geral', 'remuneracao_beneficios')
        rounded_means = [
            F.round(F.mean(column_name), 2).alias(column_name)
            for column_name in averaged_columns
        ]
        return df.groupby('nome').agg(*rounded_means)


class ComplaintsTransformation(TransformData):
    """
    Transformation and aggregation of the complaints Spark DataFrame.
    """
    def __init__(self, df: DataFrame):
        """
        Keep the input dataframe and fetch the rename mapping stored under
        'ComplaintsTransformation'.
        """
        self.df = df
        self.column_rename = self.load_column_rename_mappings('ComplaintsTransformation')


    def transform(self) -> DataFrame:
        """
        Rename the columns to snake_case, then format and cast them.

        Besides the casts, two columns are added while their sources are kept:
        'cnpj' is 'cnpj_if' run through format_cnpj, and 'nome' is
        'instituicao_financeira' with every ' (conglomerado)' occurrence removed.
        'indice' has ',' replaced by '.' before being cast to decimal.
        Returns the resulting dataframe.
        """
        transformed_df = self.rename_columns(self.df, self.column_rename)

        count_type = DecimalType(20, 2)
        transformed_df = (
            transformed_df
            .withColumn("ano", F.col('ano').cast(IntegerType()))
            .withColumn('trimestre', F.col("trimestre").cast(StringType()))
            .withColumn('categoria', F.col("categoria").cast(StringType()))
            .withColumn('tipo', F.col("tipo").cast(StringType()))
            .withColumn("cnpj", self.format_cnpj(F.col("cnpj_if")))
            # Raw string: the backslashes belong to the regex (literal parentheses),
            # not to Python escape sequences.
            .withColumn("nome", F.regexp_replace(F.col("instituicao_financeira").cast(StringType()), r' \(conglomerado\)', ''))
            .withColumn("indice", F.regexp_replace(F.col('indice').cast(StringType()), ',', '.').cast(count_type))
            .withColumn("qtd_reclamacoes_reguladas_procedentes", F.col('qtd_reclamacoes_reguladas_procedentes').cast(count_type))
            .withColumn("qtd_reclamacoes_reguladas_outras", F.col('qtd_reclamacoes_reguladas_outras').cast(count_type))
            .withColumn("qtd_reclamacoes_nao_reguladas", F.col('qtd_reclamacoes_nao_reguladas').cast(count_type))
            .withColumn("qtd_total_reclamacoes", F.col('qtd_total_reclamacoes').cast(count_type))
            .withColumn("qtd_total_clientes_ccs_scr", F.col('qtd_total_clientes_ccs_scr').cast(count_type))
            .withColumn("qtd_clientes_ccs", F.col('qtd_clientes_ccs').cast(count_type))
            .withColumn("qtd_clientes_scr", F.col('qtd_clientes_scr').cast(count_type))
        )

        return transformed_df


    def calculate_aggregates(self, df) -> DataFrame:
        """
        Group ``df`` by 'nome' and return, per group:
          - the mean of 'indice', rounded to 2 decimal places;
          - the mean of 'qtd_total_reclamacoes', rounded to 2 decimal places;
          - the maximum (not the mean) of 'qtd_total_clientes_ccs_scr'.
        """
        aggregated_df = df.groupby('nome').agg(
            F.round(F.mean('indice'), 2).alias("indice"),
            F.round(F.mean('qtd_total_reclamacoes'), 2).alias("qtd_total_reclamacoes"),
            F.max('qtd_total_clientes_ccs_scr').alias("qtd_total_clientes_ccs_scr")
        )
        return aggregated_df




In [7]:
# --- Load the three Parquet datasets from the "trusted" bucket -----------------
extract_banks = ExtractData(spark, 's3://857119631738-atividade05-trusted/Bancos', 'parquet')
df_banks = extract_banks.extract()

extract_employees = ExtractData(spark, 's3://857119631738-atividade05-trusted/Empregados', 'parquet')
df_trusted_employees = extract_employees.extract()

extract_complaints = ExtractData(spark, 's3://857119631738-atividade05-trusted/Reclamacoes', 'parquet')
df_trusted_complaints = extract_complaints.extract()

# --- Aggregate employees and complaints per 'nome' -----------------------------
# Only calculate_aggregates is used here; transform() is not called in this cell.
transform_employees = EmployeesTransformation(df_trusted_employees)
df_grouped_employees = transform_employees.calculate_aggregates(df_trusted_employees)

transform_complaints = ComplaintsTransformation(df_trusted_complaints)
df_grouped_complaints = transform_complaints.calculate_aggregates(df_trusted_complaints)

# --- Inner-join banks with both aggregates on 'nome' ---------------------------
df_complaints_banks = df_banks.join(df_grouped_complaints, ['nome'], 'inner')
df_complaints_banks_employees = df_complaints_banks.join(df_grouped_employees, ['nome'], 'inner')

# --- Write the joined result to the "delivery" bucket --------------------------
# write_parquet defaults apply: mode "overwrite", repartitioned to 1 partition.
output_directory = 's3://857119631738-atividade05-delivery/atividade05'
write_data = DataWriter()
write_data.write_parquet(df_complaints_banks_employees, output_directory)



