## Imports

In [1]:
from dhzlib import DhzLib
dhz = DhzLib()

Creating Spark session
SparkSession created at http://matheus:4040
Temporary tables created


## Testes de Data Quality

In [None]:
dhz.spark.read.table('tr_capital_bikeshare').printSchema()
df = dhz.spark.read.table('tr_capital_bikeshare')

In [None]:
# Verificação de integridade dos dados em um intervalo específico
lower_bound = dhz.date_(2023, 12, 15)
upper_bound = dhz.date_(2024, 1, 5)
date_column = 'started_at'
table = 'tr_capital_bikeshare'
dhz.check_interval_integrity(table, date_column, lower_bound, upper_bound)

In [None]:
# Geração de tabela com estatísticas das tabelas desejadas
tables = {
    'raw_capital_bikeshare': 'started_at',
    'tr_capital_bikeshare': 'started_at'
}
dhz.generate_stats_table(tables)
dhz.spark.sql('select * from tr_data_quality_stats').show()

In [None]:
# Verificação de atualização de dados
dhz.check_if_updated('tr_capital_bikeshare', 'started_at', dhz.date_(2024, 1, 25))

In [2]:
# Verificação de valores válidos
valid_values = {
    'rideable_type': ['classic_bike', 'electric_bike'],
    'member_casual': ['casual', 'member']
}
dhz.check_if_valid('tr_capital_bikeshare', valid_values)

 Nome da Coluna         Status       Outliers
  rideable_type             OK              0
  member_casual             OK              0


In [None]:
# Check 01 - Dados Atualizados
# previousDay = 1;
# dataLimit = dhzlib.getDataLimit(previousDay, dhzlib.TimeEnum.Day)
# dhzlib.qa.checkIfUpdated('dhauz_general_catalog.dhzqa.tr_bikeshare', dataLimit) 
    ## se for detectado que a tabela não posusi dados mais recenter que a data limite informa, então a tabela estará desatualizada e um email deve ser enviado.

# Check 02 - Dados Válidops
# validStatList = {'MG', 'SP', 'BA', etc...}
# dhzlib.qa.checkIfValid('dhauz_general_catalog.dhzqa.tr_bikeshare', 'state', validStatList) 

# Check 03 - Volumetria Válida - Geral
# AvgWeeklyRowsQtd = 15.000
# dhzlib.qa.checkIfVolumeValid('dhauz_general_catalog.dhzqa.tr_bikeshare', AvgWeeklyRowsQtd, 'time_start', dhzlib.TimeEnum.Week) 

In [2]:
def check_if_valid(table: str, valid_values: dict, schema: str = None):
    check_functions = {
        'string': dhz.check_if_valid_string
    }
    results = []
    table_path = f"{schema}.{table}" if schema else table
    df = dhz.spark.read.table(table_path)
    table_dtypes = df.dtypes
    for column in valid_values.keys():
        column_dtype = [dtype for col, dtype in table_dtypes if col == column][0]

        if column_dtype not in check_functions.keys():
            raise ValueError(f"Data type {column_dtype} not supported")
        
        status = check_functions[column_dtype](column, valid_values[column], df)
        results.append([column, status])

    print(results)
    headers = ["Column Name", "Status"]
    format_row = "{:>12}" * (len(headers) + 1)

    
    format_row = "{:>15}" * (len(headers))
    print(format_row.format(*headers))


    for _, status in zip(headers, results):
        print(format_row.format(*status))
    return None
            
def check_if_valid_string(column: str, valid_values: list, df):
    invalid_values = df.filter(~df[column].isin(valid_values)).count()
    if invalid_values > 0:
        return 'Invalid'
    return 'Valid'

In [3]:
valid_values = {
    'rideable_type': ['classic_bike', 'electric_bike'],
    'member_casual': ['casual', 'member']
}
check_if_valid('tr_capital_bikeshare', valid_values)

[['rideable_type', 'Valid'], ['member_casual', 'Valid']]
    Column Name         Status
  rideable_type          Valid
  member_casual          Valid


## RASCUNHO

In [None]:
tr_df.filter(F.col('started_at').cast('date').between(lower_bound, upper_bound)) \
    .select(F.col('started_at').cast('date')).distinct().orderBy('started_at',ascending=False).show()

In [None]:
pd.date_range(start=lower_bound,
              end=upper_bound,
              freq='d').map(lambda x: datetime.date(x)).tolist()

In [None]:
filtered_df = tr_df.filter(F.col('started_at').between(lower_bound, upper_bound))

In [None]:
filtered_df.select(F.col('started_at').cast('date')).distinct().orderBy('started_at', ascending=False).toPandas()['started_at'].tolist()

In [None]:
dict1 = set(filtered_df.select(F.col('started_at').cast('date')).distinct().toPandas()['started_at'].tolist())
dict2 = set(pd.date_range(start=lower_bound,
              end=upper_bound,
              freq='d').map(lambda x: datetime.date(x)).tolist())
dict1 == dict2

In [None]:
dict1.symmetric_difference(dict2)

In [None]:
check_interval_integrity(1,tr_df, 'started_at', lower_bound, upper_bound)


In [None]:
upper_bound

In [None]:
filtered_df = tr_df.filter(F.col('started_at').between(lower_bound, upper_bound))


In [None]:
headers = ["Column Name", "Status"]
data = [['rideable_type', 'OK'],
        ['member_casual', 'OK']]
format_row = "{:>15}" * (len(headers))
print(format_row.format(*headers))
for team, row in zip(headers, data):
    print(format_row.format(*row))

In [None]:
list(zip(headers, data))

In [None]:
table_dtypes = df.dtypes
list(filter(lambda table_dtypes: table_dtypes[0] == 'rideable_type', table_dtypes))[0][1]

In [None]:
[dtype for col, dtype in table_dtypes if col == 'rideable_type'][0]

In [None]:
df.dtypes[0][0] == 'rideable_type'

In [None]:
search_values = {
    'rideable_type': ['classic_bike', 'electric_bike'],
    'member_casual': ['casual', 'member']
}

In [None]:
filtered_df.select(F.col('started_at').cast('date')).distinct().count()

In [None]:
tr_df.filter(F.col('started_at').between(lower_bound, upper_bound)) \
    .select(F.col('started_at').cast('date')).distinct().count()

In [None]:
from pyspark.sql import functions as F

In [None]:
def check_ingestion_day(df, date_column, day):
    return df.filter(F.col(date_column).cast('date') == day).count()

In [None]:
check_ingestion_day(tr_df, 'started_at', dhz.date_(2023, 3, 16))