## Imports

In [2]:
from datetime import datetime

import pandas as pd
from pyspark.sql import functions as F

from dhzlib import DhzLib
# Build the project helper object; per the cell output, this also creates the Spark session
# that later cells reach through `dhz.spark`.
dhz = DhzLib()

Creating Spark session
SparkSession created at http://matheus:4040


## Leitura e transformação de dados

In [3]:
# Monthly trip extracts to load; both files are read into a single DataFrame.
files = [
    '../data/202401-capitalbikeshare-tripdata.csv',
    '../data/202402-capitalbikeshare-tripdata.csv',
]

# The first row of each file is the header; column types are inferred by Spark.
raw_df = dhz.spark.read.csv(
    files,
    inferSchema=True,
    header=True,
)

# Expose the untouched data to Spark SQL under a temporary view name.
raw_df.createOrReplaceTempView('raw_capital_bikeshare')

In [4]:
# Build the transformed frame: same columns as the raw data, with timestamps parsed and
# shifted via from_utc_timestamp, station ids cast to int and coordinates cast to float.
# NOTE(review): from_utc_timestamp interprets the parsed value as UTC and renders it in
# America/Sao_Paulo — confirm the CSV timestamps really are UTC.
tr_df = raw_df.select(
    raw_df.ride_id.alias('ride_id'),
    raw_df.rideable_type.alias('rideable_type'),
    dhz.F_.from_utc_timestamp(raw_df.started_at.cast('timestamp'), 'America/Sao_Paulo').alias('started_at'),
    dhz.F_.from_utc_timestamp(raw_df.ended_at.cast('timestamp'), 'America/Sao_Paulo').alias('ended_at'),
    raw_df.start_station_name.alias('start_station_name'),
    raw_df.start_station_id.cast('int').alias('start_station_id'),
    raw_df.end_station_name.alias('end_station_name'),
    raw_df.end_station_id.cast('int').alias('end_station_id'),
    raw_df.start_lat.cast('float').alias('start_lat'),
    raw_df.start_lng.cast('float').alias('start_lng'),
    raw_df.end_lat.cast('float').alias('end_lat'),
    raw_df.end_lng.cast('float').alias('end_lng'),
    raw_df.member_casual.alias('member_casual')
)

# Register the transformed data as a temporary view for the SQL-based checks.
tr_df.createOrReplaceTempView('tr_capital_bikeshare')

# Preview the raw input. DataFrame.show() prints the table itself and returns None,
# so wrapping it in print() only adds a stray "None" line to the output.
raw_df.show(5)

+----------------+-------------+-------------------+-------------------+------------------+----------------+-------------------+--------------+------------+-------------+-----------------+------------------+-------------+
|         ride_id|rideable_type|         started_at|           ended_at|start_station_name|start_station_id|   end_station_name|end_station_id|   start_lat|    start_lng|          end_lat|           end_lng|member_casual|
+----------------+-------------+-------------------+-------------------+------------------+----------------+-------------------+--------------+------------+-------------+-----------------+------------------+-------------+
|748A93D7DE8A41CD| classic_bike|2024-01-25 15:49:59|2024-01-25 15:52:35|     1st & O St NW|           31519|      1st & L St NW|         31677|   38.908643|   -77.012365|        38.903819|        -77.011987|       member|
|75CBFD136F06305B| classic_bike|2024-01-02 16:44:58|2024-01-02 16:53:25|     1st & O St NW|           31519|4th 

## Testes de Data Quality

In [None]:
# Data integrity check over a specific date interval.
# Target view and the column that carries the date being checked.
table = 'tr_capital_bikeshare'
date_column = 'started_at'

# Interval under test: 2024-01-16 through 2024-01-31.
lower_bound = dhz.date_(2024, 1, 16)
upper_bound = dhz.date_(2024, 1, 31)

dhz.check_interval_integrity(table, date_column, lower_bound, upper_bound)

In [3]:
# Generate a table with statistics for the desired tables.
# Every listed view is paired with 'started_at' (presumably the date column the helper
# should use for that view — confirm against DhzLib.generate_stats_table).
tables = dict.fromkeys(
    ('raw_capital_bikeshare', 'tr_capital_bikeshare'),
    'started_at',
)
dhz.generate_stats_table(tables)


Data read from MySQL
Data read from MySQL
Data written to MySQL


In [6]:
# List the tables/views known to the Spark session; the two temporary views registered
# earlier in the notebook should be reported here.
dhz.spark.sql('SHOW TABLES').show()

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|         |raw_capital_bikes...|       true|
|         |tr_capital_bikeshare|       true|
+---------+--------------------+-----------+



## RASCUNHO

In [None]:
# Draft: print the distinct start dates that fall inside [lower_bound, upper_bound],
# most recent first.
(
    tr_df
    .filter(F.col('started_at').cast('date').between(lower_bound, upper_bound))
    .select(F.col('started_at').cast('date'))
    .distinct()
    .orderBy('started_at', ascending=False)
    .show()
)

In [None]:
# Draft: every calendar day from lower_bound to upper_bound, as a list of date objects.
(
    pd.date_range(start=lower_bound, end=upper_bound, freq='d')
    .map(datetime.date)
    .tolist()
)

In [None]:
# Keep only trips whose start *date* lies in [lower_bound, upper_bound].
# The column is cast to date before the comparison: matching the raw timestamp against a
# date upper bound (assuming dhz.date_ yields a date — confirm) would compare against
# midnight and silently drop almost every trip on the last day of the interval.
filtered_df = tr_df.filter(F.col('started_at').cast('date').between(lower_bound, upper_bound))

In [None]:
# Draft: distinct start dates found in filtered_df, newest first, collected as a Python list.
(
    filtered_df
    .select(F.col('started_at').cast('date'))
    .distinct()
    .orderBy('started_at', ascending=False)
    .toPandas()['started_at']
    .tolist()
)

In [None]:
# Draft: compare the dates present in the data with the dates expected in the interval.
# Both names hold sets despite the "dict" prefix; the names are kept because a later
# draft cell reads them.
dict1 = set(
    filtered_df
    .select(F.col('started_at').cast('date'))
    .distinct()
    .toPandas()['started_at']
    .tolist()
)
dict2 = set(
    pd.date_range(start=lower_bound, end=upper_bound, freq='d')
    .map(datetime.date)
    .tolist()
)

# True when every expected day is present and no unexpected day shows up.
dict1 == dict2

In [None]:
# Dates that appear in only one of the two sets, i.e. the mismatches between the days
# found in the data and the days expected in the interval.
dict1.symmetric_difference(dict2)

In [None]:
# Run the interval check through the dhz helper. A bare check_interval_integrity is never
# defined in this notebook (the call raised NameError on a fresh kernel), so this uses the
# same helper call form as the Data Quality section, pointed at the view registered for tr_df.
dhz.check_interval_integrity('tr_capital_bikeshare', 'started_at', lower_bound, upper_bound)


In [None]:
# Draft: display the upper bound of the interval under test.
upper_bound

In [None]:
# Keep only trips whose start *date* lies in [lower_bound, upper_bound].
# The column is cast to date before the comparison: matching the raw timestamp against a
# date upper bound (assuming dhz.date_ yields a date — confirm) would compare against
# midnight and silently drop almost every trip on the last day of the interval.
filtered_df = tr_df.filter(F.col('started_at').cast('date').between(lower_bound, upper_bound))


In [None]:
# Draft: number of distinct start dates present in filtered_df.
(
    filtered_df
    .select(F.col('started_at').cast('date'))
    .distinct()
    .count()
)

In [None]:
# Draft: number of distinct start dates inside [lower_bound, upper_bound], straight from tr_df.
# The filter compares the *date* of started_at: filtering on the raw timestamp against a
# date upper bound (assuming dhz.date_ yields a date — confirm) would stop at midnight and
# lose the last day of the interval from the count.
(
    tr_df
    .filter(F.col('started_at').cast('date').between(lower_bound, upper_bound))
    .select(F.col('started_at').cast('date'))
    .distinct()
    .count()
)

In [None]:
from pyspark.sql import functions as F

In [None]:
def check_ingestion_day(df, date_column, day):
    """Count the rows of ``df`` whose ``date_column``, cast to a date, equals ``day``.

    Args:
        df: DataFrame to inspect.
        date_column: Name of the column holding the date/timestamp to test.
        day: Value the date-cast column is compared against.

    Returns:
        The number of matching rows, as returned by ``count()``.
    """
    as_date = F.col(date_column).cast('date')
    rows_on_day = df.filter(as_date == day)
    return rows_on_day.count()

In [None]:
# Count the trips in tr_df that started on 2023-03-16.
# NOTE(review): the files loaded in this notebook are the 202401 and 202402 extracts, so this
# day is outside that range — presumably a deliberate "no data" probe; confirm the intent.
check_ingestion_day(tr_df, 'started_at', dhz.date_(2023, 3, 16))