## Imports

In [1]:
from dhzlib import DhzLib
dhz = DhzLib()

Creating Spark session
SparkSession created at http://matheus:4040


## Leitura e transformação dos dados

In [10]:
raw_df = dhz.spark.read.csv('../data/*.csv', header=True, inferSchema=True)
tr_df = raw_df.select(
    raw_df.ride_id.alias('ride_id'),
    raw_df.rideable_type.alias('rideable_type'),
    raw_df.started_at.cast('timestamp').alias('started_at'),
    raw_df.ended_at.cast('timestamp').alias('ended_at'),
    raw_df.start_station_name.alias('start_station_name'),
    raw_df.start_station_id.cast('int').alias('start_station_id'),
    raw_df.end_station_name.alias('end_station_name'),
    raw_df.end_station_id.cast('int').alias('end_station_id'),
    raw_df.start_lat.cast('float').alias('start_lat'),
    raw_df.start_lng.cast('float').alias('start_lng'),
    raw_df.end_lat.cast('float').alias('end_lat'),
    raw_df.end_lng.cast('float').alias('end_lng'),
    raw_df.member_casual.alias('member_casual')
)

print(raw_df.count())
print(tr_df.printSchema())
raw_df.limit(5).show(truncate=False)

8031506
root
 |-- ride_id: string (nullable = true)
 |-- rideable_type: string (nullable = true)
 |-- started_at: timestamp (nullable = true)
 |-- ended_at: timestamp (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_station_id: integer (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- end_station_id: integer (nullable = true)
 |-- start_lat: float (nullable = true)
 |-- start_lng: float (nullable = true)
 |-- end_lat: float (nullable = true)
 |-- end_lng: float (nullable = true)
 |-- member_casual: string (nullable = true)

None
+----------------+-------------+-------------------+-------------------+--------------------------------------------+----------------+-------------------------------------------+--------------+----------------+------------------+---------+----------+-------------+
|ride_id         |rideable_type|started_at         |ended_at           |start_station_name                          |start_station_id|end_station_nam

## Testes de Data Quality

In [125]:
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from datetime import date, datetime
import pandas as pd

In [145]:
lower_bound = date(2023, 3, 16)
upper_bound = date(2023, 3, 31)
dhz.check_interval_integrity(tr_df, 'started_at', lower_bound, upper_bound)

AttributeError: 'dhzlib' object has no attribute 'check_interval_integrity'

In [139]:
tr_df.filter(F.col('started_at').cast('date').between(lower_bound, upper_bound)) \
    .select(F.col('started_at').cast('date')).distinct().orderBy('started_at',ascending=False).show()

+----------+
|started_at|
+----------+
|2023-03-31|
|2023-03-30|
|2023-03-29|
|2023-03-28|
|2023-03-27|
|2023-03-26|
|2023-03-25|
|2023-03-24|
|2023-03-23|
|2023-03-22|
|2023-03-21|
|2023-03-20|
|2023-03-19|
|2023-03-18|
|2023-03-17|
|2023-03-16|
+----------+



In [127]:
pd.date_range(start=lower_bound,
              end=upper_bound,
              freq='d').map(lambda x: datetime.date(x)).tolist()

[datetime.date(2023, 3, 16),
 datetime.date(2023, 3, 17),
 datetime.date(2023, 3, 18),
 datetime.date(2023, 3, 19),
 datetime.date(2023, 3, 20),
 datetime.date(2023, 3, 21),
 datetime.date(2023, 3, 22),
 datetime.date(2023, 3, 23),
 datetime.date(2023, 3, 24),
 datetime.date(2023, 3, 25),
 datetime.date(2023, 3, 26),
 datetime.date(2023, 3, 27),
 datetime.date(2023, 3, 28),
 datetime.date(2023, 3, 29),
 datetime.date(2023, 3, 30),
 datetime.date(2023, 3, 31)]

In [None]:
filtered_df = tr_df.filter(F.col('started_at').between(lower_bound, upper_bound))

In [112]:
filtered_df.select(F.col('started_at').cast('date')).distinct().orderBy('started_at', ascending=False).toPandas()['started_at'].tolist()

[datetime.date(2023, 3, 30),
 datetime.date(2023, 3, 29),
 datetime.date(2023, 3, 28),
 datetime.date(2023, 3, 27),
 datetime.date(2023, 3, 26),
 datetime.date(2023, 3, 25),
 datetime.date(2023, 3, 24),
 datetime.date(2023, 3, 23),
 datetime.date(2023, 3, 22),
 datetime.date(2023, 3, 21),
 datetime.date(2023, 3, 20),
 datetime.date(2023, 3, 19),
 datetime.date(2023, 3, 18),
 datetime.date(2023, 3, 17),
 datetime.date(2023, 3, 16)]

In [114]:
dict1 = set(filtered_df.select(F.col('started_at').cast('date')).distinct().toPandas()['started_at'].tolist())
dict2 = set(pd.date_range(start=lower_bound,
              end=upper_bound,
              freq='d').map(lambda x: datetime.date(x)).tolist())
dict1 == dict2

False

In [115]:
dict1.symmetric_difference(dict2)

{datetime.date(2023, 3, 31)}

In [144]:
check_interval_integrity(1,tr_df, 'started_at', lower_bound, upper_bound)


Checking interval integrity for column started_at
Interval: 2023-03-16 to 2023-03-31
Integrity check passed for started_at


In [129]:
upper_bound

datetime.date(2023, 3, 31)

In [140]:
filtered_df = tr_df.filter(F.col('started_at').between(lower_bound, upper_bound))


In [141]:
filtered_df.select(F.col('started_at').cast('date')).distinct().count()

15

In [142]:
tr_df.filter(F.col('started_at').between(lower_bound, upper_bound)) \
    .select(F.col('started_at').cast('date')).distinct().count()

15