In [0]:
# MAGIC %md
# MAGIC # NYC Taxi Data - Exploratory Data Analysis
# MAGIC ### Stack Tecnologias - Desafio Técnico
# MAGIC 
# MAGIC **Objetivos:**
# MAGIC 1. Análise da estrutura dos dados
# MAGIC 2. Identificação de problemas de qualidade
# MAGIC 3. Definição de schema para Silver/Gold

In [0]:
bronze_path = "s3://nyc-taxi-bronze-lucas/raw/"

# Load the raw Bronze CSV files. The first row of each file is the header and
# Spark is asked to infer column types (inference needs an extra pass over the
# data, which is noticeable on a dataset of this size).
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(bronze_path)
)

# Initial analysis.
# printSchema() writes the schema to stdout and returns None, so it is called
# directly instead of being passed to display().
df.printSchema()
display(df.describe())

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- pickup_longitude: string (nullable = true)
 |-- pickup_latitude: string (nullable = true)
 |-- RateCodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: string (nullable = true)
 |-- dropoff_latitude: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)



summary,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
count,47248845.0,47248845,47248845,47248845.0,47248845.0,47248845.0,47248845.0,47248845.0,47248845,47248845.0,47248845.0,47248845.0,47248845.0,47248845.0,47248845.0,47248845.0,47248845.0,47248842.0,47248845.0
mean,1.529570490029968,,,1.6670397339871483,7.508417945877826,-72.7645182845853,40.084704988038204,1.0385429739076162,,-72.82501122353683,40.118890625916094,1.352482394014076,12.392189114675714,0.3243179554971132,0.4977080728640033,1.7945680860558584,0.2843674354367812,0.295246617260989,15.592732594204715
stddev,0.4991248254615429,,,1.322092230733379,6487.658339592048,9.381829927913058,5.168527613321488,0.5902423267122126,,9.150215403289168,5.040345298485914,0.4922386879494088,78.61770040483974,0.5158948172922766,0.0467238798292527,574.738388499466,1.657184394696857,0.0381207987017221,580.1392733880582
min,1.0,2015-01-01 00:00:00,2015-01-01 00:00:00,0.0,-3390583.8,-0.1399070024490356,-18.679283142089844,1.0,N,-0.1166670024394989,-77.03948974609375,1.0,-0.01,-0.09,-0.5,-0.01,-0.1,-0.3,-0.31
max,2.0,2016-03-31 23:59:59,2016-06-29 15:58:16,9.0,99.9,94.64386749267578,9.587846755981444,99.0,Y,85.27402496337889,9.980953216552734,5.0,999.99,999.99,89.7,998.14,999.99,0.3,999.84


In [0]:
# MAGIC %md
# MAGIC # NYC Taxi Data - Exploratory Data Analysis
# MAGIC ### Stack Tecnologias - Desafio Técnico

In [0]:
# COMMAND ----------
from pyspark.sql.functions import col, count, when, isnan, to_timestamp, cast, unix_timestamp
from pyspark.sql.types import DoubleType, IntegerType

# Read the raw data from the Bronze layer (CSV with a header row; no schema
# inference is requested for this read).
bronze_path = "s3://nyc-taxi-bronze-lucas/raw/"

bronze_reader = spark.read.format("csv").option("header", "true")
df = bronze_reader.load(bronze_path)

# COMMAND ----------
# 1. Null and empty value analysis
def _missing_count(column_name):
    """Build an aggregate that counts the rows where the column is missing.

    A value is treated as missing when it is a real null, an empty string, or
    one of the literal strings "NULL" / "null". The aggregate is aliased with
    the column's own name so the result has one count per source column.
    """
    value = col(column_name)
    is_missing = (
        value.isNull()
        | (value == "")
        | (value == "NULL")
        | (value == "null")
    )
    # when() without otherwise() yields null for non-matching rows, and
    # count() skips nulls, so only the missing rows are counted.
    return count(when(is_missing, True)).alias(column_name)


null_counts = df.select([_missing_count(c) for c in df.columns])
display(null_counts)

# COMMAND ----------
# 2. Convert column types for the analysis
# Columns that are cast to double and keep their original name, listed in the
# order they appear in the typed frame.
coordinate_cols = [
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
]
monetary_cols = [
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
]

typed_columns = [
    # IDs and flags
    col("VendorID").cast(IntegerType()).alias("vendor_id"),
    col("RateCodeID").cast(IntegerType()).alias("rate_code_id"),
    col("store_and_fwd_flag"),
    col("payment_type").cast(IntegerType()),
    # Timestamps (the tpep_ prefix is dropped from the names)
    to_timestamp(col("tpep_pickup_datetime")).alias("pickup_datetime"),
    to_timestamp(col("tpep_dropoff_datetime")).alias("dropoff_datetime"),
    # Trip metrics
    col("passenger_count").cast(IntegerType()),
    col("trip_distance").cast(DoubleType()),
]
# Coordinates first, then monetary amounts, all as doubles.
typed_columns += [
    col(name).cast(DoubleType()) for name in coordinate_cols + monetary_cols
]

df_typed = df.select(*typed_columns)

# COMMAND ----------
# 3. Distribution analysis - numeric fields
numeric_cols = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
# Statistics requested from DataFrame.summary(), including the quartiles.
summary_stats = ("count", "mean", "stddev", "min", "25%", "50%", "75%", "max")

# One summary table is printed per column, preceded by a header line.
for col_name in numeric_cols:
    print(f"\nEstatísticas para {col_name}:")
    column_summary = df_typed.select(col_name).summary(*summary_stats)
    column_summary.show()

# COMMAND ----------
# 4. Coordinate validation
# Bounding box used as the accepted coordinate range. The limits are named
# once here so the pickup and dropoff checks cannot drift apart.
NYC_MIN_LONGITUDE, NYC_MAX_LONGITUDE = -74.5, -73.5
NYC_MIN_LATITUDE, NYC_MAX_LATITUDE = 40.5, 41.0


def _count_outside_nyc(prefix, alias_name):
    """Build an aggregate counting rows whose coordinates fall outside the box.

    Args:
        prefix: Column prefix, e.g. "pickup" selects `pickup_longitude` and
            `pickup_latitude`.
        alias_name: Name given to the resulting count column.

    Returns:
        A Column that counts the rows where the longitude or the latitude is
        strictly below the minimum or strictly above the maximum limit.
    """
    # df_typed already stores the coordinates as doubles, so they are compared
    # directly without casting them again.
    longitude = col(f"{prefix}_longitude")
    latitude = col(f"{prefix}_latitude")
    outside_box = (
        (longitude < NYC_MIN_LONGITUDE)
        | (longitude > NYC_MAX_LONGITUDE)
        | (latitude < NYC_MIN_LATITUDE)
        | (latitude > NYC_MAX_LATITUDE)
    )
    return count(when(outside_box, True)).alias(alias_name)


nyc_coord_check = df_typed.select(
    _count_outside_nyc("pickup", "invalid_pickup_coords"),
    _count_outside_nyc("dropoff", "invalid_dropoff_coords"),
)
display(nyc_coord_check)

# COMMAND ----------
# 5. Temporal consistency analysis
pickup_ts = col("pickup_datetime")
dropoff_ts = col("dropoff_datetime")
# Trip length in whole seconds, from the epoch-second value of each timestamp.
trip_seconds = unix_timestamp(dropoff_ts) - unix_timestamp(pickup_ts)

# Dropoff recorded before pickup.
ends_before_start = dropoff_ts < pickup_ts
# Dropoff after pickup and more than 86400 seconds (24 hours) later.
longer_than_a_day = (dropoff_ts > pickup_ts) & (trip_seconds > 86400)

time_issues = df_typed.select(
    count(when(ends_before_start, True)).alias("negative_duration"),
    count(when(longer_than_a_day, True)).alias("duration_over_24h"),
)
display(time_issues)

# COMMAND ----------
# 6. Payment value analysis
# Number of trips per payment type code, ordered by that code.
trips_per_payment_type = df_typed.groupBy("payment_type").count()
payment_analysis = trips_per_payment_type.orderBy("payment_type")
display(payment_analysis)

# Monetary sanity checks: negative base fares, and totals below the base fare.
fare = col("fare_amount")
total = col("total_amount")
fare_issues = df_typed.select(
    count(when(fare < 0, True)).alias("negative_fare"),
    count(when(total < fare, True)).alias("total_less_than_fare"),
)
display(fare_issues)

# COMMAND ----------
# MAGIC %md
# MAGIC ## Sumário dos Problemas de Qualidade Encontrados
# MAGIC 
# MAGIC 1. **Valores Nulos/Vazios:**
# MAGIC    - Quantidade por coluna mostrada acima
# MAGIC 
# MAGIC 2. **Problemas de Coordenadas:**
# MAGIC    - Coordenadas fora do range de NYC
# MAGIC    - Possíveis trocas entre latitude e longitude
# MAGIC 
# MAGIC 3. **Problemas Temporais:**
# MAGIC    - Viagens com duração negativa
# MAGIC    - Viagens muito longas (> 24h)
# MAGIC 
# MAGIC 4. **Problemas de Valores:**
# MAGIC    - Valores negativos em campos monetários
# MAGIC    - Total menor que tarifa base
# MAGIC    - Valores zerados

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0



Estatísticas para passenger_count:
+-------+------------------+
|summary|   passenger_count|
+-------+------------------+
|  count|          47248845|
|   mean|1.6670397339871483|
| stddev|1.3220922307333787|
|    min|                 0|
|    25%|                 1|
|    50%|                 1|
|    75%|                 2|
|    max|                 9|
+-------+------------------+


Estatísticas para trip_distance:
+-------+-----------------+
|summary|    trip_distance|
+-------+-----------------+
|  count|         47248845|
|   mean|7.508417945877779|
| stddev|6487.658339592048|
|    min|       -3390583.8|
|    25%|              1.0|
|    50%|             1.69|
|    75%|              3.1|
|    max|     1.90726288E7|
+-------+-----------------+


Estatísticas para fare_amount:
+-------+------------------+
|summary|       fare_amount|
+-------+------------------+
|  count|          47248845|
|   mean|12.392189114675682|
| stddev| 78.61770040483972|
|    min|            -957.6|
|    25%|

invalid_pickup_coords,invalid_dropoff_coords
777002,745966


negative_duration,duration_over_24h
473,165


payment_type,count
1,30870614
2,16158086
3,164138
4,56004
5,3


negative_fare,total_less_than_fare
17153,17194


In [0]:
# MAGIC %md
# MAGIC ## 2. Análise de Qualidade dos Dados
# MAGIC 
# MAGIC Vamos analisar:
# MAGIC 1. Valores nulos
# MAGIC 2. Distribuição dos dados
# MAGIC 3. Valores inconsistentes
# MAGIC 4. Outliers

In [0]:
# MAGIC %md
# MAGIC ## 4. Schema Proposto para Silver Layer

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType

# Proposed Silver-layer schema, described as (column name, Spark type,
# nullable) triples and turned into StructFields below. Only the pickup/dropoff
# timestamps and the four coordinates are declared non-nullable.
silver_field_specs = [
    # Temporal fields
    ("pickup_datetime", TimestampType(), False),
    ("dropoff_datetime", TimestampType(), False),
    ("trip_duration_seconds", IntegerType(), True),
    ("hour_of_day", IntegerType(), True),
    ("day_of_week", IntegerType(), True),
    ("month", IntegerType(), True),
    # Location
    ("pickup_longitude", DoubleType(), False),
    ("pickup_latitude", DoubleType(), False),
    ("dropoff_longitude", DoubleType(), False),
    ("dropoff_latitude", DoubleType(), False),
    ("calculated_distance_km", DoubleType(), True),
    # Trip metrics
    ("passenger_count", IntegerType(), True),
    ("trip_distance", DoubleType(), True),
    # Monetary values
    ("fare_amount", DoubleType(), True),
    ("extra", DoubleType(), True),
    ("mta_tax", DoubleType(), True),
    ("tip_amount", DoubleType(), True),
    ("tolls_amount", DoubleType(), True),
    ("improvement_surcharge", DoubleType(), True),
    ("total_amount", DoubleType(), True),
    # Metadata
    ("payment_type_id", IntegerType(), True),
    ("payment_type_desc", StringType(), True),
    ("rate_code_id", IntegerType(), True),
    ("store_and_fwd_flag", StringType(), True),
    # Control fields
    ("ingestion_timestamp", TimestampType(), True),
    ("source_file", StringType(), True),
]

silver_schema = StructType(
    [
        StructField(field_name, field_type, is_nullable)
        for field_name, field_type, is_nullable in silver_field_specs
    ]
)

# COMMAND ----------
# MAGIC %md
# MAGIC ## 5. Próximos Passos
# MAGIC 
# MAGIC 1. **Implementar Pipeline Bronze → Silver:**
# MAGIC    - Aplicar transformações identificadas
# MAGIC    - Implementar validações de qualidade
# MAGIC    - Criar campos derivados
# MAGIC 
# MAGIC 2. **Definir Métricas de Qualidade:**
# MAGIC    - % de registros válidos
# MAGIC    - % de campos preenchidos
# MAGIC    - % de coordenadas válidas