In [1]:
print("hello world!")

hello world!


### Spark сессия

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

drivers = [
    "/home/jovyan/work/spark-jars/hadoop-aws-3.3.4.jar",             # S3
    "/home/jovyan/work/spark-jars/aws-java-sdk-bundle-1.12.262.jar", # S3
    "/home/jovyan/work/spark-jars/wildfly-openssl-1.0.7.Final.jar",  # S3
    "/home/jovyan/work/spark-jars/postgresql-42.6.0.jar",            # PostgreSQL
]

spark = (SparkSession.builder
         .appName("mustdayker-Spark")
         .master("spark://spark-master:7077") 
         .config("spark.jars", ",".join(drivers))
         .getOrCreate()
        )

25/10/31 18:40:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Читаем датафрейм

In [117]:
df_2023_01 = spark.read.parquet("s3a://silver/nyc-taxi-data/yellow_tripdata_2023-01.parquet")
df_2024_01 = spark.read.parquet("s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-01.parquet")
df_2025_01 = spark.read.parquet("s3a://bronze/nyc-taxi-data/yellow_tripdata_2025-01.parquet")


In [125]:
layer = "silver"

df = (spark.read
      .format("parquet")
      .option("mergeSchema", "true")
      .load([
          f"s3a://{layer}/nyc-taxi-data/yellow_tripdata_2023-*.parquet",
          f"s3a://{layer}/nyc-taxi-data/yellow_tripdata_2024-*.parquet",
          f"s3a://{layer}/nyc-taxi-data/yellow_tripdata_2025-*.parquet",
      ])
     )

In [126]:
print(f"Количетсво строк: {df.count()}\n")

Количетсво строк: 115287399



In [127]:
df.show(1)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+
|vendorid|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|ratecodeid|store_and_fwd_flag|pulocationid|dolocationid|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|cbd_congestion_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+------------------+
|       1| 2023-05-01 00:33:13|  2023-05-01 00:53:01|              0|          7.8|         1|                 N|         138|    

In [128]:
print(f"Количетсво строк: {df.count()}\n")

(df.withColumn("year",  F.year("tpep_pickup_datetime"))
   .withColumn("month", F.month("tpep_pickup_datetime"))
   .select("year", 
           "month", 
           "VendorID",
           "tpep_pickup_datetime", 
           "passenger_count", 
           "trip_distance",
           "airport_fee",
          )
   .groupBy("year", "month")
   .agg(
       F.count("*").alias("total_records"),
       F.sum("airport_fee").alias("Обдираловка"),
       )
   .orderBy("year", "month")
   .show(50)
)

Количетсво строк: 115287399





+----+-----+-------------+-----------+
|year|month|total_records|Обдираловка|
+----+-----+-------------+-----------+
|2001|    1|            6|        3.0|
|2002|   12|           22|      11.25|
|2003|    1|            6|        2.5|
|2007|   12|            1|        0.0|
|2008|   12|           33|      17.75|
|2009|    1|           37|      15.75|
|2014|   11|            1|        0.0|
|2022|   10|           11|        2.5|
|2022|   12|           25|        2.5|
|2023|    1|      3066726|  321686.25|
|2023|    2|      2914003|   274590.0|
|2023|    3|      3403660|  344928.75|
|2023|    4|      3288248|  450625.25|
|2023|    5|      3513664|   481782.0|
|2023|    6|      3307259|  492094.75|
|2023|    7|      2907093|  463704.45|
|2023|    8|      2824201|   471320.0|
|2023|    9|      2846741|   446869.5|
|2023|   10|      3522269|  519090.25|
|2023|   11|      3339731|   477872.5|
|2023|   12|      3376537|   437989.5|
|2024|    1|      2964623|  398704.25|
|2024|    2|      3007533

                                                                                

### Исследуем

In [115]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType

def standardize_nyc_taxi_data(input_path, output_path):
    """
    Стандартизирует данные NYC Taxi:
    - Приводит все названия колонок к нижнему регистру
    - Приводит типы данных к унифицированным
    - Сохраняет в указанный путь
    """
    
    # Читаем исходные данные
    df = spark.read.parquet(input_path)
    
    # 1. Приводим все названия колонок к нижнему регистру
    for col_name in df.columns:
        df = df.withColumnRenamed(col_name, col_name.lower())
    
    # 2. Определяем маппинг типов для унификации
    type_mapping = {
        # Числовые идентификаторы - в Integer
        "vendorid": IntegerType(),
        "pulocationid": IntegerType(), 
        "dolocationid": IntegerType(),
        "payment_type": IntegerType(),
        "ratecodeid": IntegerType(),
        
        # Пассажиры - в Integer (более логично)
        "passenger_count": IntegerType(),
        
        # Денежные суммы - в Double (для точности)
        "fare_amount": DoubleType(),
        "extra": DoubleType(),
        "mta_tax": DoubleType(),
        "tip_amount": DoubleType(),
        "tolls_amount": DoubleType(),
        "improvement_surcharge": DoubleType(),
        "total_amount": DoubleType(),
        "congestion_surcharge": DoubleType(),
        "airport_fee": DoubleType(),
        "cbd_congestion_fee": DoubleType(),
        
        # Дистанция - в Double
        "trip_distance": DoubleType()
    }
    
    # 3. Применяем преобразование типов с обработкой ошибок
    for col_name, target_type in type_mapping.items():
        if col_name in df.columns:
            df = df.withColumn(
                col_name, 
                F.coalesce(
                    F.col(col_name).cast(target_type), 
                    F.lit(0 if target_type == IntegerType() else 0.0)
                )
            )
    
    # 4. Гарантируем порядок колонок для consistency
    expected_columns = [
        "vendorid", "tpep_pickup_datetime", "tpep_dropoff_datetime",
        "passenger_count", "trip_distance", "ratecodeid", "store_and_fwd_flag",
        "pulocationid", "dolocationid", "payment_type", "fare_amount", "extra",
        "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge",
        "total_amount", "congestion_surcharge", "airport_fee", "cbd_congestion_fee"
    ]
    
    # Выбираем только существующие колонки в правильном порядке
    final_columns = [col for col in expected_columns if col in df.columns]
    df_standardized = df.select(final_columns)
    
    # 5. Сохраняем с оптимальными настройками
    (df_standardized
     .write
     .mode("overwrite")
     .option("compression", "snappy")  # хороший баланс скорость/сжатие
     .parquet(output_path)
    )
    
    print(f"✅ Стандартизировано: {input_path} -> {output_path}")
    print(f"📊 Схема после стандартизации:")
    df_standardized.printSchema()
    
    return df_standardized

In [116]:
# Пример использования для одного файла
input_path = "s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-01.parquet"
output_path = "s3a://silver/nyc-taxi-data/yellow_tripdata_2023-01.parquet"

standardized_df = standardize_nyc_taxi_data(input_path, output_path)

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-01.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-01.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

In [123]:
def process_all_nyc_taxi_files():
    """Обрабатывает все файлы NYC Taxi из Bronze в Silver"""
    
    # Получаем список всех parquet файлов в bronze
    bronze_files = spark.sql(f"""
        SELECT path 
        FROM (
            SELECT input_file_name() as path 
            FROM parquet.`s3a://bronze/nyc-taxi-data/`
        ) 
        GROUP BY path
    """).collect()
    
    print(f"📁 Найдено файлов для обработки: {len(bronze_files)}")
    
    for i, row in enumerate(bronze_files, 1):
        input_path = row['path']
        # Извлекаем имя файла из пути
        file_name = input_path.split('/')[-1]
        output_path = f"s3a://silver/nyc-taxi-data/{file_name}"
        
        print(f"🔄 Обрабатываю ({i}/{len(bronze_files)}): {file_name}")
        
        try:
            standardize_nyc_taxi_data(input_path, output_path)
        except Exception as e:
            print(f"❌ Ошибка при обработке {file_name}: {e}")
    
    print("🎉 Все файлы обработаны!")

In [124]:
start_time = time.time()

# Запускаем обработку всех файлов
process_all_nyc_taxi_files()

end_time = time.time()
execution_time = end_time - start_time
print(f"Время выполнения: {execution_time:.4f} секунд")

25/10/31 20:07:32 WARN ObjectStore: Failed to get database parquet, returning NoSuchObjectException
                                                                                

📁 Найдено файлов для обработки: 45
🔄 Обрабатываю (1/45): yellow_tripdata_2025-05.parquet


                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2025-05.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2025-05.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2025-09.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2025-09.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2025-04.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2025-04.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-10.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2024-10.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-05.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2024-05.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-11.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2024-11.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-09.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2024-09.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-04.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2024-04.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-10.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-10.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-11.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-11.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2022-03.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2022-03.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2022-06.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2022-06.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2022-05.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2022-05.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-08.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2024-08.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-07.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2024-07.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-02.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2024-02.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2022-11.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2022-11.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2022-08.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2022-08.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-01.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2024-01.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-09.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-09.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-02.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-02.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2022-01.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2022-01.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2025-06.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2025-06.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2025-03.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2025-03.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2025-07.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2025-07.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-12.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2024-12.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2025-08.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2025-08.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-03.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2024-03.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2025-02.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2025-02.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2024-06.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2024-06.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2025-01.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2025-01.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-05.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-05.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2022-10.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2022-10.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-03.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-03.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-12.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-12.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-06.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-06.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2022-04.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2022-04.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-04.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-04.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2022-12.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2022-12.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2022-09.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2022-09.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2022-07.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2022-07.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-08.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-08.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-07.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-07.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2022-02.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2022-02.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:



✅ Стандартизировано: s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-01.parquet -> s3a://silver/nyc-taxi-data/yellow_tripdata_2023-01.parquet
📊 Схема после стандартизации:
root
 |-- vendorid: integer (nullable = false)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: integer (nullable = false)
 |-- trip_distance: double (nullable = false)
 |-- ratecodeid: integer (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pulocationid: integer (nullable = false)
 |-- dolocationid: integer (nullable = false)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = false)
 |-- extra: double (nullable = false)
 |-- mta_tax: double (nullable = false)
 |-- tip_amount: double (nullable = false)
 |-- tolls_amount: double (nullable = false)
 |-- improvement_surcharge: double (nullable = false)
 |-- total_amount: double (nullable = false)
 |-- congestion_surcharge:

                                                                                

In [122]:
import time

start_time = time.time()


end_time = time.time()
execution_time = end_time - start_time

print(f"Время выполнения: {execution_time:.4f} секунд")


Время выполнения: 0.0000 секунд


In [119]:
bronze_files = spark.sql(f"""
    SELECT path 
    FROM (
        SELECT input_file_name() as path 
        FROM parquet.`s3a://bronze/nyc-taxi-data/`
    ) 
    GROUP BY path
""").collect()

25/10/31 20:03:32 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
25/10/31 20:03:32 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
25/10/31 20:03:36 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
25/10/31 20:03:36 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.18.0.8
25/10/31 20:03:36 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
25/10/31 20:03:36 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
25/10/31 20:03:36 WARN ObjectStore: Failed to get database parquet, returning NoSuchObjectException
                                                                                

In [24]:
# Конвертируем в Pandas для красивого отображения
import pandas as pd

def show_schema_pandas(df):
    schema_info = []
    for field in df.schema.fields:
        schema_info.append({
            'Column': field.name,
            'Type': field.dataType.simpleString(),
            'Nullable': field.nullable,
            'Metadata': field.metadata
        })
    
    schema_pd = pd.DataFrame(schema_info)
    print(schema_pd)

In [114]:
show_schema_pandas(df_2023_01)

                   Column           Type  Nullable Metadata
0                VendorID         bigint      True       {}
1    tpep_pickup_datetime  timestamp_ntz      True       {}
2   tpep_dropoff_datetime  timestamp_ntz      True       {}
3         passenger_count         double      True       {}
4           trip_distance         double      True       {}
5              RatecodeID         double      True       {}
6      store_and_fwd_flag         string      True       {}
7            PULocationID         bigint      True       {}
8            DOLocationID         bigint      True       {}
9            payment_type         bigint      True       {}
10            fare_amount         double      True       {}
11                  extra         double      True       {}
12                mta_tax         double      True       {}
13             tip_amount         double      True       {}
14           tolls_amount         double      True       {}
15  improvement_surcharge         double

In [118]:
show_schema_pandas(df_2023_01)

                   Column           Type  Nullable Metadata
0                vendorid            int      True       {}
1    tpep_pickup_datetime  timestamp_ntz      True       {}
2   tpep_dropoff_datetime  timestamp_ntz      True       {}
3         passenger_count            int      True       {}
4           trip_distance         double      True       {}
5              ratecodeid            int      True       {}
6      store_and_fwd_flag         string      True       {}
7            pulocationid            int      True       {}
8            dolocationid            int      True       {}
9            payment_type            int      True       {}
10            fare_amount         double      True       {}
11                  extra         double      True       {}
12                mta_tax         double      True       {}
13             tip_amount         double      True       {}
14           tolls_amount         double      True       {}
15  improvement_surcharge         double

In [None]:
show_schema_pandas(df_2025_01)

In [None]:
# Просто выводим схемы файлов для визуального сравнения
years = ['2023', '2024', '2025']

for year in years:
    print(f"\n{'='*20} {year} {'='*20}")
    try:
        df = spark.read.parquet(f"s3a://bronze/nyc-taxi-data/yellow_tripdata_{year}-01.parquet")
        df.printSchema()
    except Exception as e:
        print(f"Ошибка: {e}")

Отличная идея! Это правильный подход ETL - подготовить данные на уровне Silver. 

Вот обработчик:

```python
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType

def standardize_nyc_taxi_data(input_path, output_path):
    """
    Стандартизирует данные NYC Taxi:
    - Приводит все названия колонок к нижнему регистру
    - Приводит типы данных к унифицированным
    - Сохраняет в указанный путь
    """
    
    # Читаем исходные данные
    df = spark.read.parquet(input_path)
    
    # 1. Приводим все названия колонок к нижнему регистру
    for col_name in df.columns:
        df = df.withColumnRenamed(col_name, col_name.lower())
    
    # 2. Определяем маппинг типов для унификации
    type_mapping = {
        # Числовые идентификаторы - в Integer
        "vendorid": IntegerType(),
        "pulocationid": IntegerType(), 
        "dolocationid": IntegerType(),
        "payment_type": IntegerType(),
        "ratecodeid": IntegerType(),
        
        # Пассажиры - в Integer (более логично)
        "passenger_count": IntegerType(),
        
        # Денежные суммы - в Double (для точности)
        "fare_amount": DoubleType(),
        "extra": DoubleType(),
        "mta_tax": DoubleType(),
        "tip_amount": DoubleType(),
        "tolls_amount": DoubleType(),
        "improvement_surcharge": DoubleType(),
        "total_amount": DoubleType(),
        "congestion_surcharge": DoubleType(),
        "airport_fee": DoubleType(),
        "cbd_congestion_fee": DoubleType(),
        
        # Дистанция - в Double
        "trip_distance": DoubleType()
    }
    
    # 3. Применяем преобразование типов с обработкой ошибок
    for col_name, target_type in type_mapping.items():
        if col_name in df.columns:
            df = df.withColumn(
                col_name, 
                F.coalesce(
                    F.col(col_name).cast(target_type), 
                    F.lit(0 if target_type == IntegerType() else 0.0)
                )
            )
    
    # 4. Гарантируем порядок колонок для consistency
    expected_columns = [
        "vendorid", "tpep_pickup_datetime", "tpep_dropoff_datetime",
        "passenger_count", "trip_distance", "ratecodeid", "store_and_fwd_flag",
        "pulocationid", "dolocationid", "payment_type", "fare_amount", "extra",
        "mta_tax", "tip_amount", "tolls_amount", "improvement_surcharge",
        "total_amount", "congestion_surcharge", "airport_fee", "cbd_congestion_fee"
    ]
    
    # Выбираем только существующие колонки в правильном порядке
    final_columns = [col for col in expected_columns if col in df.columns]
    df_standardized = df.select(final_columns)
    
    # 5. Сохраняем с оптимальными настройками
    (df_standardized
     .write
     .mode("overwrite")
     .option("compression", "snappy")  # хороший баланс скорость/сжатие
     .parquet(output_path)
    )
    
    print(f"✅ Стандартизировано: {input_path} -> {output_path}")
    print(f"📊 Схема после стандартизации:")
    df_standardized.printSchema()
    
    return df_standardized

# Пример использования для одного файла
input_path = "s3a://bronze/nyc-taxi-data/yellow_tripdata_2023-01.parquet"
output_path = "s3a://silver/nyc-taxi-data/yellow_tripdata_2023-01.parquet"

standardized_df = standardize_nyc_taxi_data(input_path, output_path)
```

**Улучшенная версия для обработки всех файлов:**

```python
def process_all_nyc_taxi_files():
    """Обрабатывает все файлы NYC Taxi из Bronze в Silver"""
    
    # Получаем список всех parquet файлов в bronze
    bronze_files = spark.sql(f"""
        SELECT path 
        FROM (
            SELECT input_file_name() as path 
            FROM parquet.`s3a://bronze/nyc-taxi-data/`
        ) 
        GROUP BY path
    """).collect()
    
    print(f"📁 Найдено файлов для обработки: {len(bronze_files)}")
    
    for i, row in enumerate(bronze_files, 1):
        input_path = row['path']
        # Извлекаем имя файла из пути
        file_name = input_path.split('/')[-1]
        output_path = f"s3a://silver/nyc-taxi-data/{file_name}"
        
        print(f"🔄 Обрабатываю ({i}/{len(bronze_files)}): {file_name}")
        
        try:
            standardize_nyc_taxi_data(input_path, output_path)
        except Exception as e:
            print(f"❌ Ошибка при обработке {file_name}: {e}")
    
    print("🎉 Все файлы обработаны!")

# Запускаем обработку всех файлов
process_all_nyc_taxi_files()
```

**Преимущества подхода:**

1. ✅ **Единая схема** - все файлы имеют одинаковые названия колонок и типы
2. ✅ **Простота чтения** - `spark.read.parquet("s3a://silver/nyc-taxi-data/")` просто работает
3. ✅ **Производительность** - предсказуемые типы данных улучшают performance
4. ✅ **Стабильность** - никаких внезапных падений из-за evolving schemas

**Типы данных выбраны так:**
- **Integer** - для счетчиков и ID (пассажиры, локации, вендоры)
- **Double** - для денежных сумм и дробных значений (более точный чем Float)
- **String/Timestamp** - остаются как есть

После этого можно будет безопасно делать:
```python
df = spark.read.parquet("s3a://silver/nyc-taxi-data/")
# И все просто работает!
```