# **Импорт библиотек**

In [1]:
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F
import logging
import os
from airflow.models import Variable
from dotenv import load_dotenv
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DataType
import requests
import json
import boto3
import minio
import mimesis
from mimesis import Person, Generic, Address, Finance, Datetime, Choice
from mimesis.locales import Locale
from mimesis import Code
from mimesis.enums import TimestampFormat
import datetime
import logging

# **Create SparkSession**

In [2]:
# Настройка логирования
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Загрузка и проверка переменных окружения
load_dotenv()

try:
    # Создание SparkSession
    spark = SparkSession.builder \
        .appName("MinIO Data Reader") \
        .config("spark.hadoop.fs.s3a.access.key", os.getenv("MINIO_ACCESS_KEY")) \
        .config("spark.hadoop.fs.s3a.secret.key", os.getenv("MINIO_SECRET_KEY")) \
        .config("spark.hadoop.fs.s3a.endpoint", os.getenv("MINIO_ENDPOINT")) \
        .config("spark.hadoop.fs.s3a.path.style.access", "true") \
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
        .config("spark.jars", "/home/jovyan/jars/hadoop-aws-3.3.1.jar,/home/jovyan/jars/aws-java-sdk-bundle-1.11.901.jar,/home/jovyan/jars/iceberg-spark-runtime-3.5_2.12-1.5.0.jar") \
        .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog") \
        .config("spark.sql.catalog.nessie.type", "nessie") \
        .config("spark.sql.catalog.nessie.uri", "http://nessie-server:19120/api/v2") \
        .config("spark.sql.catalog.nessie.ref", "main") \
        .config("spark.sql.catalog.nessie.warehouse", "s3a://datalake/warehouse") \
        .config("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO") \
        .getOrCreate()

    
    logging.info('SparkSession успешно создана для работы с MinIO')
except Exception as e:
    logging.error(f'SparkSession не создана по причине: {e}')


[[34m2025-09-08T20:57:17.742+0000[0m] {[34m176353067.py:[0m31} INFO[0m - SparkSession успешно создана для работы с MinIO[0m


# **Проверяем каталог Nessie**

In [13]:
spark.sql("SHOW NAMESPACES IN nessie").show()

+---------+
|namespace|
+---------+
|      dev|
+---------+



# **Создаем namespace в каталоге Nessie**

In [None]:
spark.sql("CREATE NAMESPACE IF NOT EXISTS nessie.dev").show()

In [None]:
path = "transactions.csv"
schema = StructType(fields=[
        StructField("Customer ID", StringType()),
        StructField("Transaction Date", StringType()),
        StructField("Product Purchased", StringType()),
        StructField("Transaction Amount", DoubleType()),
        StructField("Payment Method", StringType()),
        StructField("Transaction Status", StringType()),
        StructField("Transaction Type", StringType())
])
transactions_df = spark.read.csv(path, schema=schema, sep=",", header=True)
transactions_df.show(10)

In [None]:
transactions_df.printSchema()

# **Запись в S3**

In [None]:
output_path = "s3a://datalake/transactions.parquet"
transactions_df.write.mode("overwrite").parquet(output_path)

# **Проверка файлов в бакете**

In [None]:
raw_endpoint = os.getenv("MINIO_ENDPOINT")

if not raw_endpoint.startswith(('http://', 'https://')):
    endpoint_url = 'http://' + raw_endpoint
else:
    endpoint_url = raw_endpoint
    

s3_creds = {
    "aws_access_key_id": os.getenv("MINIO_ACCESS_KEY"),
    "aws_secret_access_key": os.getenv("MINIO_SECRET_KEY"),
    "endpoint_url": endpoint_url
}

s3 = boto3.client("s3", **s3_creds)

my_bucket = "datalake"

response = s3.list_objects_v2(Bucket=my_bucket)

for obj in response.get('Contents', []):
    print(obj['Key'])

# **Чтение из S3**

In [3]:
s3_path = "s3a://datalake/transactions.parquet/"
df_transactions = spark.read.parquet(s3_path)
df_transactions.show(10)

+--------------------+----------------+-----------------+------------------+--------------+------------------+----------------+
|         Customer ID|Transaction Date|Product Purchased|Transaction Amount|Payment Method|Transaction Status|Transaction Type|
+--------------------+----------------+-----------------+------------------+--------------+------------------+----------------+
|7810b819-f86d-498...|      2025-03-05|          quickly|417.52795241907484|          Cash|         Completed|        In-Store|
|c1426052-e306-459...|      2025-03-19|         material|174.45190084753798|        PayPal|            Failed|          Online|
|99a40289-d53d-4e2...|      2025-04-09|           travel|28.593285457686974|     Gift Card|            Failed|        In-Store|
|984e8efc-9a2f-437...|      2025-02-02|              cut|  78.3398992520693|        PayPal|            Failed|        In-Store|
|741d0bea-1f72-435...|      2025-04-13|              she| 288.2127996249924|        PayPal|           Pe

In [None]:
df_transactions.printSchema()

# **Добавим новую колонку dt с данными из колонки Transaction Date и приведем ее к типу Date**

In [4]:
new_df_transactions = df_transactions.withColumn("dt", F.to_date("Transaction Date", "yyyy-MM-dd"))

In [5]:
new_df_transactions.show(10)

+--------------------+----------------+-----------------+------------------+--------------+------------------+----------------+----------+
|         Customer ID|Transaction Date|Product Purchased|Transaction Amount|Payment Method|Transaction Status|Transaction Type|        dt|
+--------------------+----------------+-----------------+------------------+--------------+------------------+----------------+----------+
|7810b819-f86d-498...|      2025-03-05|          quickly|417.52795241907484|          Cash|         Completed|        In-Store|2025-03-05|
|c1426052-e306-459...|      2025-03-19|         material|174.45190084753798|        PayPal|            Failed|          Online|2025-03-19|
|99a40289-d53d-4e2...|      2025-04-09|           travel|28.593285457686974|     Gift Card|            Failed|        In-Store|2025-04-09|
|984e8efc-9a2f-437...|      2025-02-02|              cut|  78.3398992520693|        PayPal|            Failed|        In-Store|2025-02-02|
|741d0bea-1f72-435...|     

In [None]:
new_df_transactions.printSchema()

In [10]:
new_df_transactions.count()

100000

# **Проверка на NULL**

In [8]:
new_df_transactions.filter(new_df_transactions["dt"].isNotNull()) \
                   .count()

100000

In [9]:
new_df_transactions.filter(new_df_transactions["dt"].isNull()) \
                   .count()

0

# **MIN/MAX по дате**

In [6]:
new_df_transactions.agg(F.min("dt")) \
                   .show()

+----------+
|   min(dt)|
+----------+
|2025-01-01|
+----------+



In [7]:
new_df_transactions.agg(F.max("dt")) \
                   .show()

+----------+
|   max(dt)|
+----------+
|2025-05-18|
+----------+



In [None]:
new_df_transactions.agg(F.max("Transaction Date")) \
                   .show()

# **Проверка, что месяц в Transaction Date не равен месяцу в dt**

In [11]:
new_df_transactions.filter(F.month("Transaction Date") != F.month("dt")) \
                   .show()

+-----------+----------------+-----------------+------------------+--------------+------------------+----------------+---+
|Customer ID|Transaction Date|Product Purchased|Transaction Amount|Payment Method|Transaction Status|Transaction Type| dt|
+-----------+----------------+-----------------+------------------+--------------+------------------+----------------+---+
+-----------+----------------+-----------------+------------------+--------------+------------------+----------------+---+



# **Создание iceberg таблицы в каталоге Nessie**

In [18]:
spark.sql("""CREATE TABLE nessie.dev.transactions_sample (
                        customer_id INT, 
                        transaction_date DATE, 
                        product_purchased VARCHAR(128), 
                        transaction_amount DECIMAL, 
                        payment_method VARCHAR(128), 
                        transaction_status VARCHAR(128), 
                        transaction_type VARCHAR(128), 
                        dt DATE
                        )
            USING iceberg 
            PARTITIONED BY (months(dt));
""")

DataFrame[]

In [19]:
spark.sql("SHOW TABLES IN nessie").show()

+---------+-------------------+-----------+
|namespace|          tableName|isTemporary|
+---------+-------------------+-----------+
|      dev|transactions_sample|      false|
+---------+-------------------+-----------+



In [20]:
spark.sql("DESCRIBE TABLE nessie.dev.transactions_sample").show()

+------------------+-------------+-------+
|          col_name|    data_type|comment|
+------------------+-------------+-------+
|       customer_id|          int|   NULL|
|  transaction_date|         date|   NULL|
| product_purchased|       string|   NULL|
|transaction_amount|decimal(10,0)|   NULL|
|    payment_method|       string|   NULL|
|transaction_status|       string|   NULL|
|  transaction_type|       string|   NULL|
|                dt|         date|   NULL|
|                  |             |       |
|    # Partitioning|             |       |
|            Part 0|   months(dt)|       |
+------------------+-------------+-------+



# **Проверить список Jars**

In [None]:
print(spark.sparkContext._jsc.sc().listJars())

# **Узнать версию Hadoop-клиента**

In [None]:
print(spark.sparkContext._jvm.org.apache.hadoop.util.VersionInfo.getVersion())