# **Импорт библиотек**

In [1]:
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.functions as F
import logging
import os
from airflow.models import Variable
from dotenv import load_dotenv
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DataType
import requests
import json
import boto3
import minio
import mimesis
from mimesis import Person, Generic, Address, Finance, Datetime, Choice
from mimesis.locales import Locale
from mimesis import Code
from mimesis.enums import TimestampFormat
import datetime
import logging

# **Create SparkSession**

In [2]:
# Configure logging for the notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Load variables from .env and verify that the MinIO settings are present.
load_dotenv()

# Fail early with a clear message instead of handing None to the Spark config.
required_env = ("MINIO_ACCESS_KEY", "MINIO_SECRET_KEY", "MINIO_ENDPOINT")
missing_env = [name for name in required_env if not os.getenv(name)]
if missing_env:
    raise RuntimeError(
        f"Missing required environment variables: {', '.join(missing_env)}"
    )

# Jars needed for S3A access and the Iceberg runtime. All paths are absolute;
# the first one previously lacked its leading slash and was therefore resolved
# relative to the current working directory.
spark_jars = ",".join([
    "/home/jovyan/jars/hadoop-aws-3.3.1.jar",
    "/home/jovyan/jars/aws-java-sdk-bundle-1.11.901.jar",
    "/home/jovyan/jars/iceberg-spark-runtime-3.5_2.12-1.5.0.jar",
])

try:
    # Create the SparkSession: S3A settings point at MinIO, and an Iceberg
    # catalog named "nessie" is registered against the Nessie server.
    spark = (
        SparkSession.builder
        .appName("MinIO Data Reader")
        .config("spark.hadoop.fs.s3a.access.key", os.getenv("MINIO_ACCESS_KEY"))
        .config("spark.hadoop.fs.s3a.secret.key", os.getenv("MINIO_SECRET_KEY"))
        .config("spark.hadoop.fs.s3a.endpoint", os.getenv("MINIO_ENDPOINT"))
        .config("spark.hadoop.fs.s3a.path.style.access", "true")
        .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
        .config("spark.jars", spark_jars)
        .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
        .config("spark.sql.catalog.nessie.type", "nessie")
        .config("spark.sql.catalog.nessie.uri", "http://nessie-server:19120/api/v2")
        .config("spark.sql.catalog.nessie.ref", "main")
        .config("spark.sql.catalog.nessie.warehouse", "s3a://datalake/warehouse")
        .config("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO")
        .getOrCreate()
    )

    logging.info('SparkSession успешно создана для работы с MinIO')
except Exception as e:
    # Log with traceback, then re-raise: every later cell needs `spark`, so
    # continuing would only surface as an unrelated NameError further down.
    logging.exception('SparkSession не создана по причине: %s', e)
    raise


[2025-08-29T23:32:46.505+0000] {1154948182.py:31} INFO - SparkSession успешно создана для работы с MinIO


In [3]:
# List the namespaces currently present in the "nessie" catalog.
namespaces_df = spark.sql("SHOW NAMESPACES IN nessie")
namespaces_df.show()

+---------+
|namespace|
+---------+
+---------+



In [None]:
# Location of the source CSV file.
path = "transactions.csv"

# Explicit schema for the CSV columns, in file order; every column is a string
# except the transaction amount, which is a double.
schema = (
    StructType()
    .add("Customer ID", StringType())
    .add("Transaction Date", StringType())
    .add("Product Purchased", StringType())
    .add("Transaction Amount", DoubleType())
    .add("Payment Method", StringType())
    .add("Transaction Status", StringType())
    .add("Transaction Type", StringType())
)

# Read the comma-separated file with the schema above, treating the first
# line as a header row.
csv_reader = spark.read.schema(schema)
transactions_df = csv_reader.csv(path, sep=",", header=True)

# Preview the first 10 rows.
transactions_df.show(n=10)

In [None]:
# Print the DataFrame's column names and types to check the applied schema.
transactions_df.printSchema()

# **Запись в S3**

In [None]:
# Destination inside the "datalake" bucket, addressed through the S3A connector.
output_path = "s3a://datalake/transactions.parquet"

# Write the DataFrame as Parquet, replacing any data already at that path.
parquet_writer = transactions_df.write
parquet_writer.parquet(output_path, mode="overwrite")

# **Проверка файлов в бакете**

In [None]:
# Build the boto3 endpoint URL from the same MINIO_ENDPOINT variable used above.
raw_endpoint = os.getenv("MINIO_ENDPOINT")
if not raw_endpoint:
    # Without this guard a missing variable fails on None.startswith below.
    raise RuntimeError("MINIO_ENDPOINT is not set; cannot build the S3 endpoint URL")

# Prepend a scheme when the variable holds only host:port.
if raw_endpoint.startswith(('http://', 'https://')):
    endpoint_url = raw_endpoint
else:
    endpoint_url = 'http://' + raw_endpoint

s3_creds = {
    "aws_access_key_id": os.getenv("MINIO_ACCESS_KEY"),
    "aws_secret_access_key": os.getenv("MINIO_SECRET_KEY"),
    "endpoint_url": endpoint_url
}

s3 = boto3.client("s3", **s3_creds)

my_bucket = "datalake"

# A single list_objects_v2 call returns at most 1000 keys, so walk every page
# to list the whole bucket.
paginator = s3.get_paginator("list_objects_v2")

for response in paginator.paginate(Bucket=my_bucket):
    # 'Contents' is absent from a page with no objects (e.g. an empty bucket).
    for obj in response.get('Contents', []):
        print(obj['Key'])