# Migrating database into MinIO bucket

This notebook copies the tables of the MySQL `finance` database into the MinIO `rootdb` bucket, storing each table in Delta format.

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month

In [2]:
# Configuration variables
# Every setting can be overridden with an environment variable of the same
# name. The literal fallbacks keep the notebook runnable against the original
# hosts/credentials when no override is set; real secrets should be supplied
# through the environment rather than committed in the notebook.
MINIO_ENDPOINT = os.environ.get('MINIO_ENDPOINT', 'http://minio:9000')
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY', 'minioadmin')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY', 'minioadmin123')
MYSQL_HOST = os.environ.get('MYSQL_HOST', 'mysql')
MYSQL_PORT = os.environ.get('MYSQL_PORT', '3306')
MYSQL_DATABASE = os.environ.get('MYSQL_DATABASE', 'finance')
MYSQL_USER = os.environ.get('MYSQL_USER', 'root')
MYSQL_PASSWORD = os.environ.get('MYSQL_PASSWORD', 'root123')

# Derived configurations
# JDBC URL and connection properties passed to spark.read.jdbc() in the
# migration cells.
mysql_url = f"jdbc:mysql://{MYSQL_HOST}:{MYSQL_PORT}/{MYSQL_DATABASE}"
mysql_properties = {
    "user": MYSQL_USER,
    "password": MYSQL_PASSWORD,
    "driver": "com.mysql.cj.jdbc.Driver"
}

In [3]:
# Maven coordinates fetched at session start-up (S3A connector, AWS SDK,
# Delta Lake and the MySQL JDBC driver).
# NOTE(review): the captured resolution log shows hadoop-aws 3.4.1 also pulling
# software.amazon.awssdk#bundle - confirm whether the explicit
# aws-java-sdk-bundle entry is still required.
spark_packages = ",".join([
    "org.apache.hadoop:hadoop-aws:3.4.1",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
    "io.delta:delta-spark_2.13:4.0.0",
    "com.mysql:mysql-connector-j:8.0.33",
])

# Session settings, applied in this order.
spark_settings = {
    # Memory configurations for large datasets
    "spark.driver.memory": "4g",
    "spark.executor.memory": "4g",
    "spark.driver.maxResultSize": "2g",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    # Jars
    "spark.jars.packages": spark_packages,
    # Delta Lake
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    # MinIO (S3A)
    "spark.hadoop.fs.s3a.endpoint": MINIO_ENDPOINT,
    "spark.hadoop.fs.s3a.access.key": MINIO_ACCESS_KEY,
    "spark.hadoop.fs.s3a.secret.key": MINIO_SECRET_KEY,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # S3A performance configs
    "spark.hadoop.fs.s3a.connection.timeout": "60000",
    "spark.hadoop.fs.s3a.connection.request.timeout": "60000",
    "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
    "spark.hadoop.fs.s3a.attempts.maximum": "3",
    "spark.hadoop.fs.s3a.retry.limit": "3",
}

builder = SparkSession.builder.appName("MinIO-Delta")
for setting_key, setting_value in spark_settings.items():
    builder = builder.config(setting_key, setting_value)

spark = builder.getOrCreate()



:: loading settings :: url = jar:file:/home/airflow/.local/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/airflow/.ivy2.5.2/cache
The jars for the packages stored in: /home/airflow/.ivy2.5.2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
io.delta#delta-spark_2.13 added as a dependency
com.mysql#mysql-connector-j added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f8bf1498-9e25-4d02-aba5-edd7e431c543;1.0
	confs: [default]


	found org.apache.hadoop#hadoop-aws;3.4.1 in central
	found software.amazon.awssdk#bundle;2.24.6 in central
	found org.wildfly.openssl#wildfly-openssl;1.1.3.Final in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
	found com.mysql#mysql-connector-j;8.0.33 in central
	found com.google.protobuf#protobuf-java;3.21.9 in central


:: resolution report :: resolve 361ms :: artifacts dl 15ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	com.google.protobuf#protobuf-java;3.21.9 from central in [default]
	com.mysql#mysql-connector-j;8.0.33 from central in [default]
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	org.apache.hadoop#hadoop-aws;3.4.1 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.1.3.Final from central in [default]
	software.amazon.awssdk#bundle;2.24.6 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   9   |   0   |   0   |   0   ||   9   |

25/10/10 04:41:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Table migration configurations
def _table_entry(path, partitions=None):
    """Build one tables_config entry: target path plus optional partition columns."""
    return {'path': path, 'partitions': partitions}


# Maps each source table name to its destination in the rootdb bucket.
tables_config = dict(
    users=_table_entry('s3a://rootdb/users/'),
    mcc_codes=_table_entry('s3a://rootdb/mcc_codes/'),
    # Partition by brand for better queries
    cards=_table_entry('s3a://rootdb/cards/', ['card_brand']),
    # Partition by date for performance
    transactions=_table_entry('s3a://rootdb/transactions/', ['year', 'month']),
    fraud_labels=_table_entry('s3a://rootdb/fraud_labels/'),
)

# Starts empty; nothing in the visible cells writes to it.
migration_summary = dict()

In [5]:
# Migrate users table
# Reads the MySQL `users` table over JDBC and writes it, unpartitioned, as a
# Delta table. Errors are caught and printed rather than raised, so the cell
# itself never fails.
try:
    df = spark.read.jdbc(url=mysql_url, table='users', properties=mysql_properties)
    row_count = df.count()

    if row_count > 0:
        # Overwrite mode replaces whatever is already stored at the target path.
        df.write.format("delta").mode("overwrite").save('s3a://rootdb/users/')
        print(f"users: {row_count} rows migrated successfully")
    else:
        # Nothing is written for an empty source, so it must not be reported
        # as a successful migration.
        print("users: Source table is empty - nothing written")

except Exception as e:
    print(f"users: Migration failed - {e}")

25/10/10 04:42:01 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.


[Stage 3:>                                                          (0 + 1) / 1]

25/10/10 04:42:04 WARN S3ABlockOutputStream: Application invoked the Syncable API against stream writing to users/part-00000-09bcd3d8-d567-4bf9-9fd1-93d0c88f89c0-c000.snappy.parquet. This is Unsupported
                                                                                

users: 2000 rows migrated successfully


In [6]:
# Migrate mcc_codes table
# Reads the MySQL `mcc_codes` table over JDBC and writes it, unpartitioned, as
# a Delta table. Errors are caught and printed rather than raised, so the cell
# itself never fails.
try:
    df = spark.read.jdbc(url=mysql_url, table='mcc_codes', properties=mysql_properties)
    row_count = df.count()

    if row_count > 0:
        # Overwrite mode replaces whatever is already stored at the target path.
        df.write.format("delta").mode("overwrite").save('s3a://rootdb/mcc_codes/')
        print(f"mcc_codes: {row_count} rows migrated successfully")
    else:
        # Nothing is written for an empty source, so it must not be reported
        # as a successful migration.
        print("mcc_codes: Source table is empty - nothing written")

except Exception as e:
    print(f"mcc_codes: Migration failed - {e}")

mcc_codes: 109 rows migrated successfully


In [7]:
# Migrate cards table (partitioned by card_brand)
# Reads the MySQL `cards` table over JDBC and writes it as a Delta table with
# one partition directory per card_brand value. Errors are caught and printed
# rather than raised, so the cell itself never fails.
try:
    df = spark.read.jdbc(url=mysql_url, table='cards', properties=mysql_properties)
    row_count = df.count()

    if row_count > 0:
        # Overwrite mode replaces whatever is already stored at the target path.
        df.write.format("delta").mode("overwrite").partitionBy('card_brand').save('s3a://rootdb/cards/')
        print(f"✓ cards: {row_count} rows migrated successfully")
    else:
        # Nothing is written for an empty source, so it must not be reported
        # as a successful migration.
        print("cards: Source table is empty - nothing written")

except Exception as e:
    print(f"cards: Migration failed - {e}")

[Stage 11:>                                                         (0 + 1) / 1]

                                                                                

✓ cards: 6146 rows migrated successfully


In [8]:
# Migrate transactions table
# Reads the MySQL `transactions` table with a partitioned JDBC scan, derives
# year/month partition columns from trans_date, and writes the result as a
# Delta table partitioned by (year, month). Errors are caught and printed
# rather than raised, so the cell itself never fails.
try:
    # Use JDBC partitioning for large table.
    # lowerBound/upperBound only define how the transaction_id range is split
    # into numPartitions queries; they do not filter rows.
    df = spark.read.jdbc(
        url=mysql_url,
        table='transactions',
        properties=mysql_properties,
        column='transaction_id',
        lowerBound=1,
        upperBound=14000000,
        numPartitions=40
    )

    # Add year and month columns for partitioning
    df = df.withColumn('year', year('trans_date')).withColumn('month', month('trans_date'))
    # coalesce() does not shuffle, so the 40 JDBC partitions are consumed by
    # 5 tasks.
    # NOTE(review): this also caps read concurrency at 5 - confirm intended.
    df = df.coalesce(5)

    # The DataFrame is lazy and uncached: calling count() before write() would
    # run the full JDBC scan twice. isEmpty() only needs to find one row.
    if df.isEmpty():
        row_count = 0
        # Nothing is written for an empty source, so it must not be reported
        # as a successful migration.
        print("transactions: Source table is empty - nothing written")
    else:
        # Overwrite mode replaces whatever is already stored at the target path.
        df.write.format("delta").mode("overwrite").partitionBy('year', 'month').save('s3a://rootdb/transactions/')
        # Count what actually landed in the Delta table instead of re-reading MySQL.
        row_count = spark.read.format("delta").load('s3a://rootdb/transactions/').count()
        print(f"transactions: {row_count} rows migrated successfully")

except Exception as e:
    print(f"transactions: Migration failed - {e}")







                                                                                







25/10/10 04:42:58 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.


25/10/10 04:43:00 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.


25/10/10 04:43:02 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.


25/10/10 04:43:04 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.


25/10/10 04:43:06 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.


25/10/10 04:43:08 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.


25/10/10 04:43:10 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.


25/10/10 04:43:12 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.


25/10/10 04:43:13 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.


25/10/10 04:43:15 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.


25/10/10 04:43:17 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.


25/10/10 04:43:19 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.


25/10/10 04:43:20 WARN TaskMemoryManager: Failed to allocate a page (134217728 bytes), try again.




                                                                                

transactions: 13305915 rows migrated successfully


In [9]:
# Migrate fraud_labels table (large table with JDBC partitioning)
# Reads the MySQL `fraud_labels` table with a partitioned JDBC scan and writes
# it, unpartitioned, as a Delta table. Errors are caught and printed rather
# than raised, so the cell itself never fails.
# NOTE(review): the recorded run of this cell failed with
# "Table 'finance.fraud_labels' doesn't exist" - confirm the source table name.
try:
    # Use JDBC partitioning for large table.
    # lowerBound/upperBound only define how the transaction_id range is split
    # into numPartitions queries; they do not filter rows.
    df = spark.read.jdbc(
        url=mysql_url,
        table='fraud_labels',
        properties=mysql_properties,
        column='transaction_id',
        lowerBound=1,
        upperBound=14000000,
        numPartitions=40
    )

    # coalesce() does not shuffle, so the 40 JDBC partitions are consumed by
    # 5 tasks.
    # NOTE(review): this also caps read concurrency at 5 - confirm intended.
    df = df.coalesce(5)

    # The DataFrame is lazy and uncached: calling count() before write() would
    # run the full JDBC scan twice. isEmpty() only needs to find one row.
    if df.isEmpty():
        row_count = 0
        # Nothing is written for an empty source, so it must not be reported
        # as a successful migration.
        print("fraud_labels: Source table is empty - nothing written")
    else:
        # Overwrite mode replaces whatever is already stored at the target path.
        df.write.format("delta").mode("overwrite").save('s3a://rootdb/fraud_labels/')
        # Count what actually landed in the Delta table instead of re-reading MySQL.
        row_count = spark.read.format("delta").load('s3a://rootdb/fraud_labels/').count()
        print(f"fraud_labels: {row_count} rows migrated successfully")

except Exception as e:
    print(f"fraud_labels: Migration failed - {e}")

fraud_labels: Migration failed - An error occurred while calling o104.jdbc.
: java.sql.SQLSyntaxErrorException: Table 'finance.fraud_labels' doesn't exist
	at com.mysql.cj.jdbc.exceptions.SQLError.createSQLException(SQLError.java:121)
	at com.mysql.cj.jdbc.exceptions.SQLExceptionsMapping.translateException(SQLExceptionsMapping.java:122)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeInternal(ClientPreparedStatement.java:916)
	at com.mysql.cj.jdbc.ClientPreparedStatement.executeQuery(ClientPreparedStatement.java:972)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.$anonfun$getQueryOutputSchema$2(JDBCRDD.scala:70)
	at scala.util.Using$.resource(Using.scala:296)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.$anonfun$getQueryOutputSchema$1(JDBCRDD.scala:68)
	at scala.util.Using$.resource(Using.scala:296)
	at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.getQueryOutputSchema(JDBCRDD.scala:67)
	at org.apache.spark.sql.execution.datasources.jdbc.JD