# Migrating database into MinIO bucket

This notebook loads the root database into the MinIO `rootdb` bucket.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import year, month

In [None]:
# Maven coordinates passed to spark.jars.packages: the hadoop-aws connector,
# the AWS SDK bundle, Delta Lake and the MySQL JDBC driver.
jar_packages = ",".join(
    [
        "org.apache.hadoop:hadoop-aws:3.4.1",
        "com.amazonaws:aws-java-sdk-bundle:1.12.262",
        "io.delta:delta-spark_2.13:4.0.0",
        "com.mysql:mysql-connector-j:8.0.33",
    ]
)

# Session settings, applied to the builder in exactly this order.
# NOTE(review): the MinIO access/secret keys are hard-coded here.
spark_settings = {
    # Memory configurations for large datasets
    "spark.driver.memory": "4g",
    "spark.executor.memory": "4g",
    "spark.driver.maxResultSize": "2g",
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    # Jars
    "spark.jars.packages": jar_packages,
    # Delta Lake
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    # MinIO (S3A) - plain-HTTP endpoint, so SSL is switched off further down
    "spark.hadoop.fs.s3a.endpoint": "http://localhost:9900",
    "spark.hadoop.fs.s3a.access.key": "minioadmin",
    "spark.hadoop.fs.s3a.secret.key": "minioadmin123",
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # S3A performance configs
    "spark.hadoop.fs.s3a.connection.timeout": "60000",
    "spark.hadoop.fs.s3a.connection.request.timeout": "60000",
    "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
    "spark.hadoop.fs.s3a.attempts.maximum": "3",
    "spark.hadoop.fs.s3a.retry.limit": "3",
}

builder = SparkSession.builder.appName("MinIO-Delta")
for setting_key, setting_value in spark_settings.items():
    builder = builder.config(setting_key, setting_value)

spark = builder.getOrCreate()

In [None]:
# MySQL source: JDBC URL plus the properties handed to spark.read.jdbc.
mysql_url = "jdbc:mysql://localhost:30306/finance"
mysql_properties = dict(
    user="root",
    password="root123",
    driver="com.mysql.cj.jdbc.Driver",
)

# Partition columns per table; None means the table is written unpartitioned.
_partition_columns = {
    "users": None,
    "mcc_codes": None,
    "cards": ["card_brand"],  # Partition by brand for better queries
    "transactions": ["year", "month"],  # Partition by date for performance
    "fraud_labels": None,
}

# Target layout: every table lives under its own prefix in the rootdb bucket.
tables_config = {
    table: {"path": f"s3a://rootdb/{table}/", "partitions": columns}
    for table, columns in _partition_columns.items()
}

# Per-table outcome messages; starts empty.
migration_summary = {}

In [None]:
# from pyspark.sql.functions import year, month

# # Define partitioning configs for large tables with smaller partitions
# jdbc_partition_configs = {
#     'transactions': {
#         'partitionColumn': 'transaction_id',
#         'lowerBound': 1,
#         'upperBound': 14000000,
#         'numPartitions': 40
#     },
#     'fraud_labels': {
#         'partitionColumn': 'transaction_id', 
#         'lowerBound': 1,
#         'upperBound': 14000000,
#         'numPartitions': 40
#     }
# }

# for table, config in tables_config.items():
#     try:
#         if table in jdbc_partition_configs:
#             partition_config = jdbc_partition_configs[table]
#             df = spark.read.jdbc(
#                 url=mysql_url, 
#                 table=table, 
#                 properties=mysql_properties,
#                 column=partition_config['partitionColumn'],
#                 lowerBound=partition_config['lowerBound'],
#                 upperBound=partition_config['upperBound'],
#                 numPartitions=partition_config['numPartitions']
#             )
#         else:
#             df = spark.read.jdbc(url=mysql_url, table=table, properties=mysql_properties)
        
#         # Add partitioning columns for transactions table
#         if table == 'transactions':
#             df = df.withColumn('year', year('trans_date')).withColumn('month', month('trans_date'))
#             df = df.coalesce(5)
#         elif table == 'fraud_labels':
#             df = df.coalesce(5)
        
#         row_count = df.count()
        
#         if row_count > 0:
#             writer = df.write.format("delta").mode("overwrite")
            
#             # Add partitioning if configured
#             if config['partitions']:
#                 writer = writer.partitionBy(*config['partitions'])
            
#             # Write to MinIO 
#             writer.save(config['path'])
#             migration_summary[table] = f'Success - {row_count} rows migrated'
#             print(f"{table}: {row_count} rows migrated successfully")
#         else:
#             migration_summary[table] = 'Success - Empty table (0 rows)'
#             print(f"{table}: Empty table migrated successfully")
            
#     except Exception as e:
#         migration_summary[table] = f'Failed: {str(e)}'
#         print(f"{table}: Migration failed - {str(e)}")

In [None]:
# df = spark.read.format("delta").load("s3a://rootdb/transactions/").where("year = 2019 AND month = 1")
# df.show(5)

In [None]:
# Migrate users table: JDBC read from MySQL, Delta overwrite in MinIO.
# Any failure is caught and reported with print instead of being re-raised.
try:
    df = spark.read.jdbc(mysql_url, "users", properties=mysql_properties)
    row_count = df.count()

    if row_count == 0:
        # Nothing is written for an empty source table.
        print("users: Empty table migrated successfully")
    else:
        writer = df.write.format("delta").mode("overwrite")
        writer.save("s3a://rootdb/users/")
        print(f"users: {row_count} rows migrated successfully")

except Exception as exc:
    print(f"users: Migration failed - {exc}")

In [None]:
# Migrate mcc_codes table: JDBC read from MySQL, Delta overwrite in MinIO.
# Any failure is caught and reported with print instead of being re-raised.
try:
    df = spark.read.jdbc(mysql_url, "mcc_codes", properties=mysql_properties)
    row_count = df.count()

    if row_count == 0:
        # Nothing is written for an empty source table.
        print("mcc_codes: Empty table migrated successfully")
    else:
        writer = df.write.format("delta").mode("overwrite")
        writer.save("s3a://rootdb/mcc_codes/")
        print(f"mcc_codes: {row_count} rows migrated successfully")

except Exception as exc:
    print(f"mcc_codes: Migration failed - {exc}")

In [None]:
# Migrate cards table (partitioned by card_brand)
# Reads `cards` from MySQL over JDBC and overwrites the Delta table at
# s3a://rootdb/cards/. Any failure is caught and reported with print instead
# of being re-raised.
try:
    df = spark.read.jdbc(url=mysql_url, table='cards', properties=mysql_properties)
    row_count = df.count()

    if row_count > 0:
        df.write.format("delta").mode("overwrite").partitionBy('card_brand').save('s3a://rootdb/cards/')
        # The message used to start with a mis-encoded check mark; it now
        # matches the format printed by the other migration cells.
        print(f"cards: {row_count} rows migrated successfully")
    else:
        # Nothing is written for an empty source table.
        print("cards: Empty table migrated successfully")

except Exception as e:
    print(f"cards: Migration failed - {e}")

In [None]:
# Migrate transactions table: partitioned JDBC read from MySQL, Delta
# overwrite in MinIO partitioned by year/month. Any failure is caught and
# reported with print instead of being re-raised.
try:
    # Use JDBC partitioning for large table (numPartitions=40 over transaction_id)
    jdbc_split = {
        "column": "transaction_id",
        "lowerBound": 1,
        "upperBound": 14000000,
        "numPartitions": 40,
    }
    df = spark.read.jdbc(
        mysql_url,
        "transactions",
        properties=mysql_properties,
        **jdbc_split,
    )

    # Derive the year and month partition columns from trans_date.
    df = df.withColumn("year", year("trans_date"))
    df = df.withColumn("month", month("trans_date"))
    # Reduce partitions to save memory.
    # NOTE(review): coalesce adds no shuffle, so this presumably also caps the
    # JDBC read at 5 concurrent tasks - confirm that is intended.
    df = df.coalesce(5)

    row_count = df.count()

    if row_count == 0:
        # Nothing is written for an empty source table.
        print("transactions: Empty table migrated successfully")
    else:
        writer = df.write.format("delta").mode("overwrite").partitionBy("year", "month")
        writer.save("s3a://rootdb/transactions/")
        print(f"transactions: {row_count} rows migrated successfully")

except Exception as exc:
    print(f"transactions: Migration failed - {exc}")

In [None]:
# Migrate fraud_labels table (large table with JDBC partitioning): JDBC read
# from MySQL, Delta overwrite in MinIO. Any failure is caught and reported
# with print instead of being re-raised.
try:
    # Use JDBC partitioning for large table (numPartitions=40 over transaction_id)
    jdbc_split = {
        "column": "transaction_id",
        "lowerBound": 1,
        "upperBound": 14000000,
        "numPartitions": 40,
    }
    df = spark.read.jdbc(
        mysql_url,
        "fraud_labels",
        properties=mysql_properties,
        **jdbc_split,
    )

    # Reduce partitions to save memory.
    # NOTE(review): coalesce adds no shuffle, so this presumably also caps the
    # JDBC read at 5 concurrent tasks - confirm that is intended.
    df = df.coalesce(5)
    row_count = df.count()

    if row_count == 0:
        # Nothing is written for an empty source table.
        print("fraud_labels: Empty table migrated successfully")
    else:
        writer = df.write.format("delta").mode("overwrite")
        writer.save("s3a://rootdb/fraud_labels/")
        print(f"fraud_labels: {row_count} rows migrated successfully")

except Exception as exc:
    print(f"fraud_labels: Migration failed - {exc}")