In [5]:
%session_id_prefix native-hudi-dataframe-
%glue_version 3.0
%idle_timeout 60
%%configure 
{
  "--conf": "spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false",
  "--datalake-formats": "hudi"
}

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 0.37.3 
Setting session ID prefix to native-hudi-dataframe-
Setting Glue version to: 3.0
Current idle_timeout is 2800 minutes.
idle_timeout has been set to 60 minutes.
The following configurations have been updated: {'--conf': 'spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false', '--datalake-formats': 'hudi'}


In [1]:
# S3 bucket used throughout the notebook (clean-up and table paths are built from it).
bucket_name = "mojap-raw-hist-sandbox-608"
# Key prefix inside the bucket; the same path is read as the source parquet dataset in a later cell.
bucket_prefix = "hmpps/oasys/EOR/BACKUP_SYNTHETIC/OASYS_QUESTION_SANDBOXY"
# Glue Data Catalog database / table names used by the clean-up cell and the Hudi hive-sync options.
database_name = "hudi_df"
table_name = "oasys_synth_q"
# Prefix whose objects ("<table_prefix>/...") are deleted by the clean-up cell.
table_prefix = f"{bucket_prefix}/{database_name}/{table_name}"
# NOTE(review): table_location is reassigned to a different path (.../HUDIEXAMPLE) before the
# Hudi writes, so the value computed here is not the location the table is written to.
table_location = f"s3://{bucket_name}/{table_prefix}"

Authenticating with environment variables and user-defined glue_role_arn: arn:aws:iam::684969100054:role/AdminAccessGlueNotebook
Trying to create a Glue session for the kernel.
Worker Type: G.1X
Number of Workers: 5
Session ID: 3b9682a6-b03a-4af2-ab71-e47d0ccb57e2
Job Type: glueetl
Applying the following default arguments:
--glue_kernel_version 0.37.3
--enable-glue-datacatalog true
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer --conf spark.sql.hive.convertMetastoreParquet=false
--datalake-formats hudi
Waiting for session 3b9682a6-b03a-4af2-ab71-e47d0ccb57e2 to get into ready status...
Session 3b9682a6-b03a-4af2-ab71-e47d0ccb57e2 has been created.



## Clean up existing resources

In [2]:
import boto3

# Build the Glue client once, outside any `try`: the `except` clauses below look up
# exception classes on `glue`, so the name must already be bound when they are evaluated
# (otherwise a failure creating the client would surface as a confusing NameError).
glue = boto3.client('glue')

## Create a database with the name hudi_df to host hudi tables if not exists.
try:
    glue.create_database(DatabaseInput={'Name': database_name})
    print(f"New database {database_name} created")
except glue.exceptions.AlreadyExistsException:
    print(f"Database {database_name} already exist")

## Delete files in S3
# NOTE(review): only objects under "<table_prefix>/" are removed. The Hudi table is written
# to a table_location that is reassigned later in the notebook (.../HUDIEXAMPLE), which is
# not under this prefix - confirm this is the prefix that should be cleared.
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket.objects.filter(Prefix=f"{table_prefix}/").delete()

## Drop table in Glue Data Catalog
try:
    glue.delete_table(DatabaseName=database_name, Name=table_name)
except glue.exceptions.EntityNotFoundException:
    print(f"Table {database_name}.{table_name} does not exist")


Database hudi_df already exist
{'ResponseMetadata': {'RequestId': '3e6ca492-8bb3-45bc-9007-78982fb4e178', 'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Thu, 18 May 2023 07:34:43 GMT', 'content-type': 'application/x-amz-json-1.1', 'content-length': '2', 'connection': 'keep-alive', 'x-amzn-requestid': '3e6ca492-8bb3-45bc-9007-78982fb4e178'}, 'RetryAttempts': 0}}


In [3]:
import json




## Create Hudi table with sample data using catalog sync

In [4]:
from pyspark.sql import Row
import time
from pyspark.sql.functions import col, to_timestamp,lit
# Wall-clock start time; not read by any other cell visible in this notebook.
ut = time.time()

# Build the source location from the configuration cell instead of repeating the literal
# path, so the bucket/prefix only need to be changed in one place.
source_path = f"s3://{bucket_name}/{bucket_prefix}/"

df_products = (
    spark.read.parquet(source_path)
    # Parse the string column extraction_timestamp (format yyyy-MM-dd) into a timestamp column.
    .withColumn("date_timestamp", to_timestamp(col("extraction_timestamp"), 'yyyy-MM-dd'))
    # Set "op" to a null string for every row.
    .withColumn("op", lit(None).cast("string"))
)

# Small sample used for the initial load; the fixed seed makes a
# Restart & Run All select the same rows each time.
df_initial_full_load = df_products.sample(fraction=0.0001, seed=42)





## Create a dataframe with a sample of the initial dataframe that has some additional columns!

In [5]:
from pyspark.sql.functions import rand

def add_mixed_columns(df, num_cols, ratio):
    """Return ``df`` with ``num_cols`` extra columns appended.

    ``int(num_cols * ratio)`` of the new columns are string columns named
    ``col_1``, ``col_2``, ... holding the literal ``"stringdata"``; the
    remaining ones are integer columns named ``intcol_1``, ``intcol_2``, ...
    computed as ``rand() * 100`` cast to integer.

    Args:
        df: frame to extend (anything exposing ``withColumn``).
        num_cols: total number of columns to add.
        ratio: fraction of ``num_cols`` that should be string columns.

    Returns:
        The extended frame; string columns are added first, then integer ones.
    """
    string_count = int(num_cols * ratio)
    int_count = num_cols - string_count

    # Describe every new column as a (name, expression) pair first ...
    string_specs = [
        (f'col_{position}', lit("stringdata"))
        for position in range(1, string_count + 1)
    ]
    # rand() yields a value between 0 and 1; scaling by 100 and casting gives a random integer.
    int_specs = [
        (f'intcol_{position}', (rand() * 100).cast("integer"))
        for position in range(1, int_count + 1)
    ]

    # ... then apply them in order: all string columns, followed by all integer columns.
    widened = df
    for column_name, column_expr in string_specs + int_specs:
        widened = widened.withColumn(column_name, column_expr)
    return widened





In [6]:

# Another sample of the source rows; the fixed seed makes re-running this cell
# select the same rows instead of a fresh random subset each time.
df_anothersample = df_products.sample(fraction=0.0001, seed=43)
# Overwrite date_timestamp with a fixed later date for every row in this sample.
df_anothersample = df_anothersample.withColumn("date_timestamp", to_timestamp(lit('2023-05-18')) )

# Widen the sample with 10 extra columns, half of them string and half integer.
df_updated_schema = add_mixed_columns(df_anothersample,10,0.5)

df_updated_schema.printSchema()


root
 |-- op: string (nullable = true)
 |-- extraction_timestamp: string (nullable = true)
 |-- scn: string (nullable = true)
 |-- oasys_question_pk: decimal(38,10) (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)
 |-- col_1: string (nullable = false)
 |-- col_2: string (nullable = false)
 |-- col_3: string (nullable = false)
 |-- col_4: string (nullable = false)
 |-- col_5: string (nullable = false)
 |-- intcol_1: integer (nullable = true)
 |-- intcol_2: integer (nullable = true)
 |-- intcol_3: integer (nullable = true)
 |-- intcol_4: integer (nullable = true)
 |-- intcol_5: integer (nullable = true)


In [7]:
from pyspark.sql.types import StructType, StructField, StringType

old_schema = df_updated_schema.schema

# Rebuild each string-typed field as nullable; every other field is reused as-is.
new_fields = [
    StructField(field.name, field.dataType, nullable=True)
    if isinstance(field.dataType, StringType)
    else field
    for field in old_schema.fields
]
new_schema = StructType(fields=new_fields)

# Re-create the frame from its RDD so that the relaxed schema is the one attached to it.
df_updated_schema = spark.createDataFrame(df_updated_schema.rdd, schema=new_schema)
df_updated_schema.printSchema()

root
 |-- op: string (nullable = true)
 |-- extraction_timestamp: string (nullable = true)
 |-- scn: string (nullable = true)
 |-- oasys_question_pk: decimal(38,10) (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)
 |-- col_1: string (nullable = true)
 |-- col_2: string (nullable = true)
 |-- col_3: string (nullable = true)
 |-- col_4: string (nullable = true)
 |-- col_5: string (nullable = true)
 |-- intcol_1: integer (nullable = true)
 |-- intcol_2: integer (nullable = true)
 |-- intcol_3: integer (nullable = true)
 |-- intcol_4: integer (nullable = true)
 |-- intcol_5: integer (nullable = true)


In [8]:
# Print the schema of the initial-load sample, for comparison with df_updated_schema.
df_initial_full_load.printSchema()

root
 |-- op: string (nullable = true)
 |-- extraction_timestamp: string (nullable = true)
 |-- scn: string (nullable = true)
 |-- oasys_question_pk: decimal(38,10) (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)


In [9]:

# Show the first row of each frame in vertical layout (one field per line).
df_initial_full_load.show(1,vertical=True)
df_updated_schema.show(1,vertical=True)

-RECORD 0-----------------------------------
 op                   | null                
 extraction_timestamp | 2023-04-01          
 scn                  | SYNTHETICS          
 oasys_question_pk    | 8975927.2860468380  
 date_timestamp       | 2023-04-01 00:00:00 
only showing top 1 row

-RECORD 0-----------------------------------
 op                   | null                
 extraction_timestamp | 2023-04-01          
 scn                  | SYNTHETICS          
 oasys_question_pk    | 8976182.0156663790  
 date_timestamp       | 2023-05-18 00:00:00 
 col_1                | stringdata          
 col_2                | stringdata          
 col_3                | stringdata          
 col_4                | stringdata          
 col_5                | stringdata          
 intcol_1             | 19                  
 intcol_2             | 55                  
 intcol_3             | 65                  
 intcol_4             | 61                  
 intcol_5             | 62     

In [10]:
# Number of rows in the initial-load sample.
df_initial_full_load.count()


1585


In [11]:
# Number of rows in the widened sample.
df_updated_schema.count()

1679


In [12]:
# Location the Hudi table is written to and read from. This replaces the table_location
# computed in the configuration cell; the bucket is taken from bucket_name rather than
# repeated as a literal, which also gives the f-string something to interpolate.
table_location = f"s3://{bucket_name}/hmpps/oasys/EOR/BACKUP_SYNTHETIC/HUDIEXAMPLE"
table_location

's3://mojap-raw-hist-sandbox-608/hmpps/oasys/EOR/BACKUP_SYNTHETIC/HUDIEXAMPLE'


In [13]:
# Options that describe the table and how records are written to it.
hudi_write_options = {
    'hoodie.table.name': table_name,
    'hoodie.datasource.write.storage.type': 'COPY_ON_WRITE',
    'hoodie.datasource.write.recordkey.field': 'oasys_question_pk',
    'hoodie.datasource.write.partitionpath.field': 'scn',
    'hoodie.datasource.write.table.name': table_name,
    'hoodie.datasource.write.operation': 'upsert',
    'hoodie.datasource.write.precombine.field': 'date_timestamp',
    'hoodie.datasource.write.hive_style_partitioning': 'true',
    'hoodie.upsert.shuffle.parallelism': 2,
    'hoodie.insert.shuffle.parallelism': 2,
    'path': table_location,
}

# Options that control syncing the table into the catalog database/table.
hudi_sync_options = {
    'hoodie.datasource.hive_sync.enable': 'true',
    'hoodie.datasource.hive_sync.database': database_name,
    'hoodie.datasource.hive_sync.table': table_name,
    'hoodie.datasource.hive_sync.partition_fields': 'scn',
    'hoodie.datasource.hive_sync.partition_extractor_class': 'org.apache.hudi.hive.MultiPartKeysValueExtractor',
    'hoodie.datasource.hive_sync.use_jdbc': 'false',
    'hoodie.datasource.hive_sync.mode': 'hms',
}

# Single dict passed to both writes; write options come first, then the sync options.
hudi_options = {**hudi_write_options, **hudi_sync_options}





In [14]:
# Initial load: write the first sample as a Hudi table using save mode "overwrite".
(
    df_initial_full_load.write
    .format("hudi")
    .options(**hudi_options)
    .mode("overwrite")
    .save()
)




## Note: for the initial load, the following two options need to be configured
`.option('hoodie.datasource.write.operation', 'upsert')` in the Hudi options

and

`.mode("overwrite")` on the write statement

## Read from Hudi table

In [15]:
# Load the Hudi table back from table_location and display its first record in full.
hudi_reader = spark.read.format("hudi")
df_products_read = hudi_reader.load(table_location)
df_products_read.show(n=1, truncate=False, vertical=True)


-RECORD 0-------------------------------------------------------------------------------------------
 _hoodie_commit_time    | 20230518073708690                                                         
 _hoodie_commit_seqno   | 20230518073708690_0_1                                                     
 _hoodie_record_key     | 8975939.9683490770                                                        
 _hoodie_partition_path | scn=SYNTHETICS                                                            
 _hoodie_file_name      | 59255b78-cbab-4ebe-a05e-0a8cc817784e-0_0-35-177_20230518073708690.parquet 
 op                     | null                                                                      
 extraction_timestamp   | 2023-04-01                                                                
 oasys_question_pk      | 8975939.9683490770                                                        
 date_timestamp         | 2023-04-01 00:00:00                                              

In [16]:
# Number of rows in the table as read back from table_location.
df_products_read.count()

1585


## Upsert records into Hudi table with evolved Schema

In [17]:

# The hudi_options dict defined for the initial load is reused unchanged for the next write;
# print it as indented JSON for reference.
print(json.dumps(hudi_options, indent=4))

{
    "hoodie.table.name": "oasys_synth_q",
    "hoodie.datasource.write.storage.type": "COPY_ON_WRITE",
    "hoodie.datasource.write.recordkey.field": "oasys_question_pk",
    "hoodie.datasource.write.partitionpath.field": "scn",
    "hoodie.datasource.write.table.name": "oasys_synth_q",
    "hoodie.datasource.write.operation": "upsert",
    "hoodie.datasource.write.precombine.field": "date_timestamp",
    "hoodie.datasource.write.hive_style_partitioning": "true",
    "hoodie.upsert.shuffle.parallelism": 2,
    "hoodie.insert.shuffle.parallelism": 2,
    "path": "s3://mojap-raw-hist-sandbox-608/hmpps/oasys/EOR/BACKUP_SYNTHETIC/HUDIEXAMPLE",
    "hoodie.datasource.hive_sync.enable": "true",
    "hoodie.datasource.hive_sync.database": "hudi_df",
    "hoodie.datasource.hive_sync.table": "oasys_synth_q",
    "hoodie.datasource.hive_sync.partition_fields": "scn",
    "hoodie.datasource.hive_sync.partition_extractor_class": "org.apache.hudi.hive.MultiPartKeysValueExtractor",
    "hoodie.dat

In [19]:
# Write the widened sample to the same table using save mode "append".
(
    df_updated_schema.write
    .format("hudi")
    .options(**hudi_options)
    .mode("append")
    .save()
)




In [20]:
# Load the table from table_location again, into a new frame.
df_updated_schema_read = spark.read.format("hudi").load(table_location)





In [21]:
# Number of rows in the table as re-read from table_location.
df_updated_schema_read.count()



3264


In [22]:
# Print the schema of the re-read table.
df_updated_schema_read.printSchema()

root
 |-- _hoodie_commit_time: string (nullable = true)
 |-- _hoodie_commit_seqno: string (nullable = true)
 |-- _hoodie_record_key: string (nullable = true)
 |-- _hoodie_partition_path: string (nullable = true)
 |-- _hoodie_file_name: string (nullable = true)
 |-- op: string (nullable = true)
 |-- extraction_timestamp: string (nullable = true)
 |-- oasys_question_pk: decimal(38,10) (nullable = true)
 |-- date_timestamp: timestamp (nullable = true)
 |-- col_1: string (nullable = true)
 |-- col_2: string (nullable = true)
 |-- col_3: string (nullable = true)
 |-- col_4: string (nullable = true)
 |-- col_5: string (nullable = true)
 |-- intcol_1: integer (nullable = true)
 |-- intcol_2: integer (nullable = true)
 |-- intcol_3: integer (nullable = true)
 |-- intcol_4: integer (nullable = true)
 |-- intcol_5: integer (nullable = true)
 |-- scn: string (nullable = true)


In [23]:
# Show the first record of the re-read table vertically, without truncating values.
df_updated_schema_read.show(1,vertical=True,truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------
 _hoodie_commit_time    | 20230518073708690                                                         
 _hoodie_commit_seqno   | 20230518073708690_0_1                                                     
 _hoodie_record_key     | 8975939.9683490770                                                        
 _hoodie_partition_path | scn=SYNTHETICS                                                            
 _hoodie_file_name      | 59255b78-cbab-4ebe-a05e-0a8cc817784e-0_0-35-177_20230518073708690.parquet 
 op                     | null                                                                      
 extraction_timestamp   | 2023-04-01                                                                
 oasys_question_pk      | 8975939.9683490770                                                        
 date_timestamp         | 2023-04-01 00:00:00                                              

## Stop Session

In [7]:
%stop_session

Stopping session: 3b9682a6-b03a-4af2-ab71-e47d0ccb57e2
Stopped session.
