In [None]:
import os
from datetime import date

from snowflake.snowpark import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark import types as T
from snowflake.snowpark.functions import col, lit, coalesce
from snowflake.snowpark.window import Window

# Connection settings for the Snowpark session.
# The user name and password are read from the SNOWFLAKE_USER and
# SNOWFLAKE_PASSWORD environment variables so that credentials never have to
# be typed into (or saved with) the notebook. When a variable is not set the
# value falls back to the empty string, which is what was hard-coded before.
connection_parameters = {
    'user': os.environ.get('SNOWFLAKE_USER', ''),
    'password': os.environ.get('SNOWFLAKE_PASSWORD', ''),
    'account': 'yvb85669.us-east-1',
    'warehouse': 'WH_PG200',
    'role': 'SYSADMINDEV',
    'database': 'VESSOPS_D',
    'schema': 'L10_RDV'
}

# Every table read and write below goes through this session object.
session = Session.builder.configs(connection_parameters).create()


In [None]:
# pax_cruise_segment_data

# Read the passenger cruise-segment table, keeping only the columns that the
# later cells reference.
pax_seg = session.table("VESSOPS_D.L10_RDV.VFM_PCH_PAX_CRUISE_SEGMENT_DATA").select(
    "COMPANY_CD",
    "CRUISE_CD",
    "CRUISE_SEGMENT_CD",
    "ACCOUNTING_PERIOD",
    "NET_REVENUE",
    "CRUISE_SEG_PAX_DAY_QTY",
    "CRUISE_SEGMENT_PAX_QTY",
    "CRUISE_PAX_QTY",
)

# Round the three quantity columns to 0 decimals and cast them to integer.
pax_seg = (
    pax_seg
    .with_column(
        "CRUISE_SEG_PAX_DAY_QTY",
        F.round(F.col("CRUISE_SEG_PAX_DAY_QTY"), 0).cast(T.IntegerType()),
    )
    .with_column(
        "CRUISE_SEGMENT_PAX_QTY",
        F.round(F.col("CRUISE_SEGMENT_PAX_QTY"), 0).cast(T.IntegerType()),
    )
    .with_column(
        "CRUISE_PAX_QTY",
        F.round(F.col("CRUISE_PAX_QTY"), 0).cast(T.IntegerType()),
    )
)

# pax_seg.select("ACCOUNTING_PERIOD").distinct().show()

# Preview the first 10 rows.
pax_seg.show(10)


In [None]:
# FDR DATA

# Load the FDR table, derive ACCOUNTING_PERIOD, restrict the reporting window
# and rename the cruise key - all in one chained expression.
FDR = (
    session.table("VESSOPS_D.L10_RDV.VFM_OCI_FDR")
    .select(
        "CRUISE_CD",
        "PRTD_YEAR",
        "PRTD_MONTH",
        "PERIOD",
        "PAX_DAY_CAPACITY",
        "REV_PAX_DAYS",
        "REV_OCC_PCT",
    )
    # ACCOUNTING_PERIOD = PRTD_YEAR followed by the two-digit month, cast to int.
    # The month number is obtained by parsing "<PRTD_MONTH> 01 2000" with the
    # 'MMMM DD YYYY' format and re-formatting the resulting date as 'MM'.
    .with_column(
        "ACCOUNTING_PERIOD",
        F.concat(
            F.col("PRTD_YEAR").cast("string"),
            F.lpad(
                F.to_char(
                    F.to_date(
                        F.concat(F.col("PRTD_MONTH"), F.lit(" 01 2000")),
                        'MMMM DD YYYY',
                    ),
                    'MM',
                ),
                2,
                F.lit('0'),
            ),
        ).cast("int"),
    )
    # Keep all of 2023 and 2024, plus January-June of 2025.
    .filter(
        (F.col("PRTD_YEAR") == 2023)
        | (F.col("PRTD_YEAR") == 2024)
        | (
            (F.col("PRTD_YEAR") == 2025)
            & F.col("PRTD_MONTH").isin(
                "January", "February", "March", "April", "May", "June"
            )
        )
    )
    # Rename the column so it lines up with the join key used on pax_seg.
    .with_column_renamed("CRUISE_CD", "CRUISE_SEGMENT_CD")
)



# Inner-join FDR rows to the passenger segment rows that share the same
# cruise segment code and accounting period.
FDR_merge = FDR.join(
    pax_seg,
    on=["CRUISE_SEGMENT_CD", "ACCOUNTING_PERIOD"],
    how="inner",
)

# Total CRUISE_SEG_PAX_DAY_QTY per (cruise segment, accounting period).
FDR_summed_Pax = FDR_merge.group_by(
    ["CRUISE_SEGMENT_CD", "ACCOUNTING_PERIOD"]
).agg(
    F.sum("CRUISE_SEG_PAX_DAY_QTY").alias("SUMMED_CRUISE_SEG_PAX_DAY_QTY")
)

# Attach that group total back onto every merged row.
FDR_merge_clean = FDR_merge.join(
    FDR_summed_Pax,
    on=["CRUISE_SEGMENT_CD", "ACCOUNTING_PERIOD"],
    how="inner",
)


# 1. Extract month from ACCOUNTING_PERIOD (assumes format YYYYMM): characters
#    5-6 of its string form, cast to integer.
FDR_merge_clean = FDR_merge_clean.with_column(
    "Month",
    F.col("ACCOUNTING_PERIOD").cast("string").substr(5, 2).cast(T.IntegerType())
)

# 2. Convert ACCOUNTING_PERIOD to float
FDR_merge_clean = FDR_merge_clean.with_column(
    "ACCOUNTING_PERIOD",
    F.col("ACCOUNTING_PERIOD").cast(T.FloatType())
)

# 3. Calculate Ratio: this row's pax-day quantity as a share of the total for
#    its (CRUISE_SEGMENT_CD, ACCOUNTING_PERIOD) group.
#    The divisor is turned into NULL when the group total is 0, so the ratio
#    becomes NULL instead of the query failing with a division-by-zero error.
FDR_merge_clean = FDR_merge_clean.with_column(
    "Ratio",
    F.col("CRUISE_SEG_PAX_DAY_QTY")
    / F.when(
        F.col("SUMMED_CRUISE_SEG_PAX_DAY_QTY") != 0,
        F.col("SUMMED_CRUISE_SEG_PAX_DAY_QTY"),
    )
)

# 4. Calculate ADJ_BF_Cap_Days and ADJ_BF_Pax_Days (rounded integers): the FDR
#    capacity / revenue pax days scaled by Ratio. A NULL Ratio yields NULL here.
FDR_merge_clean = FDR_merge_clean.with_column(
    "ADJ_BF_Cap_Days",
    F.round(F.col("PAX_DAY_CAPACITY") * F.col("Ratio"), 0).cast(T.IntegerType())
).with_column(
    "ADJ_BF_Pax_Days",
    F.round(F.col("REV_PAX_DAYS") * F.col("Ratio"), 0).cast(T.IntegerType())
)

# 5. Calculate PPD (rounded to 2 decimals): NET_REVENUE per adjusted pax day.
#    ADJ_BF_Pax_Days is a rounded integer and can be 0 even when REV_PAX_DAYS
#    is not, so the divisor is nulled in that case (PPD becomes NULL) rather
#    than letting the division raise.
FDR_merge_clean = FDR_merge_clean.with_column(
    "PPD",
    F.round(
        F.col("NET_REVENUE")
        / F.when(F.col("ADJ_BF_Pax_Days") != 0, F.col("ADJ_BF_Pax_Days")),
        2,
    )
)



# FDR.select("ACCOUNTING_PERIOD").distinct().show()

# Drop rows whose FDR revenue pax days are zero.
FDR_merge_clean = FDR_merge_clean.filter(F.col("REV_PAX_DAYS") != 0)



# Keep the identifying columns and the derived metrics, then add two prefixes
# of the segment code: SHIP_CD (first 3 characters) and CRUISE_GRAB (first 7).
FDR_NEW = (
    FDR_merge_clean
    .select(
        "CRUISE_CD",
        "CRUISE_SEGMENT_CD",
        "PRTD_YEAR",
        "MONTH",
        "ADJ_BF_CAP_DAYS",
        "ADJ_BF_PAX_DAYS",
        "PPD",
    )
    .with_column("SHIP_CD", F.substring(F.col("CRUISE_SEGMENT_CD"), 1, 3))
    .with_column("CRUISE_GRAB", F.substring(F.col("CRUISE_SEGMENT_CD"), 1, 7))
)

# Preview the first 10 rows.
FDR_NEW.limit(10).show()

In [None]:
# GSS DATA

# Read the GSS table, keeping the columns listed below.
GSS = session.table("VESSOPS_D.L10_RDV.VFM_PCH_GSS").select(
    "BRAND",
    "CRUISE_CD",
    "CRUISE_START_DAT",
    "CRUISE_END_DAT",
    "OPERATIONAL_DATE",
    "MEASURE_NAMES",
    "MEASURE_VALUES",
)

# Restrict to OCI rows of the "% Satisfied" measure, then reshape:
#   CRUISE_CD      -> CRUISE_SEGMENT_CD
#   MEASURE_VALUES -> GSS
#   SHIP_CD        =  first 3 characters of CRUISE_CD
GSS_OCI = (
    GSS
    .filter(F.col("BRAND") == "OCI")
    .filter(F.col("MEASURE_NAMES") == "% Satisfied")
    .select(
        F.col("CRUISE_CD").alias("CRUISE_SEGMENT_CD"),
        F.col("BRAND"),
        F.col("MEASURE_VALUES").alias("GSS"),
        F.substring(F.col("CRUISE_CD"), 1, 3).alias("SHIP_CD"),
    )
)

# Preview the first 10 rows.
GSS_OCI.limit(10).show()

In [None]:
# #ITINERARY TABLE
# itinerary = session.table("VESSOPS_D.L10_RDV.VFM_PCH_ITINERARY").select(
#     F.col("COMPANY_CD"),
#     F.col("CRUISE_CD"),
#     F.col("PRODUCT"),
#     F.col("STARTPOINT"),
#     F.col("ENDPOINT"),
#     F.col("ACTIVE_POSITION"),
#     F.col("START_DATE"),
#     F.col("END_DATE"),
#     F.col("RANK_OF_POSITION"),
#     F.col("STARTPORT_CD"),
#     F.col("ENDPORT_CD")
# )


# # itinerary = itinerary.with_column_renamed("CRUISE_CD", "CRUISE_SEGMENT_CD")

# # # Cast StartPort_CD to string
# # itinerary = itinerary.with_column("STARTPORT_CD", F.col("STARTPORT_CD").cast("string"))

# # # Window per cruise, ordered by RANK_OF_POSITION
# # w = Window.partition_by("CRUISE_SEGMENT_CD").order_by("RANK_OF_POSITION")

# # # Lag to get previous end port
# # itinerary = itinerary.with_column("PREV_ENDPORT", F.lag("ENDPORT_CD").over(w))

# # # Determine PORT_PART (avoid consecutive duplicates)
# # itinerary = itinerary.with_column(
# #     "PORT_PART",
# #     F.when(F.col("PREV_ENDPORT").is_null(), F.col("STARTPORT_CD"))
# #      .when(F.col("PREV_ENDPORT") != F.col("ENDPORT_CD"), F.col("ENDPORT_CD"))
# #      .otherwise(F.lit(None))
# # )

# # # Aggregate into PortCD_Activity
# # itinerary_result = (
# #     itinerary.group_by("CRUISE_SEGMENT_CD")
# #              .agg(F.array_to_string(F.array_agg("PORT_PART"), F.lit(" - ")).alias("PORTCD_ACTIVITY"))
# # )

# #2ND CODE

# # from snowflake.snowpark import functions as F
# # from snowflake.snowpark.window import Window

# # # Rename as you did
# # itinerary = itinerary.with_column_renamed("CRUISE_CD", "CRUISE_SEGMENT_CD")

# # # Cast STARTPORT_CD to string
# # itinerary = itinerary.with_column("STARTPORT_CD", F.col("STARTPORT_CD").cast("string"))

# # w = Window.partition_by("CRUISE_SEGMENT_CD").order_by("RANK_OF_POSITION")

# # # Lag previous ENDPORT_CD
# # itinerary = itinerary.with_column("PREV_ENDPORT", F.lag("ENDPORT_CD").over(w))

# # # Add a row_number to identify first row per cruise
# # itinerary = itinerary.with_column("ROW_NUM", F.row_number().over(w))

# # # Create a column that holds ports to add (could be two ports for first row)
# # # We'll explode this array later before aggregation
# # itinerary = itinerary.with_column(
# #     "PORTS_TO_ADD",
# #     F.when(
# #         F.col("ROW_NUM") == 1,
# #         F.array_construct(F.col("STARTPORT_CD"), F.col("ENDPORT_CD"))
# #     ).otherwise(
# #         F.when(F.col("ENDPORT_CD") != F.col("PREV_ENDPORT"), F.array_construct(F.col("ENDPORT_CD")))
# #          .otherwise(F.array_construct())
# #     )
# # )

# # # Explode array so each port is a separate row, then group back

# # # Flatten the ports to single rows
# # itinerary_exploded = itinerary.select(
# #     "CRUISE_SEGMENT_CD",
# #     F.explode("PORTS_TO_ADD").alias("PORT_PART"),
# #     "RANK_OF_POSITION",
# #     "ROW_NUM"
# # )

# # # Now remove duplicates in a way that respects order (keep unique ports only)

# # from snowflake.snowpark.functions import row_number

# # # Define a window to order by RANK_OF_POSITION and PORT_PART for deduplication
# # w2 = Window.partition_by("CRUISE_SEGMENT_CD").order_by("RANK_OF_POSITION", "PORT_PART")

# # # Add row_number for deduplication of consecutive ports
# # itinerary_exploded = itinerary_exploded.with_column(
# #     "PREV_PORT",
# #     F.lag("PORT_PART").over(Window.partition_by("CRUISE_SEGMENT_CD").order_by("RANK_OF_POSITION"))
# # )

# # # Filter out consecutive duplicate ports
# # itinerary_filtered = itinerary_exploded.filter(
# #     (F.col("PREV_PORT").is_null()) | (F.col("PORT_PART") != F.col("PREV_PORT"))
# # )

# # # Aggregate final port sequence per cruise
# # itinerary_result = (
# #     itinerary_filtered.group_by("CRUISE_SEGMENT_CD")
# #     .agg(
# #         F.expr("LISTAGG(PORT_PART, ' - ') WITHIN GROUP (ORDER BY RANK_OF_POSITION)").alias("PORTCD_ACTIVITY")
# #     )
# # )

# #3RDCODE



# # Step 1: Rename cruise code and cast ports as string
# itinerary = itinerary.with_column_renamed("CRUISE_CD", "CRUISE_SEGMENT_CD")
# itinerary = itinerary.with_column("STARTPORT_CD", F.col("STARTPORT_CD").cast("string"))
# itinerary = itinerary.with_column("ENDPORT_CD", F.col("ENDPORT_CD").cast("string"))

# # Step 2: Explode itinerary into port events with ordering

# # Create a "port events" dataframe that includes:
# # - first start port as a row
# # - then all end ports as subsequent rows ordered by RANK_OF_POSITION

# start_ports = itinerary.filter(F.col("RANK_OF_POSITION") == 1) \
#                       .select(
#                           "CRUISE_SEGMENT_CD",
#                           F.col("STARTPORT_CD").alias("PORT_CD"),
#                           F.lit(0).alias("PORT_ORDER")
#                       )

# end_ports = itinerary.select(
#     "CRUISE_SEGMENT_CD",
#     F.col("ENDPORT_CD").alias("PORT_CD"),
#     F.col("RANK_OF_POSITION").alias("PORT_ORDER")
# )

# port_events = start_ports.union_all(end_ports)

# # Step 3: Window to order ports per cruise segment by PORT_ORDER
# w = Window.partition_by("CRUISE_SEGMENT_CD").order_by("PORT_ORDER")

# # Step 4: Use lag to remove consecutive duplicate ports
# port_events = port_events.with_column("PREV_PORT_CD", F.lag("PORT_CD").over(w)) \
#                          .with_column("IS_DUP", F.when(F.col("PORT_CD") == F.col("PREV_PORT_CD"), 1).otherwise(0))

# ports_no_dup = port_events.filter(F.col("IS_DUP") == 0)

# # Step 5: Aggregate ordered ports per cruise segment using LISTAGG
# itinerary_result = ports_no_dup.group_by("CRUISE_SEGMENT_CD").agg(
#     F.expr("LISTAGG(PORT_CD, ' - ') WITHIN GROUP (ORDER BY PORT_ORDER)").alias("PORTCD_ACTIVITY")
# )


# # Join itinerary with the aggregated port sequence
# itinerary_final = itinerary.join(
#     itinerary_result,
#     on="CRUISE_SEGMENT_CD",
#     how="left"
# )

# # Derive year from CRUISE_SEGMENT_CD
# itinerary_final = itinerary_final.with_column(
#     "YEAR",
#     F.substring(F.col("CRUISE_SEGMENT_CD"), 4, 2)
# )

# # Filter for years 22–25
# itinerary_final = itinerary_final.filter(F.col("YEAR").isin(["22","23","24","25"]))






# # Remove duplicates first
# itinerary_dupl = itinerary_final.select(
#     "CRUISE_SEGMENT_CD", "PRODUCT", "PORTCD_ACTIVITY"
# ).distinct()

# # Aggregate into array and then string
# portcd = (
#     itinerary_dupl.group_by("CRUISE_SEGMENT_CD", "PRODUCT")
#                    .agg(
#                        F.array_to_string(
#                            F.array_agg("PORTCD_ACTIVITY"),
#                            F.lit(" - ")
#                        ).alias("PORTCD_ACTIVITY")
#                    )
# )



# portcd.limit(10).show()





In [None]:


# Steps 1-2: load the itinerary table, rename CRUISE_CD to CRUISE_SEGMENT_CD
# and cast both port-code columns to string.
itinerary = (
    session.table("VESSOPS_D.L10_RDV.VFM_PCH_ITINERARY")
    .select(
        "COMPANY_CD",
        "CRUISE_CD",
        "PRODUCT",
        "STARTPOINT",
        "ENDPOINT",
        "ACTIVE_POSITION",
        "START_DATE",
        "END_DATE",
        "RANK_OF_POSITION",   # needed below to order the ports
        "STARTPORT_CD",
        "ENDPORT_CD",
    )
    .with_column_renamed("CRUISE_CD", "CRUISE_SEGMENT_CD")
    .with_column("STARTPORT_CD", F.col("STARTPORT_CD").cast("string"))
    .with_column("ENDPORT_CD", F.col("ENDPORT_CD").cast("string"))
)

# Step 3: the start port of the rank-1 row, placed at PORT_ORDER = 0.
start_with_order = (
    itinerary
    .filter(F.col("RANK_OF_POSITION") == 1)
    .select(
        "CRUISE_SEGMENT_CD",
        F.col("STARTPORT_CD").alias("PORT_CD"),
        F.lit(0).alias("PORT_ORDER"),
    )
)

# Step 4: every end port, ordered by its RANK_OF_POSITION.
end_with_order = itinerary.select(
    "CRUISE_SEGMENT_CD",
    F.col("ENDPORT_CD").alias("PORT_CD"),
    F.col("RANK_OF_POSITION").alias("PORT_ORDER"),
)

# Step 5: stack the start port on top of the end ports.
ports = start_with_order.union_all(end_with_order)

# Step 6: per-segment window ordered by PORT_ORDER, with PORT_CD as tie-breaker.
w = Window.partition_by("CRUISE_SEGMENT_CD").order_by("PORT_ORDER", "PORT_CD")

# Step 7: keep a row only when it has no predecessor in the window or its
# port differs from the predecessor's (drops consecutive repeats).
ports = (
    ports
    .with_column("PREV_PORT", F.lag("PORT_CD").over(w))
    .filter(
        F.col("PREV_PORT").is_null() | (F.col("PORT_CD") != F.col("PREV_PORT"))
    )
)

# Step 8: one " - "-separated port string per cruise segment, in window order.
itinerary_result = ports.group_by("CRUISE_SEGMENT_CD").agg(
    F.expr(
        "LISTAGG(PORT_CD, ' - ') WITHIN GROUP (ORDER BY PORT_ORDER, PORT_CD)"
    ).alias("PORTCD_ACTIVITY")
)



# Attach the port sequence to every itinerary row, derive YEAR from characters
# 4-5 of the segment code and keep only the years "22" through "25".
itinerary_final = (
    itinerary
    .join(itinerary_result, on="CRUISE_SEGMENT_CD", how="left")
    .with_column("YEAR", F.substring(F.col("CRUISE_SEGMENT_CD"), 4, 2))
    .filter(F.col("YEAR").isin("22", "23", "24", "25"))
)

# Collapse to distinct (segment, product, port sequence) combinations.
itinerary_dupl = itinerary_final.select(
    "CRUISE_SEGMENT_CD",
    "PRODUCT",
    "PORTCD_ACTIVITY",
).distinct()

# For each (segment, product) pair, collect the port sequences into an array
# and join the array elements with " - ".
portcd = itinerary_dupl.group_by("CRUISE_SEGMENT_CD", "PRODUCT").agg(
    F.array_to_string(
        F.array_agg("PORTCD_ACTIVITY"),
        F.lit(" - "),
    ).alias("PORTCD_ACTIVITY")
)

# Preview the first 10 rows.
portcd.limit(10).show()


In [None]:
# ALB data

# Read the ALB table: one ALB value together with the brand, ship name and
# ship code columns it is stored with.
nclh_alb = session.table("VESSOPS_D.L10_RDV.VFM_NCLH_ALB").select(
    "BRAND",
    "SHIP_NAME",
    "SHIP_CD",
    "ALB",
)

# Show the first 10 rows.
nclh_alb.limit(10).show()


In [None]:
# Join FDR_NEW with portcd (left join: FDR rows without an itinerary are kept).
merged_FDR = FDR_NEW.join(portcd, on=["CRUISE_SEGMENT_CD"], how="left")

# merged_FDR.show(10)

# Join the result with GSS_OCI on segment code and ship code.
final_FDR_merge = merged_FDR.join(GSS_OCI, on=["CRUISE_SEGMENT_CD", "SHIP_CD"], how="left")

# Overwrite BRAND with the constant "OCI" so that rows with no GSS match
# (where the joined BRAND is NULL) still carry a brand for the next join.
final_FDR_merge = final_FDR_merge.with_column("BRAND", F.lit("OCI"))

# # Preview
# final_FDR_merge.limit(10).show()


# Join using both Ship_Cd and BRAND as keys to bring in the ALB columns.
FDR_final = final_FDR_merge.join(
    nclh_alb,
    on=["Ship_Cd", "BRAND"],
    how="left"
)


# Give the adjusted day counts their output names.
FDR_final = (
    FDR_final
    .with_column_renamed("ADJ_BF_Pax_Days", "PAX_DAYS")
    .with_column_renamed("ADJ_BF_Cap_Days", "DO_CAPS_DAYS")

)

# LOAD_FACTOR = PAX_DAYS / DO_CAPS_DAYS.
# DO_CAPS_DAYS is a rounded integer and can be 0; the divisor is turned into
# NULL in that case so LOAD_FACTOR is NULL instead of the query failing with a
# division-by-zero error.
FDR_final = FDR_final.with_column(
    "LOAD_FACTOR",
    col("PAX_DAYS") / F.when(col("DO_CAPS_DAYS") != 0, col("DO_CAPS_DAYS"))
)
FDR_final.show(10)







In [None]:

# Load the sailing master table
sailing_master = session.table("VESSOPS_D.L10_RDV.VFM_PCH_SAILING_MASTER").select(
    "COMPANY_CD",
    "CRUISE_SEGMENT_CD",
    "SHIP_CD",
    "SAIL_DAT",
    "RETURN_DAT",
    "SAIL_DAY_QTY",
    "EMBARK_PORT_CD",
    "DISEMBARK_PORT_CD",
    "CHARTER_IND",
    "SEGMENT_MARKET_NAME",
    "SEGMENT_SUPER_MARKET_DESC",
    "GUEST_CAPACITY_QTY",
    "CABIN_CAPACITY_QTY",
    "WEEK_NUM",
    "MONTH_NUM",
    "YEAR_NUM",
)

# Final character of CRUISE_SEGMENT_CD; a letter there marks the row as not a
# main voyage.
last_char = F.substring(
    F.col("CRUISE_SEGMENT_CD"),
    F.length(F.col("CRUISE_SEGMENT_CD")),
    1,
)

sailing_master = (
    sailing_master
    # MAIN_VOYAGE_CD is "N" when the last character is a letter, else "Y";
    # only the "Y" rows are kept.
    .with_column(
        "MAIN_VOYAGE_CD",
        F.when(last_char.rlike("[A-Za-z]"), F.lit("N")).otherwise(F.lit("Y")),
    )
    .filter(F.col("MAIN_VOYAGE_CD") == "Y")
    # Last day of the month in which the sailing starts.
    .with_column("END_OF_MONTH", F.last_day(F.col("SAIL_DAT")))
    # A sailing "straddles" when it returns after that month end.
    .with_column(
        "STRADDLE_FLAG",
        F.when(
            F.col("RETURN_DAT") > F.col("END_OF_MONTH"), F.lit("straddle")
        ).otherwise(F.lit("Non-straddle")),
    )
    # Days falling in the sail month: for straddles, sail date through month
    # end inclusive (+1); otherwise the full SAIL_DAY_QTY.
    .with_column(
        "PRTD_DAYS_BEFORE",
        F.when(
            F.col("STRADDLE_FLAG") == "straddle",
            F.datediff("day", F.col("SAIL_DAT"), F.col("END_OF_MONTH")) + F.lit(1),
        ).otherwise(F.col("SAIL_DAY_QTY")),
    )
    # Days after the month end: (return - month end) - 1 for straddles, floored
    # at 0 with greatest(); 0 for non-straddles.
    .with_column(
        "PRTD_DAYS_AFTER_RAW",
        F.when(
            F.col("STRADDLE_FLAG") == "straddle",
            F.greatest(
                F.datediff("day", F.col("END_OF_MONTH"), F.col("RETURN_DAT")) - F.lit(1),
                F.lit(0),
            ),
        ).otherwise(F.lit(0)),
    )
)

# "Before" part: one row per sailing, covering the sail date up to the month
# end (for straddles) or up to the return date (for everything else).
before_rows = (
    sailing_master
    .with_column("CONV_SAIL_DAT", F.col("SAIL_DAT"))
    .with_column(
        "CONV_RETURN_DAT",
        F.when(
            F.col("STRADDLE_FLAG") == "straddle", F.col("END_OF_MONTH")
        ).otherwise(F.col("RETURN_DAT")),
    )
    .with_column("CONV_SAIL_DAY_QTY", F.col("PRTD_DAYS_BEFORE"))
    .with_column("BEFORE_COLUMN", F.col("PRTD_DAYS_BEFORE"))
    .with_column("AFTER_COLUMN", F.lit(0))
    # A sailing with zero days after the month end is reported as Non-straddle.
    .with_column(
        "STRADDLE_FLAG_ADJ",
        F.when(
            F.col("PRTD_DAYS_AFTER_RAW") == 0, F.lit("Non-straddle")
        ).otherwise(F.col("STRADDLE_FLAG")),
    )
)

# "After" part: only for sailings with days after the month end; starts the
# day after END_OF_MONTH and runs to the return date.
after_rows = (
    sailing_master
    .filter(F.col("PRTD_DAYS_AFTER_RAW") > 0)
    .with_column("CONV_SAIL_DAT", F.dateadd("day", F.lit(1), F.col("END_OF_MONTH")))
    .with_column("CONV_RETURN_DAT", F.col("RETURN_DAT"))
    .with_column("CONV_SAIL_DAY_QTY", F.col("PRTD_DAYS_AFTER_RAW"))
    .with_column("BEFORE_COLUMN", F.lit(0))
    .with_column("AFTER_COLUMN", F.col("PRTD_DAYS_AFTER_RAW"))
    .with_column("STRADDLE_FLAG_ADJ", F.lit("straddle"))
)

# Stack both parts, drop non-positive durations, then add BRAND (RSSC when
# COMPANY_CD is 110, otherwise OCI) and the month / year of the converted sail
# date, and finally project the output columns under their output names.
final_sailing_df = (
    before_rows
    .union_all(after_rows)
    .filter(F.col("CONV_SAIL_DAY_QTY") > 0)
    .with_column(
        "BRAND",
        F.when(F.col("COMPANY_CD") == 110, F.lit("RSSC")).otherwise(F.lit("OCI")),
    )
    .with_column("MONTH_NBR", F.month("CONV_SAIL_DAT"))
    .with_column("YEAR_NBR", F.year("CONV_SAIL_DAT"))
    .select(
        "COMPANY_CD",
        F.col("CRUISE_SEGMENT_CD").alias("CRUISE_SEGMENT_CD_1"),
        F.col("SHIP_CD").alias("SHIP_CD_1"),
        "SAIL_DAT",
        "RETURN_DAT",
        "SAIL_DAY_QTY",
        "EMBARK_PORT_CD",
        "DISEMBARK_PORT_CD",
        F.col("CHARTER_IND").alias("CHARTER_CD"),
        F.col("SEGMENT_MARKET_NAME").alias("PRODUCT_DESC"),
        F.col("SEGMENT_SUPER_MARKET_DESC").alias("RM_ROLLUP_PRODUCT_DESC"),
        "MAIN_VOYAGE_CD",
        "END_OF_MONTH",
        F.col("STRADDLE_FLAG_ADJ").alias("STRADDLE_FLAG"),
        F.col("CONV_SAIL_DAT").alias("CONVERTED_SAIL_DATE"),
        F.col("CONV_RETURN_DAT").alias("CONVERTED_RETURN_DATE"),
        F.col("CONV_SAIL_DAY_QTY").alias("CONVERTED_SAIL_DAY_QTY"),
        F.col("BRAND").alias("BRAND_1"),
        "MONTH_NBR",
        "YEAR_NBR",
    )
)

# Preview
final_sailing_df.limit(10).show()


In [None]:
# Alias both inputs so the join condition can reference each side's columns
# unambiguously.
rss = FDR_final.alias("rss")
sm = final_sailing_df.alias("sm")

# Left-join the FDR rows to the split sailing rows on segment code, month and
# year, using DataFrame column indexing instead of col().
FDR_result = rss.join(
    sm,
    (rss["CRUISE_SEGMENT_CD"] == sm["CRUISE_SEGMENT_CD_1"]) &
    (rss["MONTH"] == sm["MONTH_NBR"]) &
    (rss["PRTD_YEAR"] == sm["YEAR_NBR"]),
    how="left"
)




# Drop the sailing-side copies of brand, segment code and ship code; the FDR
# side already carries BRAND, CRUISE_SEGMENT_CD and SHIP_CD.
FDR_result_clean = FDR_result.drop("BRAND_1", "CRUISE_SEGMENT_CD_1", "SHIP_CD_1")

# Show result
# FDR_result_clean.show(10)


# NOTE(review): this reassigns final_sailing_df with the "_1" suffixes removed.
# Re-running this cell without first re-running the sailing-master cell will
# then look up sm["CRUISE_SEGMENT_CD_1"] on a frame that no longer has that
# column. The renamed frame is not referenced again in the visible cells -
# confirm whether this step is still needed.
final_sailing_df = final_sailing_df.rename(
    {
        "BRAND_1": "BRAND",
        "CRUISE_SEGMENT_CD_1": "CRUISE_SEGMENT_CD",
        "SHIP_CD_1": "SHIP_CD"
    }
)

# Show result
# final_sailing_df.show(10)

# Displays the joined frame before the "_1" columns are dropped
# (FDR_result, not FDR_result_clean).
FDR_result.show()


In [None]:
from snowflake.snowpark.functions import when, col

# Derive the ship class from SHIP_CD:
#   INS, REG, SIR, NAU -> "Regatta Class"
#   MNA, RVA           -> "Oceania Class"
#   VIS                -> "Allura Class"
# There is no otherwise() branch, so any other code yields NULL.
FDR_result_clean = FDR_result_clean.with_column(
    "Ship_class",
    when(col("SHIP_CD").isin("INS", "REG", "SIR", "NAU"), "Regatta Class")
    .when(col("SHIP_CD").isin("MNA", "RVA"), "Oceania Class")
    .when(col("SHIP_CD") == "VIS", "Allura Class")
)

# Display the resulting column list.
FDR_result_clean.columns

In [None]:
# Project the output columns in their final order.
Master_3 = FDR_result_clean.select(
    [
        # identifiers and period
        'CRUISE_CD', 'CRUISE_SEGMENT_CD', 'CRUISE_GRAB', 'PRTD_YEAR', 'MONTH',
        # metrics
        'ALB', 'LOAD_FACTOR', 'PAX_DAYS', 'DO_CAPS_DAYS', 'PPD',
        # itinerary and satisfaction
        'PORTCD_ACTIVITY', 'GSS',
        # ship attributes
        'SHIP_CD', 'SHIP_NAME', 'SHIP_CLASS',
        # sailing attributes
        'SAIL_DAT', 'RETURN_DAT', 'SAIL_DAY_QTY',
        'EMBARK_PORT_CD', 'DISEMBARK_PORT_CD', 'CHARTER_CD',
        'PRODUCT_DESC', 'RM_ROLLUP_PRODUCT_DESC', 'MAIN_VOYAGE_CD',
        'END_OF_MONTH', 'STRADDLE_FLAG', 'COMPANY_CD',
        # month-split sailing values
        'CONVERTED_SAIL_DATE', 'CONVERTED_RETURN_DATE', 'CONVERTED_SAIL_DAY_QTY',
        'BRAND', 'MONTH_NBR', 'YEAR_NBR',
    ]
)

# Preview the first 10 rows.
Master_3.show(10)


In [None]:
# Normalise CHARTER_CD to a string flag: true -> "Y", false -> "N".
# Any other value (including NULL) passes through, cast to string explicitly
# so that every CASE branch has the same type and the column type does not
# depend on implicit string/boolean coercion.
# F.when / F.col are used so this cell only needs the notebook's top-level
# imports.
Master_3 = Master_3.with_column(
    "CHARTER_CD",
    F.when(F.col("CHARTER_CD") == True, F.lit("Y"))
    .when(F.col("CHARTER_CD") == False, F.lit("N"))
    .otherwise(F.col("CHARTER_CD").cast(T.StringType()))
)

# NOTE(review): this replaces every GSS value joined from GSS_OCI with the
# constant 0 - confirm that discarding the joined values is intended.
Master_3 = Master_3.with_column("GSS", F.lit(0))


In [None]:
# from snowflake.snowpark import functions as F
# from snowflake.snowpark import types as T

# Master_OCI = Master_OCI.select(
#     F.col("CRUISE_CD").cast(T.StringType()),
#     F.col("CRUISE_SEGMENT_CD").cast(T.StringType()),
#     F.col("CRUISE_GRAB").cast(T.StringType()),
#     F.col("PRTD_YEAR").cast(T.IntegerType()),
#     F.col("MONTH").cast(T.IntegerType()),
#     F.col("ALB").cast(T.FloatType()),
#     F.col("LOAD_FACTOR").cast(T.FloatType()),
#     F.col("PAX_DAYS").cast(T.IntegerType()),
#     F.col("DO_CAPS_DAYS").cast(T.IntegerType()),
#     F.col("PPD").cast(T.FloatType()),
#     F.col("PORTCD_ACTIVITY").cast(T.StringType()),
#     F.col("GSS").cast(T.StringType()),
#     F.col("SHIP_CD").cast(T.StringType()),
#     F.col("SHIP_NAME").cast(T.StringType()),
#     F.col("SHIP_CLASS").cast(T.StringType()),
#     F.col("SAIL_DAT").cast(T.DateType()),
#     F.col("RETURN_DAT").cast(T.DateType()),
#     F.col("SAIL_DAY_QTY").cast(T.IntegerType()),
#     F.col("EMBARK_PORT_CD").cast(T.StringType()),
#     F.col("DISEMBARK_PORT_CD").cast(T.StringType()),
#     F.col("CHARTER_CD").cast(T.StringType()),
#     F.col("PRODUCT_DESC").cast(T.StringType()),
#     F.col("RM_ROLLUP_PRODUCT_DESC").cast(T.StringType()),
#     F.col("MAIN_VOYAGE_CD").cast(T.StringType()),
#     F.col("END_OF_MONTH").cast(T.DateType()),
#     F.col("STRADDLE_FLAG").cast(T.StringType()),
#     F.col("COMPANY_CD").cast(T.StringType()),
#     F.col("CONVERTED_SAIL_DATE").cast(T.DateType()),
#     F.col("CONVERTED_RETURN_DATE").cast(T.DateType()),
#     F.col("CONVERTED_SAIL_DAY_QTY").cast(T.IntegerType()),
#     F.col("BRAND").cast(T.StringType()),
#     F.col("MONTH_NBR").cast(T.IntegerType()),
#     F.col("YEAR_NBR").cast(T.IntegerType())
# )



In [None]:
# Master_3 = Master_3.with_column("MD5_HUB_VOYAGE", F.md5(F.col("VOYAGE_CD")))

# columns_to_hash = [c for c in Master_3.columns if c not in ("VOYAGE_CD", "LDTS", "RCSR", "LAST_MODIFIED_BY", "MD5_VOYAGE_HASH_KEY")]
# Master_3 = Master_3.withColumn("HASH_DIFF", F.md5(F.concat(*[F.coalesce(F.col(c).cast("string"), F.lit("")).cast("string") for c in columns_to_hash])))


# Master_3 = Master_3.withColumn("LDTS", F.current_timestamp())\
#                     .withColumn("RCSR", F.lit('Sailing_Master,FDR'))\
#                     .withColumn("LAST_MODIFIED_BY", F.current_user())



In [None]:
# Persist Master_3 to the L20_BDV target table. The writer is in "overwrite"
# mode, so whatever the table held before this run is replaced.
Master_3.write.mode("overwrite").save_as_table("VESSOPS_D.L20_BDV.SAT_VFM_OCI_MASTER_DATA")

#updated