In [None]:
# Make the project's src directory importable so the local `data_loader`
# module resolves. '../src' is resolved against the kernel's current working
# directory by os.path.abspath.
import sys
import os

src_dir = os.path.abspath('../src')
# Guard the append so re-running this cell does not stack duplicate
# entries on sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

from data_loader import get_snowflake_session

# Open the Snowflake session used by the cells below
session = get_snowflake_session()


In [None]:
# Import required libraries
import pandas as pd
from datetime import date, timedelta

from snowflake.snowpark.functions import date_trunc, current_date
from snowflake.snowpark.functions import col, lit, when
from snowflake.snowpark.types import DateType
import snowflake.snowpark.functions as f


In [None]:
# Reference the three feature tables that feed the combined dataset
subscription_features = session.table('analytics.analytics_inference.bimonthly_churn_features')
demographic_features = session.table('analytics.analytics_inference.demographic_feature')
shop_order_features = session.table('analytics.analytics_inference.shop_order_features')

# Preview the first 5 rows of each table under its own heading
for heading, feature_table in (
    ("Subscription Features:", subscription_features),
    ("\nDemographic Features:", demographic_features),
    ("\nShop Order Features:", shop_order_features),
):
    print(heading)
    feature_table.show(5)


In [None]:
# Combine all features, starting from the subscription snapshot rows.
# Both joins are LEFT joins, so every subscription row is kept even when it
# has no matching demographic or shop-order record.
# NOTE(review): neither join deduplicates rows. If demographic_features holds
# more than one row per EMAIL (or shop_order_features more than one row per
# EMAIL + snapshot week), subscription rows will fan out — confirm key
# uniqueness upstream.
final_features = (
    subscription_features
    # Join with demographic features on EMAIL
    .join(
        demographic_features,
        subscription_features['EMAIL'] == demographic_features['EMAIL'],
        'left'
    )
    # Join with shop order features on EMAIL + snapshot week.
    # Snowpark's DataFrame.join takes a single Column expression (or
    # column-name strings) as the join condition, not a list of Column
    # conditions, so the two equality checks are combined with `&`.
    .join(
        shop_order_features,
        (subscription_features['EMAIL'] == shop_order_features['EMAIL'])
        & (subscription_features['SNAPSHOT_WEEK'] == shop_order_features['snapshot_week']),
        'left'
    )
    # Select the final columns explicitly; EMAIL and SNAPSHOT_WEEK are taken
    # from the subscription side so the same-named join columns from the
    # other tables are not carried into the result.
    .select(
        # Subscription features
        subscription_features['SNAPSHOT_WEEK'],
        subscription_features['SUBSCRIPTIONID'],
        subscription_features['EMAIL'],
        subscription_features['LIFETIME'],
        subscription_features['LIFETIME_DAY'],
        subscription_features['CHURN_LABEL_14_DAY'],

        # Demographic features (NULL when the left join finds no match)
        demographic_features['MALES_PER_100_FEMALES'],
        demographic_features['HH_MEAN_INCOME'],
        demographic_features['HH_MEDIAN_INCOME'],
        demographic_features['AGE_MEDIAN'],

        # Shop order features: an unmatched left join yields NULL, which is
        # replaced with 0 here
        f.coalesce(shop_order_features['total_net_revenue'], lit(0)).alias('total_net_revenue'),
        f.coalesce(shop_order_features['order_count'], lit(0)).alias('order_count')
    )
)

# Show the combined features
print("Number of rows in final dataset:", final_features.count())
final_features.show()


In [None]:
# Write the combined feature set to Snowflake using overwrite mode
output_table = 'analytics.analytics_inference.bimonthly_ml_features'
feature_writer = final_features.write.mode('overwrite')
feature_writer.save_as_table(output_table)

# Read the table back from Snowflake to check what was written
print("Saved features verification:")
saved_features = session.table(output_table)
saved_features.show()
