In [1]:
# Notebook setup: standard-library / third-party imports, project path wiring,
# and creation of the Snowflake session used by every later cell.
import sys
import os
from datetime import date, timedelta
import pandas as pd

# Make the project-local ../src directory importable. The path is resolved
# against the current working directory, and the membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
_src_dir = os.path.abspath('../src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from snowflake.snowpark.functions import date_trunc, current_date
from snowflake.snowpark.functions import col, lit, when
from snowflake.snowpark.types import DateType
import snowflake.snowpark.functions as f

# Get Snowflake session.
# This import must stay below the sys.path update above: SnowflakeConnector is
# expected to live in ../src (assumption based on the path wiring — confirm).
from SnowflakeConnector import get_snowflake_session
session = get_snowflake_session()


In [2]:
# Sanity check: print the session so the active connection context
# (account, role, database, schema, warehouse) is visible before querying.
print(session) 

<snowflake.snowpark.session.Session: account="zfb96811", role="ACCOUNTADMIN", database="ANALYTICS", schema="PUBLIC", warehouse="COMPUTE_WH">


In [4]:
# Reference the three source feature tables that are combined later in the notebook.
subscription_features = session.table('analytics.analytics_inference.bimonthly_churn_features')
demographic_features = session.table('analytics.analytics_inference.demographic_feature')
shop_order_features = session.table('analytics.analytics_inference.shop_orders_churn_features')

# Preview five rows of each table under its own heading.
for _heading, _table in (
    ("Subscription Features:", subscription_features),
    ("\nShop Order Features:", shop_order_features),
    ("\nDemographic Features:", demographic_features),
):
    print(_heading)
    _table.show(5)

Subscription Features:
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SNAPSHOT_WEEK"  |"SUBSCRIPTIONID"                      |"EMAIL"                  |"LIFETIME"  |"LIFETIME_DAY"  |"CHURN_LABEL_14_DAY"  |"MALES_PER_100_FEMALES"  |"HH_MEAN_INCOME"  |"HH_MEDIAN_INCOME"  |"AGE_MEDIAN"  |"AVG_HOUSEHOLD_SIZE"  |"MARRIED_HH"  |"SINGLE_MALE_HH"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|2025-04-28       |eefe0d0a-65d7-406b-ae34-c3b08ff8cf47  |suehofer1@abe.midco.net  |17          |119             |0                     |98.7        

In [14]:
# Build the combined feature set: subscription snapshots enriched with
# demographic and shop-order attributes via left joins.


def _fill_missing(frame, column_name):
    """Return frame[column_name] with NULLs replaced by -1, aliased to column_name."""
    return f.coalesce(frame[column_name], lit(-1)).alias(column_name)


# Columns taken from the demographic table.
_demographic_names = [
    'MALES_PER_100_FEMALES',
    'HH_MEAN_INCOME',
    'HH_MEDIAN_INCOME',
    'AGE_MEDIAN',
    'AVG_HOUSEHOLD_SIZE',
    'MARRIED_HH',
    'SINGLE_MALE_HH',
]

# Columns taken from the shop-order table: order metrics first, then channels.
_shop_order_names = [
    'total_net_revenue',
    'order_count',
    'days_since_last_order',
    'paid_social',
    'paid_search',
    'referral',
    'affiliate',
    'organic_social',
    'organic_search',
    'main_site',
]

# Join predicates. Demographics match on email only; shop orders match on
# email and snapshot week.
_demographic_match = subscription_features['EMAIL'] == demographic_features['EMAIL']
_shop_order_match = (
    (subscription_features['EMAIL'] == shop_order_features['EMAIL']) &
    (subscription_features['SNAPSHOT_WEEK'] == shop_order_features['snapshot_week'])
)

# Key and label columns come from the subscription table, which disambiguates
# the EMAIL / SNAPSHOT_WEEK names shared with the joined tables.
_subscription_columns = [
    subscription_features['SNAPSHOT_WEEK'].alias('SNAPSHOT_WEEK'),
    subscription_features['SUBSCRIPTIONID'],
    subscription_features['EMAIL'].alias('EMAIL'),
    subscription_features['CHURN_LABEL_14_DAY'],
    subscription_features['LIFETIME_DAY'],
]

# Left joins keep every subscription row; unmatched demographic / shop-order
# values become -1 through _fill_missing. No de-duplication is performed here.
final_features = (
    subscription_features
    .join(demographic_features, _demographic_match, 'left')
    .join(shop_order_features, _shop_order_match, 'left')
    .select(
        *_subscription_columns,
        *[_fill_missing(demographic_features, name) for name in _demographic_names],
        *[_fill_missing(shop_order_features, name) for name in _shop_order_names],
    )
    .sort(['EMAIL','SUBSCRIPTIONID','SNAPSHOT_WEEK'])
)

# Report the row count and preview the combined features.
print("Number of rows in final dataset:", final_features.count())
final_features.show()


Number of rows in final dataset: 123590
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SNAPSHOT_WEEK"  |"SUBSCRIPTIONID"                      |"EMAIL"                                             |"CHURN_LABEL_14_DAY"  |"LIFETIME_DAY"  |"MALES_PER_100_FEMALES"  |"HH_MEAN_INCOME"  |"HH_MEDIAN_INCOME"  |"AGE_MEDIAN"  |"AVG_HOUSEHOLD_SIZE"  |"MARRIED_HH"  |"SINGLE_MALE_HH"  |"TOTAL_NET_REVENUE"  |"ORDER_COUNT"  |"DAYS_SINCE_LAST_ORDER"  |"PAID_SOCIAL"  |"PAID_SEARCH"  |"REFERRAL"  |"AFFILIATE"  |"ORGANIC_SOCIAL"  |"ORGANIC_SEARCH"  |"MAIN_SITE"  |
------------------------------

In [15]:
# Persist the combined feature set to Snowflake, replacing any existing table.
_ml_features_table = 'analytics.analytics_inference.bimonthly_ml_features'
_feature_writer = final_features.write.mode('overwrite')
_feature_writer.save_as_table(_ml_features_table)

# Read the table back and preview it to confirm the write.
print("Saved features verification:")
session.table(_ml_features_table).show()


Saved features verification:
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SNAPSHOT_WEEK"  |"SUBSCRIPTIONID"                      |"EMAIL"                 |"CHURN_LABEL_14_DAY"  |"LIFETIME_DAY"  |"MALES_PER_100_FEMALES"  |"HH_MEAN_INCOME"  |"HH_MEDIAN_INCOME"  |"AGE_MEDIAN"  |"AVG_HOUSEHOLD_SIZE"  |"MARRIED_HH"  |"SINGLE_MALE_HH"  |"TOTAL_NET_REVENUE"  |"ORDER_COUNT"  |"DAYS_SINCE_LAST_ORDER"  |"PAID_SOCIAL"  |"PAID_SEARCH"  |"REFERRAL"  |"AFFILIATE"  |"ORGANIC_SOCIAL"  |"ORGANIC_SEARCH"  |"MAIN_SITE"  |
-------------------------------------------------------------------------------------------------

In [16]:
# List the column names of the combined feature DataFrame, in select order.
final_features.columns

['SNAPSHOT_WEEK',
 'SUBSCRIPTIONID',
 'EMAIL',
 'CHURN_LABEL_14_DAY',
 'LIFETIME_DAY',
 'MALES_PER_100_FEMALES',
 'HH_MEAN_INCOME',
 'HH_MEDIAN_INCOME',
 'AGE_MEDIAN',
 'AVG_HOUSEHOLD_SIZE',
 'MARRIED_HH',
 'SINGLE_MALE_HH',
 'TOTAL_NET_REVENUE',
 'ORDER_COUNT',
 'DAYS_SINCE_LAST_ORDER',
 'PAID_SOCIAL',
 'PAID_SEARCH',
 'REFERRAL',
 'AFFILIATE',
 'ORGANIC_SOCIAL',
 'ORGANIC_SEARCH',
 'MAIN_SITE']