In [48]:
import os
import sys
import importlib
from datetime import date, timedelta, datetime as dt

try:
    from snowflake.snowpark import Session
    session = Session.get_active_session()
except Exception:  
    sys.path.append(os.path.abspath('/src'))
    import src.SnowflakeConnector
    importlib.reload(src.SnowflakeConnector)
    from src.SnowflakeConnector import create_active_session
    session = create_active_session()

In [64]:
#Dependencies
import yaml
from pathlib import Path

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss, recall_score, confusion_matrix, precision_score


from snowflake.snowpark.functions import col
import snowflake.snowpark.functions as f
from snowflake.snowpark.functions import col
import snowflake.snowpark.functions as f
#from snowflake.ml.registry import Registry

import src.model
import src.snapshot_split
import importlib
importlib.reload(src.model)
importlib.reload(src.snapshot_split)

from src.model import train_churn_model, predict_churn
from src.snapshot_split import split_by_snapshot_dmatrix

In [52]:
#dataloader

df = session.table("analytics.analytics_inference.bimonthly_ml_features")
df.show()

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"SNAPSHOT_WEEK"  |"SUBSCRIPTIONID"                      |"EMAIL"                 |"CHURN_LABEL_14_DAY"  |"LIFETIME_DAY"  |"MALES_PER_100_FEMALES"  |"HH_MEAN_INCOME"  |"HH_MEDIAN_INCOME"  |"AGE_MEDIAN"  |"AVG_HOUSEHOLD_SIZE"  |"MARRIED_HH"  |"SINGLE_MALE_HH"  |"TOTAL_NET_REVENUE"  |"ORDER_COUNT"  |"DAYS_SINCE_LAST_ORDER"  |"PAID_SOCIAL"  |"PAID_SEARCH"  |"REFERRAL"  |"AFFILIATE"  |"ORGANIC_SOCIAL"  |"ORGANIC_SEARCH"  |"MAIN_SITE"  |
------------------------------------------------------------------------------------------------------------------------------

In [53]:
df.group_by('snapshot_week').count().show()

-----------------------------
|"SNAPSHOT_WEEK"  |"COUNT"  |
-----------------------------
|2025-05-12       |13582    |
|2025-04-21       |13998    |
|2025-05-26       |13453    |
|2025-04-07       |14205    |
|2025-05-19       |13473    |
|2025-04-14       |14139    |
|2025-04-28       |13681    |
|2025-06-02       |13377    |
|2025-05-05       |13682    |
-----------------------------



In [27]:
# Assume you're in notebooks/
config_path = "configs/bimonthly.yaml"

with open(config_path, "r") as f:
    config = yaml.safe_load(f)

features = config['features']
label = config['label']

In [60]:
#TRAIN TEST SPLIT
split_date_train = dt.strptime('2025-05-05', '%Y-%m-%d').date()
split_date_val = dt.strptime('2025-05-19', '%Y-%m-%d').date()

df_features = df.to_pandas()

dtrain, dval, dtest, df_test = split_by_snapshot_dmatrix(df_features, split_date_train, split_date_val, train=True)

In [61]:
#TRAIN MODEL
test_model = train_churn_model(dtrain, dval)
eval_df = predict_churn(test_model, dtest)


eval_df["ACTUAL"] = df_test[label].values  # Add actual labels
eval_df.head()

Unnamed: 0,PREDICTED_PROBABILITY,PREDICTED_CLASS,ACTUAL
0,0.000304,0,0
1,3e-06,0,0
2,3e-06,0,0
3,3e-06,0,0
4,3e-06,0,0


In [62]:
#EVALUATE MODEL

cm = confusion_matrix(eval_df["ACTUAL"], eval_df["PREDICTED_CLASS"])

# Create a labeled DataFrame for the confusion matrix
cm_df = pd.DataFrame(cm, 
                     index=["Actual Non-Churn (0)", "Actual Churn (1)"], 
                     columns=["Predicted Non-Churn (0)", "Predicted Churn (1)"])

cm_df




Unnamed: 0,Predicted Non-Churn (0),Predicted Churn (1)
Actual Non-Churn (0),12016,750
Actual Churn (1),194,513


In [65]:
#PRECISION & RECALL


precision = precision_score(eval_df["ACTUAL"], eval_df["PREDICTED_CLASS"])
recall = recall_score(eval_df["ACTUAL"], eval_df["PREDICTED_CLASS"])

pos_neg_ratio = eval_df["PREDICTED_CLASS"].sum() / (len(eval_df["PREDICTED_CLASS"]))

print(f"Precision : {precision}")
print(f"Recall : {recall}")
print(f"Positive/All : {pos_neg_ratio}")

Precision : 0.40617577197149646
Recall : 0.7256011315417256
Positive/All : 0.09374304163883322


In [None]:
#PLOTLY HISTOGRAM
fig1 = px.histogram(eval_df, x='PREDICTED_PROBABILITY', nbins=10, text_auto=True)
fig = px.histogram(eval_df[eval_df["PREDICTED_CLASS"] == 1], x='PREDICTED_PROBABILITY', nbins=50, text_auto=True)

st.title('Distribution of Chuned Predictions')
st.plotly_chart(fig, use_container_width=True)
st.title('Distribution of All Predictions')
st.plotly_chart(fig1, use_container_width=True)

In [67]:
#TRAIN TEST SPLIT FOR INFERENCE

split_date_train = dt.strptime('2025-05-19', '%Y-%m-%d').date()
split_date_val = dt.strptime('2025-06-02', '%Y-%m-%d').date()
dtrain, dval, dtest, df_test = split_by_snapshot_dmatrix(df_features, split_date_train, split_date_val)

In [68]:
#INFERENCE MODEL TRAINING

infer_model = train_churn_model(dtrain, dval)
prob_df = predict_churn(test_model, dtest)

pred_df = df_test.copy()
pred_df["PREDICTED_PROBABILITY"] = prob_df["PREDICTED_PROBABILITY"].values
pred_df["PREDICTED_CLASS"] = prob_df["PREDICTED_CLASS"].values

pred_df.tail()

Unnamed: 0,SNAPSHOT_WEEK,SUBSCRIPTIONID,EMAIL,CHURN_LABEL_14_DAY,LIFETIME_DAY,MALES_PER_100_FEMALES,HH_MEAN_INCOME,HH_MEDIAN_INCOME,AGE_MEDIAN,AVG_HOUSEHOLD_SIZE,...,DAYS_SINCE_LAST_ORDER,PAID_SOCIAL,PAID_SEARCH,REFERRAL,AFFILIATE,ORGANIC_SOCIAL,ORGANIC_SEARCH,MAIN_SITE,PREDICTED_PROBABILITY,PREDICTED_CLASS
123539,2025-06-02,a1142131-6c4a-43fe-a4e3-c21f8bfab580,priehmann@yahoo.com,0,330,96.3,80177,66109,33.9,2.57,...,29,0,0,0,0,0,0,0,0.000447,0
123548,2025-06-02,30e28af1-fdfa-43e0-8fc0-72b62f2c0c00,princeamanda3@yahoo.com,0,58,-1.0,-1,-1,-1.0,-1.0,...,58,0,1,0,0,0,0,0,0.981496,1
123557,2025-06-02,f4f47758-eedd-43fb-8fe7-ae5058f688c1,princessd1214@gmail.com,0,106,107.1,120376,102984,41.8,2.63,...,45,0,0,0,0,0,0,0,0.000395,0
123574,2025-06-02,711a7dfd-a137-4b2d-8885-dbde10dffda6,princesswarren@gmail.com,0,321,108.7,177542,135081,41.8,2.54,...,5,0,0,0,0,0,0,0,0.00018,0
123589,2025-06-02,6679a1a9-f7a0-4532-a4cc-6c9504ec2c7b,priscila.gb@gmail.com,0,181,-1.0,-1,-1,-1.0,-1.0,...,30,0,0,0,0,0,0,0,0.358969,0


In [None]:
#SNOWFLAKE EXPORT CREATE CAMPAIGN TABLE

campaign_df = pred_df.copy()
campaign_df = campaign_df[campaign_df["PREDICTED_PROBABILITY"] > 0.7]
inference_date =dt.strptime('2025-06-09', '%Y-%m-%d').date()
campaign_df["AB_GROUP"] = np.random.choice(["A - Send Email", "B - Don't Send Email"], size=len(campaign_df))

table_name = "PREDICTIVE.CHURN_PREDICTIONS.BIMONTHLY_"+inference_date.strftime('%b_%d').upper() 
print(table_name)

campaign_snowpark_df = session.createDataFrame(data=campaign_df)
campaign_snowpark_df.write.save_as_table(table_name, mode="overwrite")

In [69]:
#PREDICTIONS EVALUATION
positives = pred_df["PREDICTED_CLASS"].sum()
all = len(pred_df["PREDICTED_CLASS"])

pos_neg_ratio = positives / all

# print(f"Precision : {precision}")
print(f"Positives : {positives}")
print(f"Positive/All : {round(pos_neg_ratio,3) * 100}%")
print(f"Normal Positive/All : {round(700/all,2) * 100}%")

Positives : 807
Positive/All : 6.0%
Normal Positive/All : 5.0%


In [None]:
#INFERENCE PREDICTIONS HISTOGRAM
campaign_df['CHURN_PROBABILITY_PERC'] = campaign_df['PREDICTED_PROBABILITY']#*100
#pred_df['CHURN_PROBABILITY_PERC'] = pred_df['PREDICTED_PROBABILITY']*100

fig = px.histogram(campaign_df, x='CHURN_PROBABILITY_PERC', nbins=10, text_auto=True)
st.plotly_chart(fig, use_container_width=True)


fig1 = px.histogram(pred_df, x='PREDICTED_PROBABILITY', nbins=10, text_auto=True)
st.plotly_chart(fig1, use_container_width=True)

In [None]:
#INFERENCE PREDICTIONS BINNED HISTOGRAM
campaign_df['PREDICTED_PROBABILITY'] = pd.cut(
    campaign_df['PREDICTED_PROBABILITY'] * 100,
    bins=[0,10,20,30,40,50,60,70,80,90,100],
    right=False,
    labels=[f"{i}-{i+10}" for i in range(0, 100, 10)]
)

fig = px.histogram(campaign_df, x='PREDICTED_PROBABILITY', text_auto=True, category_orders={"PREDICTED_PROBABILITY": [f"{i}-{i+10}" for i in range(0, 100, 10)]})
st.plotly_chart(fig, use_container_width=True)