In [23]:
import os
import sys
import importlib
from datetime import date, timedelta, datetime as dt

try:
    from snowflake.snowpark import Session, get_active_session
    session = get_active_session()
except ImportError:  
    sys.path.append(os.path.abspath('/src'))
    import src.SnowflakeConnector
    importlib.reload(src.SnowflakeConnector)
    from src.SnowflakeConnector import create_active_session
    session = create_active_session()

In [None]:
#Dependencies
import yaml
from pathlib import Path

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss, recall_score, confusion_matrix


from snowflake.snowpark.functions import col
import snowflake.snowpark.functions as f
from snowflake.snowpark.functions import col
import snowflake.snowpark.functions as f
#from snowflake.ml.registry import Registry

import src.model
import src.snapshot_split
import importlib
importlib.reload(src.model)
importlib.reload(src.snapshot_split)

from src.model import train_churn_model, predict_churn
from src.snapshot_split import split_by_snapshot_dmatrix

In [18]:
#dataloader

df = session.table("analytics.analytics_inference.bimonthly_ml_features")
df.select("SNAPSHOT_WEEK").distinct().order_by("SNAPSHOT_WEEK")

<snowflake.snowpark.dataframe.DataFrame at 0x112791370>

In [19]:
df.group_by('snapshot_week').count().show()

-----------------------------
|"SNAPSHOT_WEEK"  |"COUNT"  |
-----------------------------
|2025-05-12       |13582    |
|2025-04-21       |13998    |
|2025-05-26       |13453    |
|2025-04-07       |14205    |
|2025-05-19       |13473    |
|2025-04-14       |14139    |
|2025-04-28       |13681    |
|2025-06-02       |13377    |
|2025-05-05       |13682    |
-----------------------------



In [None]:
# Assume you're in notebooks/
notebook_dir = Path().resolve()
root = notebook_dir.parent
config_path = root / "configs" / "bimonthly.yaml"

with config_path.open("r") as f:
    config = yaml.safe_load(f)

features = config['features']
label = config['label']

In [None]:
#TRAIN TEST SPLIT
split_date_train = dt.strptime('2025-05-05', '%Y-%m-%d').date()
split_date_val = dt.strptime('2025-05-19', '%Y-%m-%d').date()
dtrain, dval, dtest, df_test = split_by_snapshot_dmatrix(df, split_date_train, split_date_val, train=True)

In [None]:
#TRAIN MODEL
test_model = train_churn_model(dtrain, dval)
eval_df = predict_churn(test_model, dtest)


eval_df["ACTUAL"] = df_test[label].values  # Add actual labels
eval_df.head()

In [None]:
#EVALUATE MODEL

cm = confusion_matrix(eval_df["ACTUAL"], eval_df["PREDICTED_CLASS"])

# Create a labeled DataFrame for the confusion matrix
cm_df = pd.DataFrame(cm, 
                     index=["Actual Non-Churn (0)", "Actual Churn (1)"], 
                     columns=["Predicted Non-Churn (0)", "Predicted Churn (1)"])

cm_df




In [None]:
#PRECISION & RECALL


precision = precision_score(eval_df["ACTUAL"], eval_df["PREDICTED_CLASS"])
recall = recall_score(eval_df["ACTUAL"], eval_df["PREDICTED_CLASS"])

pos_neg_ratio = eval_df["PREDICTED_CLASS"].sum() / (len(eval_df["PREDICTED_CLASS"]))

print(f"Precision : {precision}")
print(f"Recall : {recall}")
print(f"Positive/All : {pos_neg_ratio}")

In [None]:
#PLOTLY HISTOGRAM
fig1 = px.histogram(eval_df, x='PREDICTED_PROBABILITY', nbins=10, text_auto=True)
fig = px.histogram(eval_df[eval_df["PREDICTED_CLASS"] == 1], x='PREDICTED_PROBABILITY', nbins=50, text_auto=True)

st.title('Distribution of Chuned Predictions')
st.plotly_chart(fig, use_container_width=True)
st.title('Distribution of All Predictions')
st.plotly_chart(fig1, use_container_width=True)

In [None]:
#TRAIN TEST SPLIT FOR INFERENCE

split_date_train = dt.strptime('2025-05-19', '%Y-%m-%d').date()
split_date_val = dt.strptime('2025-06-02', '%Y-%m-%d').date()
dtrain, dval, dtest, df_test = split_by_snapshot_dmatrix(df, split_date_train, split_date_val)

In [None]:
#INFERENCE MODEL TRAINING

infer_model = train_churn_model(dtrain, dval)
prob_df = predict_churn(test_model, dtest)

pred_df = df_test.copy()
pred_df["PREDICTED_PROBABILITY"] = prob_df["PREDICTED_PROBABILITY"].values
pred_df["PREDICTED_CLASS"] = prob_df["PREDICTED_CLASS"].values

pred_df.tail()

In [None]:
#SNOWFLAKE EXPORT CREATE CAMPAIGN TABLE

campaign_df = pred_df.copy()
campaign_df = campaign_df[campaign_df["PREDICTED_PROBABILITY"] > 0.7]
inference_date =dt.strptime('2025-06-09', '%Y-%m-%d').date()
campaign_df["AB_GROUP"] = np.random.choice(["A - Send Email", "B - Don't Send Email"], size=len(campaign_df))

table_name = "PREDICTIVE.CHURN_PREDICTIONS.BIMONTHLY_"+inference_date.strftime('%b_%d').upper() 
print(table_name)

campaign_snowpark_df = session.createDataFrame(data=campaign_df)
campaign_snowpark_df.write.save_as_table(table_name, mode="overwrite")

In [None]:
#PREDICTIONS EVALUATION
positives = pred_df["PREDICTED_CLASS"].sum()
all = len(pred_df["PREDICTED_CLASS"])

pos_neg_ratio = positives / all

# print(f"Precision : {precision}")
print(f"Positives : {positives}")
print(f"Positive/All : {round(pos_neg_ratio,3) * 100}%")
print(f"Normal Positive/All : {round(700/all,2) * 100}%")

In [None]:
#INFERENCE PREDICTIONS HISTOGRAM
campaign_df['CHURN_PROBABILITY_PERC'] = campaign_df['PREDICTED_PROBABILITY']#*100
#pred_df['CHURN_PROBABILITY_PERC'] = pred_df['PREDICTED_PROBABILITY']*100

fig = px.histogram(campaign_df, x='CHURN_PROBABILITY_PERC', nbins=10, text_auto=True)
st.plotly_chart(fig, use_container_width=True)


fig1 = px.histogram(pred_df, x='PREDICTED_PROBABILITY', nbins=10, text_auto=True)
st.plotly_chart(fig1, use_container_width=True)

In [None]:
#INFERENCE PREDICTIONS BINNED HISTOGRAM
campaign_df['PREDICTED_PROBABILITY'] = pd.cut(
    campaign_df['PREDICTED_PROBABILITY'] * 100,
    bins=[0,10,20,30,40,50,60,70,80,90,100],
    right=False,
    labels=[f"{i}-{i+10}" for i in range(0, 100, 10)]
)

fig = px.histogram(campaign_df, x='PREDICTED_PROBABILITY', text_auto=True, category_orders={"PREDICTED_PROBABILITY": [f"{i}-{i+10}" for i in range(0, 100, 10)]})
st.plotly_chart(fig, use_container_width=True)