In [None]:
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from snowflake.snowpark import Session
from snowflake.ml.registry import Registry

In [None]:
# Prefer a session that is already active; only build a new one when none exists.
session = Session.get_active_session()
if session is None:
    # Connection name comes from SNOWFLAKE_DEFAULT_CONNECTION_NAME, else "default".
    connection_name = os.environ.get("SNOWFLAKE_DEFAULT_CONNECTION_NAME", "default")
    session_options = {"connection_name": connection_name}
    session = Session.SessionBuilder().configs(session_options).create()

# Select the database and schema used by the unqualified table names below.
session.use_database("POC")
session.use_schema("CHURN")

In [None]:
# 1. Load a training sample from Snowflake and drop rows with missing values
# NOTE(review): limit() without an explicit sort does not guarantee the same
# 5000 rows on every run — confirm whether a deterministic sample is needed.
df_raw = session.table("CUSTOMER_CHURN").limit(5000).to_pandas().dropna()
df = df_raw.copy()  # Working copy; df_raw stays untouched for the post-inference join

# 2. 'GENDER' is encoded inside the preprocessing pipeline (step 6) rather than
#    up front. Fitting a LabelEncoder on the full frame before the split learned
#    the category mapping from test rows as well, and kept that mapping outside
#    the pipeline that is registered later, so the registered model could only
#    score input whose GENDER had already been encoded by hand.

# 3. Define features and target
target_col = 'CHURN'
id_col = 'CUSTOMERID'
X = df.drop(columns=[target_col, id_col])
y = df[target_col]

# 4. Train-test split (80/20, stratified on the target, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# 5. Define feature types
categorical_ohe = ['SUBSCRIPTION_TYPE']
categorical_ord = ['CONTRACT_LENGTH']
categorical_gender = ['GENDER']
# Everything that is not listed as categorical above is passed through as-is.
numeric_features = [
    col for col in X.columns
    if col not in categorical_ohe + categorical_ord + categorical_gender
]

# 6. Preprocessing pipeline
# The 'gender' transformer keeps its original position so the transformed
# column order is unchanged. OrdinalEncoder assigns codes in sorted category
# order, the same ordering LabelEncoder uses, and is fit on training rows only.
# Unseen categories map to -1 instead of raising at inference time.
preprocessor = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_ohe),
    ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_ord),
    ('gender', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), categorical_gender),
    ('numeric', 'passthrough', numeric_features)
])

# 7. Model pipeline: preprocessing followed by a class-weighted random forest
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

In [None]:
# 8. Hyperparameter tuning
# Candidate values for the 'classifier' step of the pipeline.
param_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[10, 20, None],
    classifier__min_samples_split=[2, 5],
    classifier__min_samples_leaf=[1, 2],
    classifier__max_features=['sqrt', 'log2'],
)

# Randomized search: 10 sampled candidates, 5-fold CV, scored by ROC AUC.
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    scoring='roc_auc',
    cv=5,
    n_iter=10,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# 9. Fit the search on the training split and keep the best estimator
best_model = search.fit(X_train, y_train).best_estimator_

# 10. Register the tuned pipeline in the Snowflake Model Registry
# NOTE(review): the version name is fixed, so re-running this cell once V_8
# already exists in the registry will likely fail — confirm and bump as needed.
model_name = "RANDOM_FOREST"
model_version = "V_8"
reg = Registry(session=session)

logged_model = reg.log_model(
    model=best_model,
    model_name=model_name,
    version_name=model_version,
    # First 100 training rows serve as the sample input for the logged model.
    sample_input_data=X_train.head(100),
    target_platforms=["WAREHOUSE"],
    options={"enable_explainability": True},
)

# Last expression of the cell: list the models known to the registry.
reg.show_models()

In [None]:
# ----------------------------------------------
# 11. Score X_test through the registered model version
# ----------------------------------------------

model_ver = reg.get_model(model_name).version(model_version)

# Hard class predictions
pred_df = model_ver.run(X_test, function_name="predict")
pred_df = pred_df.rename(columns={"output_feature_0": "CHURN_PREDICTION"})

# Keep only the "output_feature_1" column of predict_proba as the churn probability
prob_df = model_ver.run(X_test, function_name="predict_proba")
prob_df = prob_df[["output_feature_1"]].rename(
    columns={"output_feature_1": "CHURN_PREDICTION_PROB"}
)

# Output of the model's "explain" function for the same rows
explain_df = model_ver.run(X_test, function_name="explain")

# ----------------------------------------------
# 12. Join the untouched raw test rows with the model outputs
# ----------------------------------------------

# Raw rows for the test split, looked up by the index labels X_test kept
raw_test_df = df_raw.loc[X_test.index].reset_index(drop=True)  # Includes CUSTOMERID and raw GENDER

# Every part is re-indexed 0..n-1 so the column-wise concat lines up by position
model_outputs = [part.reset_index(drop=True) for part in (pred_df, prob_df, explain_df)]
final_df = pd.concat([raw_test_df] + model_outputs, axis=1)

# Actual label, taken from the test split
final_df["CHURN"] = y_test.reset_index(drop=True)

# Evaluation on the held-out split
test_auc = roc_auc_score(final_df['CHURN'], final_df['CHURN_PREDICTION_PROB'])
print(f"\n✅ ROC AUC on test data: {test_auc:.4f}")
test_report = classification_report(final_df['CHURN'], final_df['CHURN_PREDICTION'])
print("✅ Model Classification Report:\n", test_report)


In [None]:
# Show the dtype of each column in the held-out feature frame that was passed to the model.
X_test.dtypes

In [None]:
# Persist final_df to a Snowflake table so that a semantic view can be created
# on top of it and plugged into Snowflake Intelligence.
final_df_snowpark = session.create_dataframe(final_df)
final_df_snowpark.write.save_as_table("CHURN_DATA_EXPLANATIONS", mode="overwrite")