# Use Customer_Segmentation Notebook Template

In [1]:
!pip install --q --upgrade pip

In [2]:
!pip install --q seaborn

# Kernel Restart here

In [1]:
import numpy as np # linear algebra
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()

import warnings
warnings.filterwarnings('ignore')

In [2]:
table_name = 'CUSTOMER_DATA_INSIGHT_TRAIN_OUTPUT'

sf_df = my_session.sql("select * from {}".format(table_name))
train = sf_df.to_pandas()

In [3]:
table_name = 'CUSTOMER_DATA_INSIGHT_TEST_OUTPUT'

sf_df = my_session.sql("select * from {}".format(table_name))
test = sf_df.to_pandas()

In [4]:
test.shape

(40000, 32)

In [5]:
train = train[['CUSTOMER_ID', 'DATE', 'AGE', 'SEX', 'EDUCATION_LEVEL',
       'EMPLOYMENT_STATUS', 'HOBBIES', 'MARITAL_STATUS', 'DEPENDENTS',
       'REGISTRATION_DATE', 'BANK_ACCOUNT_TYPE', 'BALANCE',
       'BALANCE_FREQUENCY', 'CREDIT_LIMIT', 'CASH_ADVANCE',
       'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX', 'PURCHASES',
       'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'PURCHASES_FREQUENCY',
       'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
       'PURCHASES_TRX', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
       'CLUSTER', 'CLUSTER_TYPE']]

In [6]:
test = test[['CUSTOMER_ID', 'DATE', 'AGE', 'SEX', 'EDUCATION_LEVEL',
       'EMPLOYMENT_STATUS', 'HOBBIES', 'MARITAL_STATUS', 'DEPENDENTS',
       'REGISTRATION_DATE', 'BANK_ACCOUNT_TYPE', 'BALANCE',
       'BALANCE_FREQUENCY', 'CREDIT_LIMIT', 'CASH_ADVANCE',
       'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX', 'PURCHASES',
       'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'PURCHASES_FREQUENCY',
       'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
       'PURCHASES_TRX', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
       'CLUSTER', 'CLUSTER_TYPE']]

In [7]:
df = pd.concat([train, test], axis=0)

In [8]:
df.shape

(120000, 29)

In [9]:
df.tail()

Unnamed: 0,CUSTOMER_ID,DATE,AGE,SEX,EDUCATION_LEVEL,EMPLOYMENT_STATUS,HOBBIES,MARITAL_STATUS,DEPENDENTS,REGISTRATION_DATE,...,INSTALLMENTS_PURCHASES,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,PURCHASES_TRX,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,CLUSTER,CLUSTER_TYPE
39995,60e2911b-6a1c-4d35-bd6f-700a96055b7d,2022-12-01,32,FEMALE,JD,Employed,reading,Single,One Dependent,2020-02-12,...,0.0,Very low,Very low,Very low,0.0,1911.368074,381.694812,0.0,2,Cash Advance Users
39996,96454f77-584a-442b-8f70-bc38e8630908,2024-02-01,25,FEMALE,JD,Unemployed,reading,Single,One Dependent,2017-04-12,...,109.2648,Low,Very low,Low,4.32,0.0,0.0,0.0,0,Conservative Spenders
39997,1429938e-ab3d-4d43-b0a4-e0d3331501a6,2024-05-01,33,FEMALE,JD,Unemployed,hiking,Single,One Dependent,2019-04-12,...,269.88,Very high,Very low,Very high,26.0,4086.277963,1723.818986,0.5,0,Conservative Spenders
39998,558c05aa-e580-47a3-8257-c3b9dbbcca6f,2023-09-01,33,FEMALE,MD,Unemployed,board-games,Single,One Dependent,2005-11-12,...,250.9884,Very high,Very low,Very high,28.08,4372.31742,1663.485321,0.54,0,Conservative Spenders
39999,1e307f04-29b5-497d-b47f-e1aa0ee43fa0,2023-11-01,41,MALE,JD,Unemployed,base-jumping,Single,Two Dependents,2021-09-12,...,2548.1092,Very high,Very low,Very high,46.2,2666.034909,161.25989,0.7,3,Active Credit Users


In [10]:
to_drop = ['CUSTOMER_ID','DATE','AGE','SEX','EDUCATION_LEVEL','EMPLOYMENT_STATUS','HOBBIES','MARITAL_STATUS','BANK_ACCOUNT_TYPE',
 'DEPENDENTS','REGISTRATION_DATE','PURCHASES_FREQUENCY','ONEOFF_PURCHASES_FREQUENCY','PURCHASES_INSTALLMENTS_FREQUENCY','BALANCE_FREQUENCY','CLUSTER_TYPE']

In [11]:
df.drop(to_drop, axis= 1, inplace= True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120000 entries, 0 to 39999
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   BALANCE                 120000 non-null  float64
 1   CREDIT_LIMIT            120000 non-null  float64
 2   CASH_ADVANCE            120000 non-null  float64
 3   CASH_ADVANCE_FREQUENCY  120000 non-null  float64
 4   CASH_ADVANCE_TRX        120000 non-null  float64
 5   PURCHASES               120000 non-null  float64
 6   ONEOFF_PURCHASES        120000 non-null  float64
 7   INSTALLMENTS_PURCHASES  120000 non-null  float64
 8   PURCHASES_TRX           120000 non-null  float64
 9   PAYMENTS                120000 non-null  float64
 10  MINIMUM_PAYMENTS        120000 non-null  float64
 11  PRC_FULL_PAYMENT        120000 non-null  float64
 12  CLUSTER                 120000 non-null  int8   
dtypes: float64(12), int8(1)
memory usage: 12.0 MB


In [13]:
X = df.drop(['CLUSTER'],axis = 1 )
y = df[['CLUSTER']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3,random_state = 101)

In [14]:
X_train.shape, X_test.shape

((84000, 12), (36000, 12))

# Decision Tree Classifier

In [15]:
#Training our algorithm
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_prediction = clf.predict(X_test)

In [16]:
#Checking our prediction accuracy score
print(confusion_matrix(y_test, clf_prediction))

[[25046     0   126   128]
 [    0   307     0     8]
 [  132     0  5800    31]
 [  108    12    22  4280]]


In [17]:
print(classification_report(y_test, clf_prediction))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     25300
           1       0.96      0.97      0.97       315
           2       0.98      0.97      0.97      5963
           3       0.96      0.97      0.97      4422

    accuracy                           0.98     36000
   macro avg       0.97      0.98      0.97     36000
weighted avg       0.98      0.98      0.98     36000



In [18]:
clf.feature_importances_

array([0.07993775, 0.03887377, 0.28584169, 0.03316945, 0.10489753,
       0.34025286, 0.00675092, 0.01335958, 0.04494634, 0.02685201,
       0.01197134, 0.01314676])

In [19]:
pd.Series(clf.feature_importances_,index=X_train.columns).sort_values(ascending=False)

PURCHASES                 0.340253
CASH_ADVANCE              0.285842
CASH_ADVANCE_TRX          0.104898
BALANCE                   0.079938
PURCHASES_TRX             0.044946
CREDIT_LIMIT              0.038874
CASH_ADVANCE_FREQUENCY    0.033169
PAYMENTS                  0.026852
INSTALLMENTS_PURCHASES    0.013360
PRC_FULL_PAYMENT          0.013147
MINIMUM_PAYMENTS          0.011971
ONEOFF_PURCHASES          0.006751
dtype: float64

# Model registration Code

In [20]:
y_pred = pd.DataFrame(clf_prediction, columns=['PREDICTION'])

In [21]:
type(X_train),type(X_test),type(y_train), type(y_test), type(y_pred)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [22]:
from fosforml import register_model

In [23]:
## registering the model in Fosfor Insight Designer.
register_model(
    model_obj=clf, 
    session=my_session,
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    y_pred=y_pred,
    source="Notebook",
    dataset_name="CUSTOMER_DATA_INSIGHT_TEST_OUTPUT",
    dataset_source="Snowflake",
    #dataset_source="InMemory",
    name="Decision_Tree_Cluster_Classifier",
    description="Customer_Segmentation_Dtree_Cluster_Classifier",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)

Got error object of type 'NoneType' has no len() when trying to read default values from function: <class 'snowflake.ml.modeling.metrics.classification._register_confusion_matrix_computer.<locals>.ConfusionMatrixComputer'>. Proceeding without creating optional arguments
Got error object of type 'NoneType' has no len() when trying to read default values from function: <class 'snowflake.ml.modeling.metrics.metrics_utils.register_accumulator_udtf.<locals>.Accumulator'>. Proceeding without creating optional arguments


Calculating build time metrics

Progress: ██████████████                                                         20.0%
Calculating build time metrics

Progress: ████████████████████████████                                           40.0%


Got error object of type 'NoneType' has no len() when trying to read default values from function: <class 'snowflake.ml.modeling.metrics.classification._register_multilabel_confusion_matrix_computer.<locals>.MultilabelConfusionMatrixComputer'>. Proceeding without creating optional arguments
The version of package 'scikit-learn' in the local environment is 1.3.2, which does not fit the criteria for the requirement 'scikit-learn<1.4'. Your UDF might not work when the package version is different between the server and your local environment.
DataFrame.flatten() is deprecated since 0.7.0. Use `DataFrame.join_table_function()` instead.
Got error object of type 'NoneType' has no len() when trying to read default values from function: <class 'snowflake.ml.modeling.metrics.classification._register_multilabel_confusion_matrix_computer.<locals>.MultilabelConfusionMatrixComputer'>. Proceeding without creating optional arguments
The version of package 'scikit-learn' in the local environment is 1.

Calculating build time metrics

Progress: ██████████████████████████████████████████                             60.0%
Calculating build time metrics

Progress: ████████████████████████████████████████████████████████               80.0%
Calculating build time metrics

Progress: ██████████████████████████████████████████████████████████████████████ 100.0%


"Model 'MODEL_BDB4A35C_1386_4E11_A417_3344FF06266B_FDC_DECISION_TREE_CLUSTER_CLASSIFIER' registered successfully."

# Another Version

In [24]:
clf_prediction = clf.predict(X_train)

In [25]:
y_pred = pd.DataFrame(clf_prediction, columns=['PREDICTION'])

In [26]:
## registering the model in Fosfor Insight Designer.
register_model(
    model_obj=clf, 
    session=my_session,
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    y_pred=y_pred,
    source="Notebook",
    dataset_name="CUSTOMER_DATA_INSIGHT_TEST_OUTPUT",
    dataset_source="Snowflake",
    #dataset_source="InMemory",
    name="Decision_Tree_Cluster_Classifier",
    description="Customer_Segmentation_Dtree_Cluster_Classifier",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)



Calculating build time metrics

Progress: ██████████████                                                         20.0%
Calculating build time metrics

Progress: ████████████████████████████                                           40.0%




(1300) (1304): 01b6cc42-070e-58e8-0072-f30310704ffe: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/classification.py", line 1059, in end_partition
  File "/usr/lib/python_udf/8f8b01a8f02996ac566993cab9e8f275a40a273ccd1b40bc7e44bac11698e30b/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/8f8b01a8f02996ac566993cab9e8f275a40a273ccd1b40bc7e44bac11698e30b/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 505, in multilabel_confusion_matrix
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/usr/lib/python_udf/8f8b01a8f02996ac566993cab9e8f275a40a273ccd1b40bc7e44bac11698e30b/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 93, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of unknown an



(1300) (1304): 01b6cc42-070e-58e8-0072-f3031087201e: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/classification.py", line 1059, in end_partition
  File "/usr/lib/python_udf/8f8b01a8f02996ac566993cab9e8f275a40a273ccd1b40bc7e44bac11698e30b/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/8f8b01a8f02996ac566993cab9e8f275a40a273ccd1b40bc7e44bac11698e30b/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 505, in multilabel_confusion_matrix
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/usr/lib/python_udf/8f8b01a8f02996ac566993cab9e8f275a40a273ccd1b40bc7e44bac11698e30b/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 93, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of unknown an



(1300) (1304): 01b6cc42-070e-1713-0072-f30310708f2e: 100357 (P0000): Python Interpreter Error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/snowflake/ml/modeling/metrics/classification.py", line 1059, in end_partition
  File "/usr/lib/python_udf/8f8b01a8f02996ac566993cab9e8f275a40a273ccd1b40bc7e44bac11698e30b/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python_udf/8f8b01a8f02996ac566993cab9e8f275a40a273ccd1b40bc7e44bac11698e30b/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 505, in multilabel_confusion_matrix
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
  File "/usr/lib/python_udf/8f8b01a8f02996ac566993cab9e8f275a40a273ccd1b40bc7e44bac11698e30b/lib/python3.9/site-packages/sklearn/metrics/_classification.py", line 93, in _check_targets
    raise ValueError(
ValueError: Classification metrics can't handle a mix of unknown an

"Model 'MODEL_BDB4A35C_1386_4E11_A417_3344FF06266B_FDC_DECISION_TREE_CLUSTER_CLASSIFIER' registered successfully."