# Use O2C_Template Customized Notebook Template

In [156]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, f1_score, r2_score, roc_auc_score, confusion_matrix, accuracy_score

In [157]:
from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()

In [158]:
# Pull the whole enriched order-to-cash table through the session and
# materialise it locally as a pandas DataFrame.
table_name = 'ORDER_TO_CASH_ENRICHED'
query = f"select * from {table_name}"

sf_df = my_session.sql(query)
df = sf_df.to_pandas()

In [159]:
# Rows whose INVOICESTATUS is one of the two payment outcomes go to df_train;
# every other row (including a missing status) goes to df_test.
outcome_statuses = ['LATE_PAYMENT', 'ONTIME_PAYMENT']
has_outcome = df['INVOICESTATUS'].isin(outcome_statuses)

df_train = df[has_outcome]
df_test = df[~has_outcome]

In [160]:
df_train.shape, df_test.shape

((170445, 79), (2992, 79))

In [161]:
# Snapshot the full table as df_final, then rebind df to an independent copy
# of the training subset. Both right-hand sides are evaluated before either
# name is rebound, so df_final still receives the full table.
df_final, df = df.copy(), df_train.copy()

In [162]:
df_final.shape, df.shape

((173437, 79), (170445, 79))

In [163]:
# Convert every date-like column to datetime64; values that cannot be parsed
# become NaT (errors='coerce') instead of raising.
for date_col in ['ORDERDATE', 'DELIVERYDATE', 'INVOICEDATE',
                 'PAYMENTDATE', 'INVOICEDUEDATE', 'DELIVEREDON']:
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

In [164]:
# Derive calendar features from the order date (parsed once and reused).
order_date = pd.to_datetime(df['ORDERDATE'])

df['MONTH'] = order_date.dt.month
df['YEAR'] = order_date.dt.year
df['WEEK'] = order_date.dt.isocalendar().week
# dt.weekday counts Monday as 0, so 5 and 6 are Saturday and Sunday.
df['WEEKEND'] = order_date.dt.weekday >= 5
df['DAY_OF_THE_MONTH'] = order_date.dt.day

In [165]:
# Delivery delay: whole days from DELIVERYDATE to DELIVEREDON
# (positive when DELIVEREDON is the later date).
delivery_gap = df['DELIVEREDON'] - df['DELIVERYDATE']
df['DELIVERY_DELAY'] = delivery_gap.dt.days

# Payment delay: whole days from INVOICEDUEDATE to PAYMENTDATE
# (positive when the payment date falls after the due date).
payment_gap = df['PAYMENTDATE'] - df['INVOICEDUEDATE']
df['PAYMENT_DELAY'] = payment_gap.dt.days

In [166]:
df[['PAYMENTDATE','INVOICEDUEDATE','PAYMENT_DELAY','DELIVEREDON','DELIVERYDATE','DELIVERY_DELAY']].head()

Unnamed: 0,PAYMENTDATE,INVOICEDUEDATE,PAYMENT_DELAY,DELIVEREDON,DELIVERYDATE,DELIVERY_DELAY
0,2022-10-11,2022-09-28,13,2022-09-02,2022-08-28,5
1,2022-09-07,2022-09-16,-9,2022-08-22,2022-08-20,2
2,2022-06-09,2022-06-09,0,2022-05-16,2022-05-14,2
3,2022-06-09,2022-06-09,0,2022-05-16,2022-05-14,2
4,2024-03-25,2024-03-13,12,2024-02-12,2024-02-10,2


In [167]:
def set_flag(delay):
    """Turn a delay measured in days into a binary late/not-late flag.

    Args:
        delay: Number of days of delay (may be negative or NaN).

    Returns:
        1 when ``delay`` is at least one day, otherwise 0. NaN compares
        False against 1, so a missing delay also yields 0.
    """
    if delay >= 1:
        return 1
    return 0

In [168]:
# Build one binary flag column per delay column (flag column -> source column).
flag_sources = {
    'PAYMENT_DELAY_FLAG': 'PAYMENT_DELAY',
    'DELIVERY_DELAY_FLAG': 'DELIVERY_DELAY',
}
for flag_col, delay_col in flag_sources.items():
    df[flag_col] = df[delay_col].apply(set_flag)

In [169]:
df['PAYMENT_DELAY_FLAG'].value_counts()

PAYMENT_DELAY_FLAG
1    89412
0    81033
Name: count, dtype: int64

In [170]:
df['DELIVERY_DELAY_FLAG'].value_counts()

DELIVERY_DELAY_FLAG
1    170445
Name: count, dtype: int64

In [171]:
# Show every column name on one line as a plain Python list.
print(df.columns.tolist())

['ORDERID', 'ORDERITEMID', 'PRODUCTID', 'PRODUCTNAME', 'ORDERQUANTITY', 'UNITPRICE', 'ORDERVALUE', 'CUSTOMERID', 'CUSTOMERNAME', 'PRODUCTCATEGORY', 'ORDERDATE', 'ORDERSTATUS', 'ORDERAMOUNT', 'ORDERCOUNT', 'SHIPMENTID', 'SHIPMENTDATE', 'DELIVERYDATE', 'DELIVEREDON', 'CARRIER', 'SHIPMENTSTATUS', 'INVOICEID', 'INVOICEDATE', 'INVOICEDUEDATE', 'PAYMENTDATE', 'INVOICESTATUS', 'PAYMENTMETHOD', 'PRODUCTTYPE', 'QUANTITY', 'COMPANYTYPE', 'CONTACTDETAILS', 'EMAILDETAILS', 'ADDRESSDETAILS', 'ADMINDETAILS', 'CREDITLIMIT', 'CUSTOMERSINCE', 'PAYMENTTERMS', 'CREDITLIMITTYPE', 'CUSTOMERTYPE', 'SUPPLIERNAME', 'SUPPLIERID', 'CC_CUSTOMER_LIFETIME_VALUE', 'CC_ORDER_FREQUENCY', 'CC_AVERAGE_ORDER_VALUE', 'CC_RECENCY', 'CC_PREFERRED_PAYMENT_METHOD', 'CC_PREFERRED_PRODUCT_CATEGORY', 'CC_PREFERRED_PRODUCT_TYPE', 'CC_AVG_ORDER_PROCESSING_TIME', 'CC_AVG_INVOICE_PROCESSING_TIME', 'CC_AVG_DELIVERY_DELAY', 'CC_AVG_PAYMENT_DELAY', 'CC_TOTAL_ORDERS', 'CC_TOTAL_DELAYS', 'CC_ORDER_CONSISTENCY', 'CC_INVOICE_CONSISTENCY',

In [172]:
# Model inputs grouped by column prefix, followed by the target column.
# NOTE(review): CC_AVG_PAYMENT_DELAY, CC_TOTAL_DELAYS and CC_PAYMENT_CONSISTENCY
# look like aggregates of payment behaviour; confirm they are not computed from
# the same invoices whose PAYMENT_DELAY_FLAG is being predicted.
order_features = ['ORDERAMOUNT', 'ORDERCOUNT']

cc_features = [
    'CC_CUSTOMER_LIFETIME_VALUE', 'CC_ORDER_FREQUENCY', 'CC_AVERAGE_ORDER_VALUE',
    'CC_RECENCY', 'CC_AVG_ORDER_PROCESSING_TIME', 'CC_AVG_INVOICE_PROCESSING_TIME',
    'CC_AVG_DELIVERY_DELAY', 'CC_AVG_PAYMENT_DELAY', 'CC_TOTAL_ORDERS',
    'CC_TOTAL_DELAYS', 'CC_ORDER_CONSISTENCY', 'CC_INVOICE_CONSISTENCY',
    'CC_DELIVERY_CONSISTENCY', 'CC_PAYMENT_CONSISTENCY',
]

cat_features = [
    'CAT_TOTAL_SALES_VOLUME', 'CAT_TOTAL_SALES_VALUE', 'CAT_AVG_ORDER_QUANTITY',
    'CAT_AVG_UNIT_PRICE', 'CAT_NUMBER_OF_ORDERS', 'CAT_AVG_DELIVERY_TIME',
    'CAT_AVG_INVOICE_TIME',
]

sp_features = [
    'SP_CUSTOMER_LIFETIME_VALUE', 'SP_ORDER_FREQUENCY', 'SP_AVERAGE_ORDER_VALUE',
    'SP_AVG_ORDER_PROCESSING_TIME', 'SP_AVG_DELIVERY_DELAY', 'SP_TOTAL_ORDERS',
    'SP_ORDER_CONSISTENCY', 'SP_DELIVERY_CONSISTENCY',
]

calendar_features = ['MONTH', 'YEAR', 'WEEK', 'WEEKEND', 'DAY_OF_THE_MONTH']

cols = (
    order_features
    + cc_features
    + cat_features
    + sp_features
    + calendar_features
    + ['PAYMENT_DELAY_FLAG']
)

In [173]:
# Keep only the modelling columns (features plus PAYMENT_DELAY_FLAG), all rows.
df_payments = df.loc[:, cols]

In [174]:
df_payments.head()

Unnamed: 0,ORDERAMOUNT,ORDERCOUNT,CC_CUSTOMER_LIFETIME_VALUE,CC_ORDER_FREQUENCY,CC_AVERAGE_ORDER_VALUE,CC_RECENCY,CC_AVG_ORDER_PROCESSING_TIME,CC_AVG_INVOICE_PROCESSING_TIME,CC_AVG_DELIVERY_DELAY,CC_AVG_PAYMENT_DELAY,...,SP_AVG_DELIVERY_DELAY,SP_TOTAL_ORDERS,SP_ORDER_CONSISTENCY,SP_DELIVERY_CONSISTENCY,MONTH,YEAR,WEEK,WEEKEND,DAY_OF_THE_MONTH,PAYMENT_DELAY_FLAG
0,1410000,1,1140591720,582,946549.145228,-91,15.033195,36.940249,3.034025,12.499585,...,3.018392,32025,1.718133,1.415584,8,2022,32,False,10,1
1,37255,1,14395460,568,12672.059859,-71,4.529049,17.872359,1.490317,-6.616197,...,1.494201,75789,0.818478,0.49997,8,2022,33,False,15,0
2,23085,2,15289447,601,13079.082121,-91,4.523524,18.497861,1.495295,-6.05047,...,1.494201,75789,0.818478,0.49997,5,2022,19,False,9,0
3,23085,2,15289447,601,13079.082121,-91,4.523524,18.497861,1.495295,-6.05047,...,1.494201,75789,0.818478,0.49997,5,2022,19,False,9,0
4,140075,1,41857229,590,35056.305695,-80,4.520101,29.511725,1.446399,0.010888,...,1.494201,75789,0.818478,0.49997,2,2024,6,False,7,1


In [175]:
df_payments.shape

(170445, 37)

In [176]:
# Collapse rows that are identical across every selected column, keeping the
# first occurrence. The surviving rows keep their original index labels.
df_payments = df_payments.drop_duplicates(keep='first')

In [177]:
df_payments.shape

(84999, 37)

In [178]:
df_payments['PAYMENT_DELAY_FLAG'].value_counts()

PAYMENT_DELAY_FLAG
0    45954
1    39045
Name: count, dtype: int64

In [179]:
# Target is the late-payment flag; every other selected column is a predictor.
target_col = 'PAYMENT_DELAY_FLAG'
y = df_payments[target_col]
X = df_payments.drop(target_col, axis=1)

In [180]:
X.shape

(84999, 36)

# Standard Scaling of Input

In [181]:
# Standardise every predictor to zero mean and unit variance.
# StandardScaler is already imported in the notebook's first cell.
# NOTE(review): the scaler is fitted on all rows, i.e. before the later
# train/test split, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(X)

X_scaled = scaler.transform(X)

# transform() returns a plain array; wrap it back into a frame with the
# original column names (the frame gets a fresh 0..n-1 index).
df_scaled = pd.DataFrame(data=X_scaled, columns=X.columns)

In [182]:
df_scaled.shape

(84999, 36)

In [183]:
# Attach the target positionally: df_scaled has a fresh 0..n-1 index while y
# still carries the de-duplicated frame's labels, so label alignment must be
# bypassed by assigning the raw array.
df_scaled['PAYMENT_DELAY_FLAG'] = y.to_numpy()

In [184]:
df_scaled['PAYMENT_DELAY_FLAG'].value_counts()

PAYMENT_DELAY_FLAG
0    45954
1    39045
Name: count, dtype: int64

# Model Training for Payment Delay Prediction

In [192]:
# Re-derive predictors and target, this time from the standardised frame.
target_col = 'PAYMENT_DELAY_FLAG'
y = df_scaled[target_col]
X = df_scaled.drop(target_col, axis=1)

In [193]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Decision Tree Classifier

In [194]:
# Baseline: a single decision tree with default hyper-parameters.
# random_state is pinned (same seed as the train/test split) because the tree
# permutes candidate features at each split; without a seed the fitted tree and
# the reported metrics can change from one run to the next.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

In [195]:
# Decision-tree predictions for the held-out rows. The name y_pred is reused
# for the random forest's predictions further down the notebook.
y_pred = dtc.predict(X_test)

In [196]:
# Accuracy on the training rows versus the held-out rows; y_pred holds the
# decision tree's held-out predictions from the preceding cell.
dtc_train_pred = dtc.predict(X_train)
dtc_train_acc = accuracy_score(y_train, dtc_train_pred)
dtc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of Decision Tree is : {dtc_train_acc}")
print(f"Test accuracy of Decision Tree is : {dtc_test_acc}")

# Confusion matrix (rows: true class, columns: predicted class) followed by
# the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

Training accuracy of Decision Tree is : 1.0
Test accuracy of Decision Tree is : 0.9679529411764706
[[11102   379]
 [  302  9467]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     11481
           1       0.96      0.97      0.97      9769

    accuracy                           0.97     21250
   macro avg       0.97      0.97      0.97     21250
weighted avg       0.97      0.97      0.97     21250



# Random Forest Classifier

In [197]:
# Random forest with default hyper-parameters; this is the estimator that is
# later passed to register_model.
# random_state is pinned (same seed as the train/test split) because the forest
# bootstraps rows and samples features randomly; without a seed the registered
# model and its reported metrics cannot be reproduced.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [198]:
# Random-forest predictions for the held-out rows. This overwrites the decision
# tree's y_pred, and it is this array that is later wrapped into y_pred_df.
y_pred = rfc.predict(X_test)

In [199]:
# Accuracy on the training rows versus the held-out rows; y_pred holds the
# random forest's held-out predictions from the preceding cell.
rfc_train_pred = rfc.predict(X_train)
rfc_train_acc = accuracy_score(y_train, rfc_train_pred)
rfc_test_acc = accuracy_score(y_test, y_pred)

print(f"Training accuracy of RandomForest is : {rfc_train_acc}")
print(f"Test accuracy of RandomForest is : {rfc_test_acc}")

# Confusion matrix (rows: true class, columns: predicted class) followed by
# the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

Training accuracy of RandomForest is : 0.9999686269588542
Test accuracy of RandomForest is : 0.9833411764705883
[[11443    38]
 [  316  9453]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     11481
           1       1.00      0.97      0.98      9769

    accuracy                           0.98     21250
   macro avg       0.98      0.98      0.98     21250
weighted avg       0.98      0.98      0.98     21250



In [200]:
from fosforml import register_model

In [201]:
type(X_train), type(X_test),type(y_train),type(y_test), type(y_pred)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.series.Series,
 pandas.core.series.Series,
 numpy.ndarray)

In [202]:
# Wrap the target Series in single-column DataFrames; the column keeps the
# Series name and the rows keep their split index labels.
y_train_df = y_train.to_frame()
y_test_df = y_test.to_frame()

In [203]:
# Wrap the held-out predictions in a one-column frame. predict() returns a bare
# array, so without an explicit index the frame would be labelled 0..n-1 while
# X_test and y_test_df keep the shuffled labels produced by the split; reusing
# X_test.index keeps the three frames row-aligned under label-based joins.
# y_pred must be the random forest's predictions at this point, since the name
# is also used for the decision tree earlier in the notebook.
y_pred_df = pd.DataFrame(
    y_pred,
    index=X_test.index,
    columns=['PREDICTED_PAYMENT_DELAY'],
)

In [204]:
type(X_train), type(X_test),type(y_train_df),type(y_test_df), type(y_pred_df)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

# Model Registration

In [None]:
## Register the fitted random forest in Fosfor Insight Designer, together with
## the train/test frames and the held-out predictions.
# NOTE(review): rfc was fitted on StandardScaler-transformed features and the
# scaler is not part of model_obj; confirm that whatever scores this model
# applies the same scaling first.
# NOTE(review): the scikit-learn pin below should match the version installed in
# the kernel that fitted rfc - TODO confirm.
register_model(
    model_obj=rfc, 
    session=my_session,
    x_train=X_train,
    y_train=y_train_df,
    x_test=X_test,
    y_test=y_test_df,
    y_pred=y_pred_df,
    source="Notebook",
    dataset_name="ORDER_TO_CASH_ENRICHED",
    dataset_source="Snowflake",
    # Alternative dataset_source value, left disabled:
    #dataset_source="InMemory",
    name="RandomForest_Payment_Delay_Classifier",
    description="RandomForest model trained via Notebook to identify delay order payment",
    flavour="sklearn",
    model_type="classification",
    conda_dependencies=["scikit-learn==1.3.2"]
)