# Use O2C_Template Customized Notebook Template

In [51]:
!pip install --q lifelines statsmodels

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [1]:
import pandas as pd
import numpy as np

In [2]:
from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()

In [3]:
table_name = 'ORDER_TO_CASH_ENRICHED'

sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()

In [4]:
df_train = df[df['INVOICESTATUS'] == 'Paid']
df_new = df[df['INVOICESTATUS'] != 'Paid']

In [5]:
df_train.shape, df_new.shape

((170290, 68), (3001, 68))

In [6]:
# Assuming df is your dataframe
df['ORDERDATE'] = pd.to_datetime(df['ORDERDATE'], errors='coerce')
df['DELIVERYDATE'] = pd.to_datetime(df['DELIVERYDATE'], errors='coerce')
df['INVOICEDATE'] = pd.to_datetime(df['INVOICEDATE'], errors='coerce')
df['PAYMENTDATE'] = pd.to_datetime(df['PAYMENTDATE'], errors='coerce')
df['INVOICEDUEDATE'] = pd.to_datetime(df['INVOICEDUEDATE'], errors='coerce')
df['DELIVEREDON'] = pd.to_datetime(df['DELIVEREDON'], errors='coerce')

In [7]:
df['MONTH'] = pd.to_datetime(df['ORDERDATE']).dt.month
df['YEAR'] = pd.to_datetime(df['ORDERDATE']).dt.year
df['WEEK'] = pd.to_datetime(df['ORDERDATE']).dt.isocalendar().week
df['WEEKEND'] = pd.to_datetime(df['ORDERDATE']).dt.weekday >= 5  # True if the day is Saturday or Sunday
df['DAY_OF_THE_MONTH'] = pd.to_datetime(df['ORDERDATE']).dt.day

In [8]:
# Delivery Delay
df['DELIVERY_DELAY'] = (df['DELIVERYDATE'] - df['DELIVEREDON']).dt.days

# Payment Delay
df['PAYMENT_DELAY'] = (df['PAYMENTDATE'] - df['INVOICEDUEDATE']).dt.days

In [9]:
df.columns

Index(['ORDERID', 'ORDERITEMID', 'PRODUCTID', 'PRODUCTNAME', 'ORDERQUANTITY',
       'UNITPRICE', 'ORDERVALUE', 'CUSTOMERID', 'CUSTOMERNAME',
       'PRODUCTCATEGORY', 'ORDERDATE', 'ORDERSTATUS', 'ORDERAMOUNT',
       'ORDERCOUNT', 'SHIPMENTID', 'SHIPMENTDATE', 'DELIVERYDATE',
       'DELIVEREDON', 'CARRIER', 'SHIPMENTSTATUS', 'INVOICEID', 'INVOICEDATE',
       'INVOICEDUEDATE', 'PAYMENTDATE', 'INVOICESTATUS', 'PAYMENTMETHOD',
       'PRODUCTTYPE', 'QUANTITY', 'NAME', 'COMPANYTYPE', 'CONTACTDETAILS',
       'EMAILDETAILS', 'ADDRESSDETAILS', 'ADMINDETAILS', 'CREDITLIMIT',
       'CREDITRATING', 'CUSTOMERTYPE', 'CC_CUSTOMER_LIFETIME_VALUE',
       'CC_ORDER_FREQUENCY', 'CC_AVERAGE_ORDER_VALUE', 'CC_RECENCY',
       'CC_PREFERRED_PAYMENT_METHOD', 'CC_PREFERRED_PRODUCT_CATEGORY',
       'CC_PREFERRED_PRODUCT_TYPE', 'CC_AVG_ORDER_PROCESSING_TIME',
       'CC_AVG_INVOICE_PROCESSING_TIME', 'CC_AVG_DELIVERY_DELAY',
       'CC_AVG_PAYMENT_DELAY', 'CC_TOTAL_ORDERS', 'CC_TOTAL_DELAYS',
       'CC

In [10]:
cols = ['ORDERID','PRODUCTID','CUSTOMERID','PRODUCTNAME','ORDERQUANTITY'
,'UNITPRICE','ORDERVALUE','CUSTOMERNAME','PRODUCTCATEGORY','ORDERDATE'
,'PRODUCTTYPE','COMPANYTYPE','ADMINDETAILS','CREDITLIMIT','CREDITRATING','CUSTOMERTYPE'
,'CC_CUSTOMER_LIFETIME_VALUE','CC_ORDER_FREQUENCY', 'CC_AVERAGE_ORDER_VALUE', 'CC_RECENCY'
,'CC_PREFERRED_PAYMENT_METHOD', 'CC_PREFERRED_PRODUCT_CATEGORY'
,'CC_PREFERRED_PRODUCT_TYPE', 'CC_AVG_ORDER_PROCESSING_TIME'
,'CC_AVG_INVOICE_PROCESSING_TIME', 'CC_AVG_DELIVERY_DELAY'
,'CC_AVG_PAYMENT_DELAY', 'CC_TOTAL_ORDERS', 'CC_TOTAL_DELAYS'
,'CC_ORDER_CONSISTENCY', 'CC_INVOICE_CONSISTENCY'
,'CC_DELIVERY_CONSISTENCY', 'CC_PAYMENT_CONSISTENCY'
,'PR_TOTAL_SALES_VOLUME', 'PR_TOTAL_SALES_VALUE'
,'PR_AVG_ORDER_QUANTITY', 'PR_AVG_UNIT_PRICE', 'PR_NUMBER_OF_ORDERS'
,'PR_AVG_DELIVERY_TIME', 'PR_AVG_INVOICE_TIME', 'CAT_TOTAL_SALES_VOLUME'
,'CAT_TOTAL_SALES_VALUE', 'CAT_AVG_ORDER_QUANTITY', 'CAT_AVG_UNIT_PRICE'
,'CAT_NUMBER_OF_ORDERS', 'CAT_AVG_DELIVERY_TIME'
,'CAT_AVG_INVOICE_TIME','DELIVERY_DELAY','PAYMENT_DELAY'
,'MONTH', 'YEAR', 'WEEK', 'WEEKEND', 'DAY_OF_THE_MONTH','INVOICESTATUS','PAYMENTDATE']

In [11]:
df = df[cols]

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173291 entries, 0 to 173290
Data columns (total 56 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   ORDERID                         173291 non-null  object        
 1   PRODUCTID                       173291 non-null  object        
 2   CUSTOMERID                      173291 non-null  object        
 3   PRODUCTNAME                     173291 non-null  object        
 4   ORDERQUANTITY                   173291 non-null  int16         
 5   UNITPRICE                       173291 non-null  int32         
 6   ORDERVALUE                      173291 non-null  int32         
 7   CUSTOMERNAME                    173291 non-null  object        
 8   PRODUCTCATEGORY                 173291 non-null  object        
 9   ORDERDATE                       173291 non-null  datetime64[ns]
 10  PRODUCTTYPE                     173291 non-null  object 

In [13]:
df.tail()

Unnamed: 0,ORDERID,PRODUCTID,CUSTOMERID,PRODUCTNAME,ORDERQUANTITY,UNITPRICE,ORDERVALUE,CUSTOMERNAME,PRODUCTCATEGORY,ORDERDATE,...,CAT_AVG_INVOICE_TIME,DELIVERY_DELAY,PAYMENT_DELAY,MONTH,YEAR,WEEK,WEEKEND,DAY_OF_THE_MONTH,INVOICESTATUS,PAYMENTDATE
173286,OR-8eba737b-c6e3-4f33-82b9-2bcb439623c6,PID-152a2d71-4427-49ac-a84a-9720e73f4ac0,CID-54d06c49-31d4-4e2e-a4c5-219ae27349f2,Ergonomic Chair,772,299,230828,Marathon,Office Supplies,2023-07-04,...,7.97128,0.0,-5.0,7,2023,27,False,4,Paid,2023-07-15
173287,OR-3a1f21f9-b5a7-4b49-8aab-7fd4ded05489,PID-6e3e9bbf-0256-4c50-88ba-aa901a601722,CID-213f052b-75af-493d-b3e6-c0bbefc50c32,Gourmet Chocolate,790,9,7110,DOLCH,Food Beverages,2024-08-24,...,7.972449,-2.0,0.0,8,2024,34,True,24,Paid,2024-09-10
173288,OR-3a1f21f9-b5a7-4b49-8aab-7fd4ded05489,PID-ab3f0526-0015-47a8-b6a9-22cbe7e10459,CID-213f052b-75af-493d-b3e6-c0bbefc50c32,Greek Yogurt,421,3,1263,DOLCH,Food Beverages,2024-08-24,...,7.972449,-2.0,0.0,8,2024,34,True,24,Paid,2024-09-10
173289,OR-3a1f21f9-b5a7-4b49-8aab-7fd4ded05489,PID-08d031fd-c61f-47f3-b4d1-e9c79ba23b68,CID-213f052b-75af-493d-b3e6-c0bbefc50c32,Herbal Tea,450,2,900,DOLCH,Food Beverages,2024-08-24,...,7.972449,-2.0,0.0,8,2024,34,True,24,Paid,2024-09-10
173290,OR-1a3df4f8-27b1-4d77-b5ce-3fb49b0df85f,PID-5f01775e-530d-4262-be7c-6960da748d67,CID-4995f0b2-f8e7-4f44-b516-dee7004d9185,Whiteboards,916,19,17404,TautaLog,Office Supplies,2024-01-02,...,7.97128,0.0,8.0,1,2024,1,False,2,Paid,2024-01-26


In [14]:
from lifelines import CoxPHFitter

In [15]:
# Create duration and event columns
df['duration'] = (df['PAYMENTDATE'] - df['ORDERDATE']).dt.days
df['event'] = df['PAYMENTDATE'].notna().astype(int)  # 1 if payment was made, 0 otherwise

In [27]:
cols = ['duration','event','ORDERQUANTITY','UNITPRICE','ORDERVALUE'
,'CC_CUSTOMER_LIFETIME_VALUE','CC_ORDER_FREQUENCY', 'CC_AVERAGE_ORDER_VALUE', 'CC_RECENCY','CC_AVG_ORDER_PROCESSING_TIME'
,'CC_AVG_INVOICE_PROCESSING_TIME', 'CC_AVG_DELIVERY_DELAY','CC_AVG_PAYMENT_DELAY', 'CC_TOTAL_ORDERS', 'CC_TOTAL_DELAYS'
,'CC_ORDER_CONSISTENCY', 'CC_INVOICE_CONSISTENCY','CC_DELIVERY_CONSISTENCY', 'CC_PAYMENT_CONSISTENCY'
,'PR_TOTAL_SALES_VOLUME', 'PR_TOTAL_SALES_VALUE','PR_AVG_ORDER_QUANTITY', 'PR_AVG_UNIT_PRICE', 'PR_NUMBER_OF_ORDERS'
,'PR_AVG_DELIVERY_TIME', 'PR_AVG_INVOICE_TIME', 'CAT_TOTAL_SALES_VOLUME','CAT_TOTAL_SALES_VALUE', 'CAT_AVG_ORDER_QUANTITY'
,'CAT_AVG_UNIT_PRICE','CAT_NUMBER_OF_ORDERS', 'CAT_AVG_DELIVERY_TIME','CAT_AVG_INVOICE_TIME'
,'MONTH', 'YEAR', 'WEEK', 'DAY_OF_THE_MONTH']

In [28]:
df_model = df[cols]

In [29]:
df_model.shape

(173291, 37)

In [30]:
df_model.dropna(inplace=True)

In [31]:
df_model.shape

(171043, 37)

In [32]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Select the predictor variables
X = df_model.drop(columns=['duration', 'event'])

# Calculate VIF for each predictor variable
vif_data = pd.DataFrame()
vif_data['feature'] = X.columns

In [33]:
X.values

array([[2328, 9, 20952, ..., 2022, 45, 11],
       [2086, 5, 10430, ..., 2022, 45, 11],
       [150, 3, 450, ..., 2022, 45, 11],
       ...,
       [421, 3, 1263, ..., 2024, 34, 24],
       [450, 2, 900, ..., 2024, 34, 24],
       [916, 19, 17404, ..., 2024, 1, 2]], dtype=object)

In [34]:
for i in range(len(X.columns)):
    print(i)
    vif_data['VIF'] = variance_inflation_factor(X.values, i)

0


TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
print(vif_data)

In [None]:
# Example: Remove variables with VIF > 10
high_vif_features = vif_data[vif_data['VIF'] > 10]['feature']
df_model_reduced = df_model.drop(columns=high_vif_features)

In [None]:
# Initialize the Cox Proportional Hazards model
cph = CoxPHFitter()

In [None]:
# Fit the model
cph.fit(df_model_reduced, duration_col='duration', event_col='event')

In [None]:
# Print the summary of the model
cph.print_summary()

In [None]:
# Predict the survival function for a new customer order
# Example: new_customer_order = df_model.iloc[0]  # Replace with actual new order data
# survival_function = cph.predict_survival_function(new_customer_order)
# print(survival_function)