# Use O2C_Template Customized Notebook Template

In [63]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, f1_score, r2_score, roc_auc_score, confusion_matrix, accuracy_score

In [64]:
from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()

In [65]:
table_name = 'ORDER_TO_CASH_ENRICHED'

sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()

In [66]:
df_train = df[df['INVOICESTATUS'].isin(['LATE_PAYMENT','ONTIME_PAYMENT'])]
df_test = df[~df['INVOICESTATUS'].isin(['LATE_PAYMENT','ONTIME_PAYMENT'])]

In [67]:
df_train.shape, df_test.shape

((170445, 79), (2992, 79))

In [68]:
df_final = df.copy()
df = df_train.copy()

In [69]:
df_final.shape, df.shape

((173437, 79), (170445, 79))

In [70]:
# Assuming df is your dataframe
df['ORDERDATE'] = pd.to_datetime(df['ORDERDATE'], errors='coerce')
df['DELIVERYDATE'] = pd.to_datetime(df['DELIVERYDATE'], errors='coerce')
df['INVOICEDATE'] = pd.to_datetime(df['INVOICEDATE'], errors='coerce')
df['PAYMENTDATE'] = pd.to_datetime(df['PAYMENTDATE'], errors='coerce')
df['INVOICEDUEDATE'] = pd.to_datetime(df['INVOICEDUEDATE'], errors='coerce')
df['DELIVEREDON'] = pd.to_datetime(df['DELIVEREDON'], errors='coerce')

In [71]:
df['MONTH'] = pd.to_datetime(df['ORDERDATE']).dt.month
df['YEAR'] = pd.to_datetime(df['ORDERDATE']).dt.year
df['WEEK'] = pd.to_datetime(df['ORDERDATE']).dt.isocalendar().week
df['WEEKEND'] = pd.to_datetime(df['ORDERDATE']).dt.weekday >= 5  # True if the day is Saturday or Sunday
df['DAY_OF_THE_MONTH'] = pd.to_datetime(df['ORDERDATE']).dt.day

In [72]:
# Delivery Delay
df['DELIVERY_DELAY'] = (df['DELIVEREDON'] - df['DELIVERYDATE']).dt.days

# Payment Delay
df['PAYMENT_DELAY'] = (df['PAYMENTDATE'] - df['INVOICEDUEDATE']).dt.days

In [73]:
df[['PAYMENTDATE','INVOICEDUEDATE','PAYMENT_DELAY','DELIVEREDON','DELIVERYDATE','DELIVERY_DELAY']].head()

Unnamed: 0,PAYMENTDATE,INVOICEDUEDATE,PAYMENT_DELAY,DELIVEREDON,DELIVERYDATE,DELIVERY_DELAY
0,2022-10-11,2022-09-28,13,2022-09-02,2022-08-28,5
1,2022-09-07,2022-09-16,-9,2022-08-22,2022-08-20,2
2,2022-06-09,2022-06-09,0,2022-05-16,2022-05-14,2
3,2022-06-09,2022-06-09,0,2022-05-16,2022-05-14,2
4,2024-03-25,2024-03-13,12,2024-02-12,2024-02-10,2


In [74]:
def set_flag(delay):
    return 'Y' if delay >= 1 else 'N'

In [75]:
df['PAYMENT_DELAY_FLAG'] = df['PAYMENT_DELAY'].apply(set_flag)
df['DELIVERY_DELAY_FLAG'] = df['DELIVERY_DELAY'].apply(set_flag)

In [76]:
df['PAYMENT_DELAY_FLAG'].value_counts()

PAYMENT_DELAY_FLAG
Y    89412
N    81033
Name: count, dtype: int64

In [77]:
df['DELIVERY_DELAY_FLAG'].value_counts()

DELIVERY_DELAY_FLAG
Y    170445
Name: count, dtype: int64

In [78]:
print(list(df.columns))

['ORDERID', 'ORDERITEMID', 'PRODUCTID', 'PRODUCTNAME', 'ORDERQUANTITY', 'UNITPRICE', 'ORDERVALUE', 'CUSTOMERID', 'CUSTOMERNAME', 'PRODUCTCATEGORY', 'ORDERDATE', 'ORDERSTATUS', 'ORDERAMOUNT', 'ORDERCOUNT', 'SHIPMENTID', 'SHIPMENTDATE', 'DELIVERYDATE', 'DELIVEREDON', 'CARRIER', 'SHIPMENTSTATUS', 'INVOICEID', 'INVOICEDATE', 'INVOICEDUEDATE', 'PAYMENTDATE', 'INVOICESTATUS', 'PAYMENTMETHOD', 'PRODUCTTYPE', 'QUANTITY', 'COMPANYTYPE', 'CONTACTDETAILS', 'EMAILDETAILS', 'ADDRESSDETAILS', 'ADMINDETAILS', 'CREDITLIMIT', 'CUSTOMERSINCE', 'PAYMENTTERMS', 'CREDITLIMITTYPE', 'CUSTOMERTYPE', 'SUPPLIERNAME', 'SUPPLIERID', 'CC_CUSTOMER_LIFETIME_VALUE', 'CC_ORDER_FREQUENCY', 'CC_AVERAGE_ORDER_VALUE', 'CC_RECENCY', 'CC_PREFERRED_PAYMENT_METHOD', 'CC_PREFERRED_PRODUCT_CATEGORY', 'CC_PREFERRED_PRODUCT_TYPE', 'CC_AVG_ORDER_PROCESSING_TIME', 'CC_AVG_INVOICE_PROCESSING_TIME', 'CC_AVG_DELIVERY_DELAY', 'CC_AVG_PAYMENT_DELAY', 'CC_TOTAL_ORDERS', 'CC_TOTAL_DELAYS', 'CC_ORDER_CONSISTENCY', 'CC_INVOICE_CONSISTENCY',

In [118]:
cols = ['ORDERAMOUNT','ORDERCOUNT',
'CC_CUSTOMER_LIFETIME_VALUE', 'CC_ORDER_FREQUENCY', 'CC_AVERAGE_ORDER_VALUE', 'CC_RECENCY', 'CC_AVG_ORDER_PROCESSING_TIME', 'CC_AVG_INVOICE_PROCESSING_TIME', 
'CC_AVG_DELIVERY_DELAY', 'CC_AVG_PAYMENT_DELAY', 'CC_TOTAL_ORDERS', 'CC_TOTAL_DELAYS', 'CC_ORDER_CONSISTENCY', 'CC_INVOICE_CONSISTENCY', 
'CC_DELIVERY_CONSISTENCY', 'CC_PAYMENT_CONSISTENCY', 'CAT_TOTAL_SALES_VOLUME', 'CAT_TOTAL_SALES_VALUE', 
'CAT_AVG_ORDER_QUANTITY', 'CAT_AVG_UNIT_PRICE', 'CAT_NUMBER_OF_ORDERS', 'CAT_AVG_DELIVERY_TIME', 'CAT_AVG_INVOICE_TIME', 
'SP_CUSTOMER_LIFETIME_VALUE', 'SP_ORDER_FREQUENCY', 'SP_AVERAGE_ORDER_VALUE', 'SP_AVG_ORDER_PROCESSING_TIME', 'SP_AVG_DELIVERY_DELAY', 
'SP_TOTAL_ORDERS', 'SP_ORDER_CONSISTENCY', 'SP_DELIVERY_CONSISTENCY', 'MONTH', 'YEAR', 'WEEK', 'WEEKEND', 'DAY_OF_THE_MONTH', 'PAYMENT_DELAY_FLAG']

In [119]:
df_payments = df[cols]

In [120]:
df_payments.head()

Unnamed: 0,ORDERAMOUNT,ORDERCOUNT,CC_CUSTOMER_LIFETIME_VALUE,CC_ORDER_FREQUENCY,CC_AVERAGE_ORDER_VALUE,CC_RECENCY,CC_AVG_ORDER_PROCESSING_TIME,CC_AVG_INVOICE_PROCESSING_TIME,CC_AVG_DELIVERY_DELAY,CC_AVG_PAYMENT_DELAY,...,SP_AVG_DELIVERY_DELAY,SP_TOTAL_ORDERS,SP_ORDER_CONSISTENCY,SP_DELIVERY_CONSISTENCY,MONTH,YEAR,WEEK,WEEKEND,DAY_OF_THE_MONTH,PAYMENT_DELAY_FLAG
0,1410000,1,1140591720,582,946549.145228,-91,15.033195,36.940249,3.034025,12.499585,...,3.018392,32025,1.718133,1.415584,8,2022,32,False,10,Y
1,37255,1,14395460,568,12672.059859,-71,4.529049,17.872359,1.490317,-6.616197,...,1.494201,75789,0.818478,0.49997,8,2022,33,False,15,N
2,23085,2,15289447,601,13079.082121,-91,4.523524,18.497861,1.495295,-6.05047,...,1.494201,75789,0.818478,0.49997,5,2022,19,False,9,N
3,23085,2,15289447,601,13079.082121,-91,4.523524,18.497861,1.495295,-6.05047,...,1.494201,75789,0.818478,0.49997,5,2022,19,False,9,N
4,140075,1,41857229,590,35056.305695,-80,4.520101,29.511725,1.446399,0.010888,...,1.494201,75789,0.818478,0.49997,2,2024,6,False,7,Y


In [121]:
df_payments.shape

(170445, 37)

In [122]:
df_payments = df_payments.drop_duplicates()

In [123]:
df_payments.shape

(84999, 37)

In [124]:
df_payments['PAYMENT_DELAY_FLAG'].value_counts()

PAYMENT_DELAY_FLAG
N    45954
Y    39045
Name: count, dtype: int64

In [125]:
# Separate features and target variable
X = df_payments.drop(columns=['PAYMENT_DELAY_FLAG'])
y = df_payments['PAYMENT_DELAY_FLAG']

In [126]:
X.shape

(84999, 36)

In [128]:
from sklearn.preprocessing import StandardScaler

# Standardize the predictor variables
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

df_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [129]:
df_scaled.shape

(84999, 36)

In [130]:
df_scaled['PAYMENT_DELAY_FLAG'] = y

In [132]:
df_scaled.columns

Index(['ORDERAMOUNT', 'ORDERCOUNT', 'CC_CUSTOMER_LIFETIME_VALUE',
       'CC_ORDER_FREQUENCY', 'CC_AVERAGE_ORDER_VALUE', 'CC_RECENCY',
       'CC_AVG_ORDER_PROCESSING_TIME', 'CC_AVG_INVOICE_PROCESSING_TIME',
       'CC_AVG_DELIVERY_DELAY', 'CC_AVG_PAYMENT_DELAY', 'CC_TOTAL_ORDERS',
       'CC_TOTAL_DELAYS', 'CC_ORDER_CONSISTENCY', 'CC_INVOICE_CONSISTENCY',
       'CC_DELIVERY_CONSISTENCY', 'CC_PAYMENT_CONSISTENCY',
       'CAT_TOTAL_SALES_VOLUME', 'CAT_TOTAL_SALES_VALUE',
       'CAT_AVG_ORDER_QUANTITY', 'CAT_AVG_UNIT_PRICE', 'CAT_NUMBER_OF_ORDERS',
       'CAT_AVG_DELIVERY_TIME', 'CAT_AVG_INVOICE_TIME',
       'SP_CUSTOMER_LIFETIME_VALUE', 'SP_ORDER_FREQUENCY',
       'SP_AVERAGE_ORDER_VALUE', 'SP_AVG_ORDER_PROCESSING_TIME',
       'SP_AVG_DELIVERY_DELAY', 'SP_TOTAL_ORDERS', 'SP_ORDER_CONSISTENCY',
       'SP_DELIVERY_CONSISTENCY', 'MONTH', 'YEAR', 'WEEK', 'WEEKEND',
       'DAY_OF_THE_MONTH', 'PAYMENT_DELAY_FLAG'],
      dtype='object')

In [133]:
df_scaled.head()

Unnamed: 0,ORDERAMOUNT,ORDERCOUNT,CC_CUSTOMER_LIFETIME_VALUE,CC_ORDER_FREQUENCY,CC_AVERAGE_ORDER_VALUE,CC_RECENCY,CC_AVG_ORDER_PROCESSING_TIME,CC_AVG_INVOICE_PROCESSING_TIME,CC_AVG_DELIVERY_DELAY,CC_AVG_PAYMENT_DELAY,...,SP_AVG_DELIVERY_DELAY,SP_TOTAL_ORDERS,SP_ORDER_CONSISTENCY,SP_DELIVERY_CONSISTENCY,MONTH,YEAR,WEEK,WEEKEND,DAY_OF_THE_MONTH,PAYMENT_DELAY_FLAG
0,1.818268,-1.228878,3.53931,0.726324,3.38349,-0.821854,1.593069,1.545543,1.132921,1.634803,...,1.113871,-0.635376,0.438009,1.090469,0.577216,-0.373741,0.500131,-0.632448,-0.653822,Y
1,-0.279858,-1.228878,-0.390061,0.030769,-0.391811,1.581225,-0.956234,-1.018776,-0.91834,-0.981598,...,-0.914599,1.093012,-0.875847,-0.914548,0.577216,-0.373741,0.569976,-0.632448,-0.084781,N
2,-0.301515,-0.006414,-0.386941,1.670292,-0.390166,-0.821854,-0.957575,-0.934656,-0.911725,-0.904166,...,-0.914599,1.093012,-0.875847,-0.914548,-0.339514,-0.373741,-0.407863,-0.632448,-0.767631,N
3,-0.122706,-1.228878,-0.294245,1.123784,-0.301321,0.499839,-0.958405,0.546528,-0.976698,-0.074541,...,-0.914599,1.093012,-0.875847,-0.914548,-1.256245,3.282751,-1.315857,-0.632448,-0.995247,
4,-0.312708,1.21605,-0.307762,-0.267326,-0.303575,-0.942008,-0.972652,0.490779,-0.901249,-0.135329,...,-0.914599,1.093012,-0.875847,-0.914548,-0.033937,-0.373741,-0.198326,1.581158,-1.336672,Y


In [85]:
df_scaled['PAYMENT_DELAY_FLAG'].isna().sum()

2992

# Model Training for Payment Delay Prediction

In [47]:
# Separate features and target variable
X = df_scaled.drop(columns=['PAYMENT_DELAY_FLAG'])
y = df_scaled['PAYMENT_DELAY_FLAG']

In [48]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [55]:
y_train

106441      N
141237      Y
67668       Y
97177       Y
93093       N
         ... 
119879      Y
103694      Y
131932    NaN
146867      Y
121958      N
Name: PAYMENT_DELAY_FLAG, Length: 127833, dtype: object

In [52]:
#from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

ValueError: Input contains NaN

In [138]:
# Feature Selection using Recursive Feature Elimination (RFE)
model = LinearRegression()
rfe = RFE(model, n_features_to_select=10)  # Adjust the number of features to select
rfe.fit(X_train, y_train)

In [139]:
# Get the selected features
selected_features_rfe = X.columns[rfe.support_]
print(f'Selected features by RFE: {selected_features_rfe}')

Selected features by RFE: Index(['UNITPRICE', 'PR_AVG_UNIT_PRICE', 'PR_NUMBER_OF_ORDERS',
       'CAT_TOTAL_SALES_VOLUME', 'CAT_TOTAL_SALES_VALUE',
       'CAT_AVG_ORDER_QUANTITY', 'CAT_AVG_UNIT_PRICE', 'CAT_NUMBER_OF_ORDERS',
       'CAT_AVG_DELIVERY_TIME', 'CAT_AVG_INVOICE_TIME'],
      dtype='object')


In [140]:
# Feature Selection using Random Forest feature importance
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [141]:
# Get feature importances
importances = model_rf.feature_importances_
indices = np.argsort(importances)[-10:]  # Select top 10 features
selected_features_rf = X.columns[indices]
print(f'Selected features by Random Forest: {selected_features_rf}')

Selected features by Random Forest: Index(['WEEKEND', 'PR_AVG_INVOICE_TIME', 'PR_AVG_DELIVERY_TIME', 'YEAR',
       'MONTH', 'DELIVERY_DELAY', 'WEEK', 'DAY_OF_THE_MONTH', 'ORDERQUANTITY',
       'ORDERVALUE'],
      dtype='object')


In [142]:
# Combine selected features from both methods (optional)
selected_features = list(set(selected_features_rfe) | set(selected_features_rf))
print(f'Combined selected features: {selected_features}')

Combined selected features: ['ORDERVALUE', 'UNITPRICE', 'CAT_AVG_INVOICE_TIME', 'PR_AVG_DELIVERY_TIME', 'WEEKEND', 'PR_NUMBER_OF_ORDERS', 'CAT_AVG_ORDER_QUANTITY', 'MONTH', 'WEEK', 'CAT_AVG_UNIT_PRICE', 'DELIVERY_DELAY', 'YEAR', 'CAT_TOTAL_SALES_VOLUME', 'DAY_OF_THE_MONTH', 'CAT_NUMBER_OF_ORDERS', 'PR_AVG_INVOICE_TIME', 'CAT_AVG_DELIVERY_TIME', 'PR_AVG_UNIT_PRICE', 'ORDERQUANTITY', 'CAT_TOTAL_SALES_VALUE']


In [143]:
# Reduce the dataframe to selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

In [144]:
# Define a list of regression models to train
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Support Vector Regressor', SVR())
]

In [None]:
for name, model in models:
    pipeline = Pipeline([
        ('regressor', model)
    ])
    
    # Train the model
    pipeline.fit(X_train_selected, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_test_selected)
    
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{name} - Mean Squared Error: {mse}')
    print(f'{name} - Mean Absolute Error: {mae}')
    print(f'{name} - MAPE : {mape}')
    print(f'{name} - R2 Score : {r2}')
    
    # Cross-validation score
    #cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
    #print(f'{name} - Cross-Validation MSE: {-cv_scores.mean()}')

Linear Regression - Mean Squared Error: 0.07496596300002328
Linear Regression - Mean Absolute Error: 0.234970147449601
Linear Regression - MAPE : 50505571160638.22
Linear Regression - R2 Score : 0.0003977951881123376
Random Forest - Mean Squared Error: 0.08010892032955419
Random Forest - Mean Absolute Error: 0.2408518982954685
Random Forest - MAPE : 50410238487456.03
Random Forest - R2 Score : -0.06817881318348884
