# Use O2C_Template Customized Notebook Template

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
#from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import classification_report, f1_score, r2_score, roc_auc_score

In [2]:
from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()

In [11]:
table_name = 'ORDER_TO_CASH_ENRICHED'

sf_df = my_session.sql("select * from {}".format(table_name))
df = sf_df.to_pandas()

In [12]:
df_train = df[df['INVOICESTATUS'].isin(['LATE_PAYMENT','ONTIME_PAYMENT'])]
df_test = df[~df['INVOICESTATUS'].isin(['LATE_PAYMENT','ONTIME_PAYMENT'])]

In [13]:
df_train.shape, df_test.shape

((170445, 79), (2992, 79))

In [14]:
df_final = df.copy()
df = df_train.copy()

In [15]:
df_final.shape, df.shape

((173437, 79), (170445, 79))

In [16]:
# Assuming df is your dataframe
df['ORDERDATE'] = pd.to_datetime(df['ORDERDATE'], errors='coerce')
df['DELIVERYDATE'] = pd.to_datetime(df['DELIVERYDATE'], errors='coerce')
df['INVOICEDATE'] = pd.to_datetime(df['INVOICEDATE'], errors='coerce')
df['PAYMENTDATE'] = pd.to_datetime(df['PAYMENTDATE'], errors='coerce')
df['INVOICEDUEDATE'] = pd.to_datetime(df['INVOICEDUEDATE'], errors='coerce')
df['DELIVEREDON'] = pd.to_datetime(df['DELIVEREDON'], errors='coerce')

In [17]:
df['MONTH'] = pd.to_datetime(df['ORDERDATE']).dt.month
df['YEAR'] = pd.to_datetime(df['ORDERDATE']).dt.year
df['WEEK'] = pd.to_datetime(df['ORDERDATE']).dt.isocalendar().week
df['WEEKEND'] = pd.to_datetime(df['ORDERDATE']).dt.weekday >= 5  # True if the day is Saturday or Sunday
df['DAY_OF_THE_MONTH'] = pd.to_datetime(df['ORDERDATE']).dt.day

In [20]:
# Delivery Delay
df['DELIVERY_DELAY'] = (df['DELIVEREDON'] - df['DELIVERYDATE']).dt.days

# Payment Delay
df['PAYMENT_DELAY'] = (df['PAYMENTDATE'] - df['INVOICEDUEDATE']).dt.days

In [22]:
df[['PAYMENTDATE','INVOICEDUEDATE','PAYMENT_DELAY','DELIVEREDON','DELIVERYDATE','DELIVERY_DELAY']].head()

Unnamed: 0,PAYMENTDATE,INVOICEDUEDATE,PAYMENT_DELAY,DELIVEREDON,DELIVERYDATE,DELIVERY_DELAY
0,2022-10-11,2022-09-28,13,2022-09-02,2022-08-28,5
1,2022-09-07,2022-09-16,-9,2022-08-22,2022-08-20,2
2,2022-06-09,2022-06-09,0,2022-05-16,2022-05-14,2
3,2022-06-09,2022-06-09,0,2022-05-16,2022-05-14,2
4,2024-03-25,2024-03-13,12,2024-02-12,2024-02-10,2


In [23]:
def set_flag(delay):
    return 'Y' if delay >= 1 else 'N'

In [25]:
df['PAYMENT_DELAY_FLAG'] = df['PAYMENT_DELAY'].apply(set_flag)
df['DELIVERY_DELAY_FLAG'] = df['DELIVERY_DELAY'].apply(set_flag)

In [29]:
print(list(df.columns))

['ORDERID', 'ORDERITEMID', 'PRODUCTID', 'PRODUCTNAME', 'ORDERQUANTITY', 'UNITPRICE', 'ORDERVALUE', 'CUSTOMERID', 'CUSTOMERNAME', 'PRODUCTCATEGORY', 'ORDERDATE', 'ORDERSTATUS', 'ORDERAMOUNT', 'ORDERCOUNT', 'SHIPMENTID', 'SHIPMENTDATE', 'DELIVERYDATE', 'DELIVEREDON', 'CARRIER', 'SHIPMENTSTATUS', 'INVOICEID', 'INVOICEDATE', 'INVOICEDUEDATE', 'PAYMENTDATE', 'INVOICESTATUS', 'PAYMENTMETHOD', 'PRODUCTTYPE', 'QUANTITY', 'COMPANYTYPE', 'CONTACTDETAILS', 'EMAILDETAILS', 'ADDRESSDETAILS', 'ADMINDETAILS', 'CREDITLIMIT', 'CUSTOMERSINCE', 'PAYMENTTERMS', 'CREDITLIMITTYPE', 'CUSTOMERTYPE', 'SUPPLIERNAME', 'SUPPLIERID', 'CC_CUSTOMER_LIFETIME_VALUE', 'CC_ORDER_FREQUENCY', 'CC_AVERAGE_ORDER_VALUE', 'CC_RECENCY', 'CC_PREFERRED_PAYMENT_METHOD', 'CC_PREFERRED_PRODUCT_CATEGORY', 'CC_PREFERRED_PRODUCT_TYPE', 'CC_AVG_ORDER_PROCESSING_TIME', 'CC_AVG_INVOICE_PROCESSING_TIME', 'CC_AVG_DELIVERY_DELAY', 'CC_AVG_PAYMENT_DELAY', 'CC_TOTAL_ORDERS', 'CC_TOTAL_DELAYS', 'CC_ORDER_CONSISTENCY', 'CC_INVOICE_CONSISTENCY',

In [36]:
cols = ['ORDERQUANTITY','ORDERAMOUNT','ORDERCOUNT',
'CC_CUSTOMER_LIFETIME_VALUE', 'CC_ORDER_FREQUENCY', 'CC_AVERAGE_ORDER_VALUE', 'CC_RECENCY', 'CC_AVG_ORDER_PROCESSING_TIME', 'CC_AVG_INVOICE_PROCESSING_TIME', 
'CC_AVG_DELIVERY_DELAY', 'CC_AVG_PAYMENT_DELAY', 'CC_TOTAL_ORDERS', 'CC_TOTAL_DELAYS', 'CC_ORDER_CONSISTENCY', 'CC_INVOICE_CONSISTENCY', 
'CC_DELIVERY_CONSISTENCY', 'CC_PAYMENT_CONSISTENCY', 'PR_TOTAL_SALES_VOLUME', 'PR_TOTAL_SALES_VALUE', 'PR_AVG_ORDER_QUANTITY', 
'PR_AVG_UNIT_PRICE', 'PR_NUMBER_OF_ORDERS', 'PR_AVG_DELIVERY_TIME', 'PR_AVG_INVOICE_TIME', 'CAT_TOTAL_SALES_VOLUME', 'CAT_TOTAL_SALES_VALUE', 
'CAT_AVG_ORDER_QUANTITY', 'CAT_AVG_UNIT_PRICE', 'CAT_NUMBER_OF_ORDERS', 'CAT_AVG_DELIVERY_TIME', 'CAT_AVG_INVOICE_TIME', 
'SP_CUSTOMER_LIFETIME_VALUE', 'SP_ORDER_FREQUENCY', 'SP_AVERAGE_ORDER_VALUE', 'SP_AVG_ORDER_PROCESSING_TIME', 'SP_AVG_DELIVERY_DELAY', 
'SP_TOTAL_ORDERS', 'SP_ORDER_CONSISTENCY', 'SP_DELIVERY_CONSISTENCY', 'MONTH', 'YEAR', 'WEEK', 'WEEKEND', 'DAY_OF_THE_MONTH', 'PAYMENT_DELAY_FLAG']

In [37]:
df_payments = df[cols]

In [38]:
df_payments.info()

<class 'pandas.core.frame.DataFrame'>
Index: 170445 entries, 0 to 173436
Data columns (total 45 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   ORDERQUANTITY                   170445 non-null  int16  
 1   ORDERAMOUNT                     170445 non-null  int32  
 2   ORDERCOUNT                      170445 non-null  int8   
 3   CC_CUSTOMER_LIFETIME_VALUE      170445 non-null  int32  
 4   CC_ORDER_FREQUENCY              170445 non-null  int16  
 5   CC_AVERAGE_ORDER_VALUE          170445 non-null  float64
 6   CC_RECENCY                      170445 non-null  int8   
 7   CC_AVG_ORDER_PROCESSING_TIME    170445 non-null  float64
 8   CC_AVG_INVOICE_PROCESSING_TIME  170445 non-null  float64
 9   CC_AVG_DELIVERY_DELAY           170445 non-null  float64
 10  CC_AVG_PAYMENT_DELAY            170445 non-null  float64
 11  CC_TOTAL_ORDERS                 170445 non-null  int16  
 12  CC_TOTAL_DELAYS      

In [39]:
df_payments.tail()

Unnamed: 0,ORDERQUANTITY,ORDERAMOUNT,ORDERCOUNT,CC_CUSTOMER_LIFETIME_VALUE,CC_ORDER_FREQUENCY,CC_AVERAGE_ORDER_VALUE,CC_RECENCY,CC_AVG_ORDER_PROCESSING_TIME,CC_AVG_INVOICE_PROCESSING_TIME,CC_AVG_DELIVERY_DELAY,...,SP_AVG_DELIVERY_DELAY,SP_TOTAL_ORDERS,SP_ORDER_CONSISTENCY,SP_DELIVERY_CONSISTENCY,MONTH,YEAR,WEEK,WEEKEND,DAY_OF_THE_MONTH,PAYMENT_DELAY_FLAG
173432,88,6776,1,64033712,538,59180.879852,-80,10.062847,28.409427,3.050832,...,2.998765,29950,1.706115,1.41788,11,2022,47,False,21,N
173433,4012,37084,2,82160478,564,71073.077855,-89,14.938581,26.097751,3.037197,...,3.018392,32025,1.718133,1.415584,2,2022,8,False,23,N
173434,90,37084,2,82160478,564,71073.077855,-89,14.938581,26.097751,3.037197,...,3.018392,32025,1.718133,1.415584,2,2022,8,False,23,N
173435,279,3892,3,15562685,594,12958.105745,-79,4.513739,14.835137,1.502082,...,1.494201,75789,0.818478,0.49997,7,2022,29,False,19,N
173436,299,3892,3,15562685,594,12958.105745,-79,4.513739,14.835137,1.502082,...,1.494201,75789,0.818478,0.49997,7,2022,29,False,19,N


In [40]:
from sklearn.preprocessing import StandardScaler

# Standardize the predictor variables
scaler = StandardScaler()

X_scaled = scaler.fit_transform(df_payments.drop(columns=['PAYMENT_DELAY_FLAG']))

df_scaled = pd.DataFrame(X_scaled, columns=df.drop(columns=['PAYMENT_DELAY_FLAG']).columns)
df_scaled['DELIVERY_DELAY'] = df_payments['DELIVERY_DELAY']
df_scaled['PAYMENT_DELAY'] = df_payments['PAYMENT_DELAY']

ValueError: Shape of passed values is (170445, 44), indices imply (170445, 87)

In [127]:
df_scaled.head()

Unnamed: 0,ORDERQUANTITY,UNITPRICE,ORDERVALUE,CC_CUSTOMER_LIFETIME_VALUE,CC_ORDER_FREQUENCY,CC_AVERAGE_ORDER_VALUE,CC_RECENCY,CC_AVG_ORDER_PROCESSING_TIME,CC_AVG_INVOICE_PROCESSING_TIME,CC_AVG_DELIVERY_DELAY,...,CAT_NUMBER_OF_ORDERS,CAT_AVG_DELIVERY_TIME,CAT_AVG_INVOICE_TIME,MONTH,YEAR,WEEK,WEEKEND,DAY_OF_THE_MONTH,DELIVERY_DELAY,PAYMENT_DELAY
0,0.768522,-0.249351,-0.245039,-0.38159,1.139589,-0.387349,-0.662097,-1.432849,0.87199,-0.685103,...,1.161755,-0.013016,-0.316488,1.30994,-1.590644,1.243291,-0.632181,-0.536187,-3.0,0.0
1,0.622408,-0.249991,-0.273455,-0.38159,1.139589,-0.387349,-0.662097,-1.432849,0.87199,-0.685103,...,1.161755,-0.013016,-0.316488,1.30994,-1.590644,1.243291,-0.632181,-0.536187,-3.0,0.0
2,-0.546511,-0.25031,-0.300408,-0.38159,1.139589,-0.387349,-0.662097,-1.432849,0.87199,-0.685103,...,1.161755,-0.013016,-0.316488,1.30994,-1.590644,1.243291,-0.632181,-0.536187,-3.0,0.0
3,2.930659,-0.25047,-0.269707,-0.146407,1.506251,-0.165817,0.797586,-0.091813,1.026887,0.813034,...,-1.013455,0.146568,1.419836,-1.069112,1.207321,-1.221954,1.581826,-1.442385,-2.0,-5.0
4,1.439322,-0.249671,-0.23661,-0.146407,1.506251,-0.165817,0.797586,-0.091813,1.026887,0.813034,...,-1.013455,0.146568,1.419836,-1.069112,1.207321,-1.221954,1.581826,-1.442385,-2.0,-5.0


In [128]:
df_scaled.describe()

Unnamed: 0,ORDERQUANTITY,UNITPRICE,ORDERVALUE,CC_CUSTOMER_LIFETIME_VALUE,CC_ORDER_FREQUENCY,CC_AVERAGE_ORDER_VALUE,CC_RECENCY,CC_AVG_ORDER_PROCESSING_TIME,CC_AVG_INVOICE_PROCESSING_TIME,CC_AVG_DELIVERY_DELAY,...,CAT_NUMBER_OF_ORDERS,CAT_AVG_DELIVERY_TIME,CAT_AVG_INVOICE_TIME,MONTH,YEAR,WEEK,WEEKEND,DAY_OF_THE_MONTH,DELIVERY_DELAY,PAYMENT_DELAY
count,173291.0,173291.0,173291.0,173291.0,173291.0,173291.0,173291.0,173291.0,173291.0,173291.0,...,173291.0,173291.0,173291.0,173291.0,173291.0,173291.0,173291.0,173291.0,171043.0,171043.0
mean,-5.742451e-17,5.9864180000000004e-18,-1.927135e-17,4.879341e-18,-5.762748e-16,5.777304000000001e-17,1.142176e-15,-1.544578e-15,6.882618e-15,-6.694948e-15,...,1.670047e-16,4.892068e-14,-1.892356e-14,7.491223000000001e-17,-1.38978e-13,-7.483023000000001e-17,-1.828728e-17,-5.9864180000000004e-18,-1.499711,1.983688
std,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,...,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.116606,4.397062
min,-0.6364742,-0.2504704,-0.3016176,-0.3901331,-2.343694,-0.3919639,-0.6620972,-2.633525,-3.28241,-2.439818,...,-1.432705,-1.421744,-1.967177,-1.663875,-1.590644,-1.769786,-0.6321809,-1.668935,-3.0,-6.0
25%,-0.5585866,-0.2496708,-0.2872826,-0.3462952,-0.7303838,-0.3466685,-0.6620972,-0.7403324,-0.6361801,-0.7646495,...,-1.119262,-0.3950523,-0.3720938,-0.7717303,-0.1916616,-0.8795586,-0.6321809,-0.8760112,-2.0,-2.0
50%,-0.4022075,-0.247432,-0.2554716,-0.3069064,0.002938945,-0.3072592,-0.6620972,0.0068897,0.05556234,-0.002251823,...,0.05893536,-0.01301581,-0.3164883,0.120414,-0.1916616,0.07914777,-0.6321809,0.03018704,-2.0,2.0
75%,-0.04537341,-0.2189672,-0.1699055,-0.2077363,0.6262633,-0.20535,0.06774444,0.6904922,0.7417703,0.7019169,...,0.7966014,0.2967571,1.000842,0.7151768,1.207321,0.763938,1.581826,0.8231105,-1.0,6.0
max,5.400724,5.985888,10.23092,3.860401,2.972896,3.526119,5.906477,2.741224,3.447941,2.301595,...,1.161755,1.787299,1.755793,1.607321,1.207321,1.722644,1.581826,1.729309,0.0,10.0


In [129]:
y_target = np.array(df_scaled['PAYMENT_DELAY']).reshape(-1,1)

In [130]:
y_train_df = np.array(y_train).reshape(-1,1)
y_test_df = np.array(y_test).reshape(-1,1)

In [131]:
# Target normalization
scaler_y = MinMaxScaler()
y_target_scaled = scaler_y.fit_transform(y_target)

In [132]:
df_scaled['PAYMENT_DELAY'] = y_target_scaled

# Model Training for Payment Delay Prediction

In [133]:
df_scaled.shape

(173291, 38)

In [134]:
df_scaled.dropna(inplace=True)

In [135]:
df_scaled.shape

(171043, 38)

In [136]:
# Separate features and target variable
X = df_scaled.drop(columns=['PAYMENT_DELAY'])
y = df_scaled['PAYMENT_DELAY']

In [137]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [138]:
# Feature Selection using Recursive Feature Elimination (RFE)
model = LinearRegression()
rfe = RFE(model, n_features_to_select=10)  # Adjust the number of features to select
rfe.fit(X_train, y_train)

In [139]:
# Get the selected features
selected_features_rfe = X.columns[rfe.support_]
print(f'Selected features by RFE: {selected_features_rfe}')

Selected features by RFE: Index(['UNITPRICE', 'PR_AVG_UNIT_PRICE', 'PR_NUMBER_OF_ORDERS',
       'CAT_TOTAL_SALES_VOLUME', 'CAT_TOTAL_SALES_VALUE',
       'CAT_AVG_ORDER_QUANTITY', 'CAT_AVG_UNIT_PRICE', 'CAT_NUMBER_OF_ORDERS',
       'CAT_AVG_DELIVERY_TIME', 'CAT_AVG_INVOICE_TIME'],
      dtype='object')


In [140]:
# Feature Selection using Random Forest feature importance
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [141]:
# Get feature importances
importances = model_rf.feature_importances_
indices = np.argsort(importances)[-10:]  # Select top 10 features
selected_features_rf = X.columns[indices]
print(f'Selected features by Random Forest: {selected_features_rf}')

Selected features by Random Forest: Index(['WEEKEND', 'PR_AVG_INVOICE_TIME', 'PR_AVG_DELIVERY_TIME', 'YEAR',
       'MONTH', 'DELIVERY_DELAY', 'WEEK', 'DAY_OF_THE_MONTH', 'ORDERQUANTITY',
       'ORDERVALUE'],
      dtype='object')


In [142]:
# Combine selected features from both methods (optional)
selected_features = list(set(selected_features_rfe) | set(selected_features_rf))
print(f'Combined selected features: {selected_features}')

Combined selected features: ['ORDERVALUE', 'UNITPRICE', 'CAT_AVG_INVOICE_TIME', 'PR_AVG_DELIVERY_TIME', 'WEEKEND', 'PR_NUMBER_OF_ORDERS', 'CAT_AVG_ORDER_QUANTITY', 'MONTH', 'WEEK', 'CAT_AVG_UNIT_PRICE', 'DELIVERY_DELAY', 'YEAR', 'CAT_TOTAL_SALES_VOLUME', 'DAY_OF_THE_MONTH', 'CAT_NUMBER_OF_ORDERS', 'PR_AVG_INVOICE_TIME', 'CAT_AVG_DELIVERY_TIME', 'PR_AVG_UNIT_PRICE', 'ORDERQUANTITY', 'CAT_TOTAL_SALES_VALUE']


In [143]:
# Reduce the dataframe to selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

In [144]:
# Define a list of regression models to train
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Support Vector Regressor', SVR())
]

In [None]:
for name, model in models:
    pipeline = Pipeline([
        ('regressor', model)
    ])
    
    # Train the model
    pipeline.fit(X_train_selected, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_test_selected)
    
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{name} - Mean Squared Error: {mse}')
    print(f'{name} - Mean Absolute Error: {mae}')
    print(f'{name} - MAPE : {mape}')
    print(f'{name} - R2 Score : {r2}')
    
    # Cross-validation score
    #cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
    #print(f'{name} - Cross-Validation MSE: {-cv_scores.mean()}')

Linear Regression - Mean Squared Error: 0.07496596300002328
Linear Regression - Mean Absolute Error: 0.234970147449601
Linear Regression - MAPE : 50505571160638.22
Linear Regression - R2 Score : 0.0003977951881123376
Random Forest - Mean Squared Error: 0.08010892032955419
Random Forest - Mean Absolute Error: 0.2408518982954685
Random Forest - MAPE : 50410238487456.03
Random Forest - R2 Score : -0.06817881318348884
