# Use O2C_Template Customized Notebook Template

In [35]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error

In [36]:
from fosforml.model_manager.snowflakesession import get_session
my_session = get_session()

In [37]:
table_name = 'ORDER_TO_CASH_ENRICHED'

sf_df = my_session.sql("select * from {}".format(table_name))
df_final = sf_df.to_pandas()

In [38]:
df_train = df_final[df_final['INVOICESTATUS'] == 'Paid']
df_new = df_final[df_final['INVOICESTATUS'] != 'Paid']

In [39]:
df_train.shape, df_new.shape

((170290, 68), (3001, 68))

In [40]:
df = df_final.copy()

In [41]:
# Assuming df is your dataframe
df['ORDERDATE'] = pd.to_datetime(df['ORDERDATE'], errors='coerce')
df['DELIVERYDATE'] = pd.to_datetime(df['DELIVERYDATE'], errors='coerce')
df['INVOICEDATE'] = pd.to_datetime(df['INVOICEDATE'], errors='coerce')
df['PAYMENTDATE'] = pd.to_datetime(df['PAYMENTDATE'], errors='coerce')
df['INVOICEDUEDATE'] = pd.to_datetime(df['INVOICEDUEDATE'], errors='coerce')
df['DELIVEREDON'] = pd.to_datetime(df['DELIVEREDON'], errors='coerce')

In [42]:
df['MONTH'] = pd.to_datetime(df['ORDERDATE']).dt.month
df['YEAR'] = pd.to_datetime(df['ORDERDATE']).dt.year
df['WEEK'] = pd.to_datetime(df['ORDERDATE']).dt.isocalendar().week
df['WEEKEND'] = pd.to_datetime(df['ORDERDATE']).dt.weekday >= 5  # True if the day is Saturday or Sunday
df['DAY_OF_THE_MONTH'] = pd.to_datetime(df['ORDERDATE']).dt.day

In [43]:
# Delivery Delay
df['DELIVERY_DELAY'] = (df['DELIVERYDATE'] - df['DELIVEREDON']).dt.days

# Payment Delay
df['PAYMENT_DELAY'] = (df['PAYMENTDATE'] - df['INVOICEDUEDATE']).dt.days

TypeError: cannot subtract DatetimeArray from ndarray

In [None]:
df.columns

In [None]:
cols = ['ORDERQUANTITY','UNITPRICE','ORDERVALUE'
,'CC_CUSTOMER_LIFETIME_VALUE','CC_ORDER_FREQUENCY', 'CC_AVERAGE_ORDER_VALUE', 'CC_RECENCY','CC_AVG_ORDER_PROCESSING_TIME'
,'CC_AVG_INVOICE_PROCESSING_TIME', 'CC_AVG_DELIVERY_DELAY','CC_AVG_PAYMENT_DELAY', 'CC_TOTAL_ORDERS', 'CC_TOTAL_DELAYS'
,'CC_ORDER_CONSISTENCY', 'CC_INVOICE_CONSISTENCY','CC_DELIVERY_CONSISTENCY', 'CC_PAYMENT_CONSISTENCY'
,'PR_TOTAL_SALES_VOLUME', 'PR_TOTAL_SALES_VALUE','PR_AVG_ORDER_QUANTITY', 'PR_AVG_UNIT_PRICE', 'PR_NUMBER_OF_ORDERS'
,'PR_AVG_DELIVERY_TIME', 'PR_AVG_INVOICE_TIME', 'CAT_TOTAL_SALES_VOLUME','CAT_TOTAL_SALES_VALUE', 'CAT_AVG_ORDER_QUANTITY', 'CAT_AVG_UNIT_PRICE'
,'CAT_NUMBER_OF_ORDERS', 'CAT_AVG_DELIVERY_TIME','CAT_AVG_INVOICE_TIME','DELIVERY_DELAY','PAYMENT_DELAY'
,'MONTH', 'YEAR', 'WEEK', 'WEEKEND', 'DAY_OF_THE_MONTH']

In [None]:
df = df[cols]

In [None]:
df.info()

In [None]:
df.tail()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the predictor variables
scaler = StandardScaler()

X_scaled = scaler.fit_transform(df.drop(columns=['DELIVERY_DELAY', 'PAYMENT_DELAY']))

df_scaled = pd.DataFrame(X_scaled, columns=df.drop(columns=['DELIVERY_DELAY', 'PAYMENT_DELAY']).columns)
df_scaled['DELIVERY_DELAY'] = df['DELIVERY_DELAY']
df_scaled['PAYMENT_DELAY'] = df['PAYMENT_DELAY']

In [None]:
df_scaled.head()

In [None]:
df_scaled.describe()

# Model Training for Payment Delay Prediction

In [None]:
df_scaled.shape

In [None]:
df_scaled.dropna(inplace=True)

In [None]:
df_scaled.shape

In [None]:
# Separate features and target variable
X = df_scaled.drop(columns=['PAYMENT_DELAY'])
y = df_scaled['PAYMENT_DELAY']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Feature Selection using Recursive Feature Elimination (RFE)
model = LinearRegression()
rfe = RFE(model, n_features_to_select=10)  # Adjust the number of features to select
rfe.fit(X_train, y_train)

In [None]:
# Get the selected features
selected_features_rfe = X.columns[rfe.support_]
print(f'Selected features by RFE: {selected_features_rfe}')

In [None]:
# Feature Selection using Random Forest feature importance
model_rf = RandomForestRegressor(n_estimators=100, random_state=42)
model_rf.fit(X_train, y_train)

In [None]:
# Get feature importances
importances = model_rf.feature_importances_
indices = np.argsort(importances)[-10:]  # Select top 10 features
selected_features_rf = X.columns[indices]
print(f'Selected features by Random Forest: {selected_features_rf}')

In [None]:
# Combine selected features from both methods (optional)
selected_features = list(set(selected_features_rfe) | set(selected_features_rf))
print(f'Combined selected features: {selected_features}')

In [None]:
# Reduce the dataframe to selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

In [None]:
# Define a list of regression models to train
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Support Vector Regressor', SVR())
]

In [None]:
for name, model in models:
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', model)
    ])
    
    # Train the model
    pipeline.fit(X_train_selected, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_train_selected)
    
    # Evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{name} - Mean Squared Error: {mse}')
    print(f'{name} - Mean Absolute Error: {mae}')
    print(f'{name} - MAPE : {mape}')
    print(f'{name} - R2 Score : {r2}')
    
    # Cross-validation score
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
    print(f'{name} - Cross-Validation MSE: {-cv_scores.mean()}')