# In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from fitter import Fitter # For conveniently fitting multiple distributions
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned supply-chain dataset from the Kaggle working directory.
df_cleaned = pd.read_csv('/kaggle/working/cleaned_supply_chain_data.csv')

# --- Distribution Fitting Prototype ---

# Variables to model, keyed by display label. Each (label, column) pair maps a
# human-readable name to the source column; missing values are dropped.
key_variables = {
    label: df_cleaned[column].dropna()
    for label, column in (
        ('Inter-Arrival Time (Days)', 'inter_arrival_time'),
        ('Order Profit Per Order', 'order_profit_per_order'),
    )
}

# Candidate scipy.stats distribution families, tried for every variable.
distribution_names = [
    'expon',
    'weibull_min',
    'lognorm',
    'pareto',
]

# Accumulates one record per (variable, distribution) for the comparison table.
fitting_results = list()

# Perform fitting for each key variable


def _goodness_of_fit(sample, dist_name, params):
    """Score a fitted scipy distribution against the sample it was fitted to.

    Args:
        sample: 1-D array of the observations used for the fit.
        dist_name: Name of a continuous distribution in ``scipy.stats``.
        params: Fitted parameters in scipy order (shape(s)..., loc, scale).

    Returns:
        Tuple ``(ks_pvalue, aic, bic)``. A metric that cannot be computed is
        returned as ``np.nan``. AIC/BIC are ``inf`` when some observation
        has zero density under the fitted model (log-likelihood of ``-inf``).
    """
    ks_pvalue = aic = bic = np.nan

    try:
        # kstest accepts the distribution by name, so a single call covers
        # every family regardless of how many shape parameters it has.
        # NOTE: the parameters were estimated from this same sample, so the
        # p-value is optimistic; use it for ranking, not as a formal test.
        _, ks_pvalue = stats.kstest(sample, dist_name, args=tuple(params))
    except Exception as exc:  # best effort: keep going, but say why
        print(f"KS test failed for {dist_name}: {exc}")

    try:
        log_likelihood = np.sum(getattr(stats, dist_name).logpdf(sample, *params))
        n_params = len(params)
        aic = 2 * n_params - 2 * log_likelihood
        bic = n_params * np.log(len(sample)) - 2 * log_likelihood
    except Exception as exc:  # best effort: keep going, but say why
        print(f"AIC/BIC computation failed for {dist_name}: {exc}")

    return ks_pvalue, aic, bic


for var_name, data_series in key_variables.items():
    print(f"\n--- Fitting Distributions for {var_name} ---")

    if var_name == 'Inter-Arrival Time (Days)':
        # Zeros mark first orders (no prior event), so they are excluded
        # before fitting the positive-support families.
        data_for_positive_dist = data_series[data_series > 0]
    else:
        # Order profit may contain negative values (losses). The series is
        # passed through unchanged: nothing filters non-positive values here,
        # and scipy's `fit` estimates a free `loc`, so positive-support
        # families are fitted as shifted distributions. Interpret those fits
        # with care; modelling gains and losses separately is an alternative.
        data_for_positive_dist = data_series

    if len(data_for_positive_dist) == 0:
        # Fitter cannot be constructed on an empty sample.
        print(f"No observations available for {var_name}; skipping.")
        continue

    f = Fitter(data_for_positive_dist, distributions=distribution_names)
    f.fit()

    # Ask for every candidate so the per-distribution lookup below cannot
    # miss a row if the candidate list ever grows beyond the default Nbest=5.
    summary = f.summary(Nbest=len(distribution_names))
    print(summary.head())
    print(summary.index)

    # Plotting the fitted PDFs
    plt.figure(figsize=(12, 7))
    f.plot_pdf(names=distribution_names, lw=2)
    plt.title(f'Distribution Fit for {var_name}')
    plt.xlabel(var_name)
    plt.ylabel('Probability Density')
    plt.legend(title='Distribution')
    plt.savefig(f'distribution_fit_{var_name.replace(" ", "_").lower()}.png')
    plt.close()

    # Prepare results for the comparison table
    sample_values = np.asarray(data_for_positive_dist)
    for dist_name in distribution_names:
        params = f.fitted_param.get(dist_name)
        if params is None:
            # This distribution has no fitted parameters; nothing to report.
            continue

        ks_pvalue, aic, bic = _goodness_of_fit(sample_values, dist_name, params)

        fitting_results.append({
            'Variable': var_name,
            'Distribution': dist_name,
            'Parameters': params,
            'KS p-value': ks_pvalue,
            'Sum of Squared Errors': summary.loc[dist_name, 'sumsquare_error'],  # From Fitter
            'AIC': aic,
            'BIC': bic,
        })

# Create the comparison table DataFrame
comparison_df = pd.DataFrame(fitting_results)

if comparison_df.empty:
    # With no records the frame has no columns, so sorting by column name
    # would raise KeyError; report the situation instead.
    print("\nNo distributions were fitted; the comparison table is empty.")
else:
    # Sort by Sum of Squared Errors to show the best fit first per variable
    comparison_df = comparison_df.sort_values(by=['Variable', 'Sum of Squared Errors'])

    print("\n--- Distribution Fitting Comparison Table ---")
    try:
        print(comparison_df.to_markdown(index=False))
    except ImportError:
        # to_markdown needs the optional `tabulate` package; fall back to
        # plain-text rendering rather than aborting before the CSV is saved.
        print(comparison_df.to_string(index=False))

# Save the comparison table to a CSV file
comparison_output_path = 'distribution_fitting_comparison.csv'
comparison_df.to_csv(comparison_output_path, index=False)
print(f"\nDistribution fitting comparison table saved to {comparison_output_path}")
