In [2]:
import pandas as pd
import math
import plotly.express as px
from scipy.stats import levene, bartlett, f_oneway, norm
import scikit_posthocs as sp

# Read the Superstore sales workbook into a DataFrame.
file_path = 'C:\\Users\\loydt\\Downloads\\Projects\\Superstore Sales Dataset.xlsx'
data = pd.read_excel(file_path)

# Restrict the data to the five states chosen for this analysis
# (treated here as the high-sales states).
states_of_interest = [
    'Washington',
    'California',
    'New York',
    'Florida',
    'Pennsylvania',
]
in_selected_state = data['State'].isin(states_of_interest)
state_data = data[in_selected_state]

# Columns carried forward into the analysis. The explicit .copy() yields an
# independent frame, so adding columns to it later does not trigger
# SettingWithCopyWarning.
analysis_columns = [
    'Segment',
    'State',
    'City',
    'Region',
    'Ship Mode',
    'Order Date',
    'Category',
    'Sub-Category',
    'Product Name',
    'Sales',
]
high_sales_states = state_data[analysis_columns].copy()

def _log_or_none(sale):
    """Return the natural log of *sale*, or None when it is not greater than 0."""
    if sale > 0:
        return math.log(sale)
    return None


# Add a log-transformed copy of Sales; values that are not positive have no
# logarithm and end up as missing entries instead of raising.
high_sales_states['log_sales'] = high_sales_states['Sales'].apply(_log_or_none)

# Tell the user when the transformation produced any missing values.
has_missing_log = high_sales_states['log_sales'].isna().any()
if has_missing_log:
    print("Warning: There are non-positive sales values that have been transformed to NaN.")

# Input for Dunn's test: the value column and the grouping column only,
# with every row that has a missing entry removed.
dunn_columns = ['log_sales', 'Category']
dunn_data = high_sales_states[dunn_columns].dropna()

# Run Dunn's post-hoc test on log_sales across the Category groups, with
# Bonferroni-adjusted p-values.
dunn_results = sp.posthoc_dunn(
    dunn_data,
    val_col='log_sales',
    group_col='Category',
    p_adjust='bonferroni',
)

# Reshape the pairwise p-value table into long form: one row per
# (group1, group2) combination. Because the whole table is stacked, each pair
# shows up in both orders and every group is also paired with itself.
stacked_p_values = dunn_results.stack()
dunn_results_df = stacked_p_values.reset_index()
dunn_results_df.columns = ['group1', 'group2', 'p-adj']

# Flag the comparisons whose adjusted p-value is below the 0.05 threshold.
significance_level = 0.05
dunn_results_df['reject'] = dunn_results_df['p-adj'].lt(significance_level)

# Show the long-form results table.
print(dunn_results_df)

            group1           group2          p-adj  reject
0        Furniture        Furniture   1.000000e+00   False
1        Furniture  Office Supplies  1.534319e-138    True
2        Furniture       Technology   5.207264e-02   False
3  Office Supplies        Furniture  1.534319e-138    True
4  Office Supplies  Office Supplies   1.000000e+00   False
5  Office Supplies       Technology  2.001982e-154    True
6       Technology        Furniture   5.207264e-02   False
7       Technology  Office Supplies  2.001982e-154    True
8       Technology       Technology   1.000000e+00   False
