In [5]:
import pandas as pd
import math
import plotly.express as px
from scipy.stats import levene, bartlett, f_oneway, norm
import scikit_posthocs as sp

# Read the Superstore sales workbook into a DataFrame
file_path = 'C:\\Users\\loydt\\Downloads\\Projects\\Superstore Sales Dataset.xlsx'
data = pd.read_excel(file_path)

# Restrict the analysis to the five states listed below
states_of_interest = ['Washington', 'California', 'New York', 'Florida', 'Pennsylvania']
in_selected_state = data['State'].isin(states_of_interest)
state_data = data[in_selected_state]

# Columns carried forward. The .copy() yields an independent frame, so adding
# 'log_sales' below does not raise SettingWithCopyWarning.
analysis_columns = [
    'Segment', 'State', 'City', 'Region', 'Ship Mode', 'Order Date',
    'Category', 'Sub-Category', 'Product Name', 'Sales',
]
high_sales_states = state_data[analysis_columns].copy()


def _log_or_none(amount):
    """Return the natural log of a positive amount, otherwise None."""
    if amount > 0:
        return math.log(amount)
    return None


# Natural log of Sales; non-positive values have no logarithm and become missing
high_sales_states['log_sales'] = high_sales_states['Sales'].apply(_log_or_none)

# Tell the user if any sales value could not be log-transformed
if high_sales_states['log_sales'].isnull().any():
    print("Warning: There are non-positive sales values that have been transformed to NaN.")

# Two-column frame for the post-hoc test; rows with a missing value are dropped
dunn_data = high_sales_states[['log_sales', 'Category']].dropna()

# Conduct Dunn's test on log sales across product categories, using
# Bonferroni-adjusted p-values. The result is a category-by-category table
# of adjusted p-values.
dunn_results = sp.posthoc_dunn(dunn_data, val_col='log_sales', group_col='Category', p_adjust='bonferroni')

# Reshape to long form: one row per ordered (group1, group2) pair.
# Both orderings of each pair and the self-pairs are kept.
dunn_results_df = dunn_results.stack().reset_index()
dunn_results_df.columns = ['group1', 'group2', 'p-adj']

# Mean log-sales difference (group1 - group2). The per-category means are
# computed once and looked up per row, rather than re-filtering dunn_data
# twice for every pair.
category_means = dunn_data.groupby('Category')['log_sales'].mean()
dunn_results_df['meandiff'] = (
    dunn_results_df['group1'].map(category_means)
    - dunn_results_df['group2'].map(category_means)
)

# Flag pairs whose adjusted p-value is below the 0.05 significance level
dunn_results_df['reject'] = dunn_results_df['p-adj'] < 0.05

# Calculate 95% confidence intervals for the difference in mean log sales
# (group1 - group2) of every pair in dunn_results_df.
#
# NOTE: these are normal-approximation intervals built from each group's own
# standard error (std / sqrt(n)). They are computed independently of the Dunn
# p-values, so an interval can exclude zero while 'reject' is False.
# Self-pair rows (group1 == group2) still receive a non-zero-width interval.
#
# Per-category mean and standard error are computed once here instead of
# re-filtering dunn_data for every row.
category_stats = dunn_data.groupby('Category')['log_sales'].agg(['mean', 'sem'])

# Z-score for a two-sided 95% interval (about 1.96); it is the same for all pairs
z_score = norm.ppf(0.975)

first_group = dunn_results_df['group1']
second_group = dunn_results_df['group2']

# Difference of the group means for each pair
pair_meandiff = (
    first_group.map(category_stats['mean'])
    - second_group.map(category_stats['mean'])
)

# Standard error of the difference: sqrt(se1^2 + se2^2)
pair_se = (
    first_group.map(category_stats['sem']) ** 2
    + second_group.map(category_stats['sem']) ** 2
) ** 0.5
margin = z_score * pair_se

# Store the interval bounds as new columns
dunn_results_df['lower'] = pair_meandiff - margin
dunn_results_df['upper'] = pair_meandiff + margin

# Display the results
print(dunn_results_df)


            group1           group2          p-adj  meandiff  reject  \
0        Furniture        Furniture   1.000000e+00  0.000000   False   
1        Furniture  Office Supplies  1.534319e-138  1.481696    True   
2        Furniture       Technology   5.207264e-02 -0.201886   False   
3  Office Supplies        Furniture  1.534319e-138 -1.481696    True   
4  Office Supplies  Office Supplies   1.000000e+00  0.000000   False   
5  Office Supplies       Technology  2.001982e-154 -1.683583    True   
6       Technology        Furniture   5.207264e-02  0.201886   False   
7       Technology  Office Supplies  2.001982e-154  1.683583    True   
8       Technology       Technology   1.000000e+00  0.000000   False   

      lower     upper  
0 -0.127258  0.127258  
1  1.376760  1.586633  
2 -0.332975 -0.070797  
3 -1.586633 -1.376760  
4 -0.076346  0.076346  
5 -1.793134 -1.574032  
6  0.070797  0.332975  
7  1.574032  1.793134  
8 -0.134812  0.134812  
