In [3]:
import pandas as pd
import scipy.stats as stats
import itertools
from statsmodels.stats.multitest import multipletests

# Read the Superstore sales spreadsheet into a DataFrame
file_path = 'C:\\Users\\loydt\\Downloads\\Projects\\Superstore Sales Dataset.xlsx'
data = pd.read_excel(file_path)

# Parse 'Order Date' once, then store both the parsed dates and the month
# extracted from them as columns
order_dates = pd.to_datetime(data['Order Date'])
data['Order Date'] = order_dates
data['Order Month'] = order_dates.dt.month

# Chi-squared test of independence between two columns; returns the test statistics and the smoothed table
def chi_squared_test(data, variable1, variable2, smoothing=0.5):
    """Run a chi-squared test of independence between two columns of *data*.

    The two columns are cross-tabulated, ``smoothing`` is added to every cell
    of the resulting table, and the smoothed table is handed to
    ``scipy.stats.chi2_contingency``.

    Args:
        data: DataFrame containing both columns.
        variable1: Column whose values become the rows of the table.
        variable2: Column whose values become the columns of the table.
        smoothing: Constant added to every cell to avoid zero counts.

    Returns:
        Tuple ``(chi2, p, dof, contingency_table)``; the table is the
        smoothed one that was actually tested.
    """
    counts = pd.crosstab(data[variable1], data[variable2])
    # The test runs on the smoothed table, and that same table is returned
    # so callers can reuse it for follow-up comparisons.
    smoothed = counts + smoothing
    outcome = stats.chi2_contingency(smoothed)
    return outcome[0], outcome[1], outcome[2], smoothed

# Test Segment against each candidate variable and keep only the results
# whose p-value falls below 0.05
variables_to_test = ['State', 'City', 'Ship Mode', 'Order Month']
significant_results = []

for variable in variables_to_test:
    try:
        chi2, p, dof, table = chi_squared_test(data, 'Segment', variable)
    except ValueError as e:
        # A variable whose test cannot be computed is reported and skipped
        print(f"Skipping {variable} due to error: {e}")
    else:
        if p < 0.05:
            significant_results.append((variable, chi2, p, dof, table))
            print(f"{variable}: Chi-squared = {chi2}, p = {p}, DoF = {dof}")

# Post-hoc pairwise comparisons for significant results.
# For every variable kept in significant_results, each pair of rows of its
# stored contingency table is tested on its own, and the resulting p-values
# are Bonferroni-adjusted within that variable.
posthoc_results = {}

for variable, chi2, p, dof, table in significant_results:
    pairs = list(itertools.combinations(table.index, 2))
    if not pairs:
        # Fewer than two row categories: there is nothing to compare, and
        # an empty list of p-values must not be passed to multipletests.
        posthoc_results[variable] = {}
        continue

    p_values = []
    for pair in pairs:
        sub_table = table.loc[list(pair)]
        try:
            p_pair = stats.chi2_contingency(sub_table)[1]
        except ValueError as e:
            # Keep the pair in the correction family with a non-significant
            # p-value, but say so instead of hiding the failure.
            print(f"Skipping pair {pair} for {variable} due to error: {e}")
            p_pair = 1
        p_values.append(p_pair)

    # Adjust p-values using the Bonferroni correction
    corrected_p_values = multipletests(p_values, method='bonferroni')[1]
    posthoc_results[variable] = dict(zip(pairs, corrected_p_values))

# Print the adjusted p-value of every pair, grouped by tested variable
for variable in posthoc_results:
    print(f"\nPost-hoc results for {variable} with Segment:")
    adjusted = posthoc_results[variable]
    for pair in adjusted:
        p_val = adjusted[pair]
        print(f"{pair}: Adjusted p-value = {p_val}")


State: Chi-squared = 249.36663742175628, p = 1.3908748204941375e-15, DoF = 96
City: Chi-squared = 2015.006789463118, p = 1.5748980369915456e-62, DoF = 1056
Ship Mode: Chi-squared = 25.763379077286622, p = 0.0002464086024327002, DoF = 6
Order Month: Chi-squared = 63.66829835544393, p = 6.302173847865037e-06, DoF = 22

Post-hoc results for State with Segment:
('Consumer', 'Corporate'): Adjusted p-value = 1.9642241429667575e-07
('Consumer', 'Home Office'): Adjusted p-value = 7.026645491004073e-11
('Corporate', 'Home Office'): Adjusted p-value = 3.3795511931386933e-06

Post-hoc results for City with Segment:
('Consumer', 'Corporate'): Adjusted p-value = 3.94750895186823e-27
('Consumer', 'Home Office'): Adjusted p-value = 1.0096633663067795e-23
('Corporate', 'Home Office'): Adjusted p-value = 3.757551612400925e-11

Post-hoc results for Ship Mode with Segment:
('Consumer', 'Corporate'): Adjusted p-value = 0.0006284570132243212
('Consumer', 'Home Office'): Adjusted p-value = 0.850642548016505