# Industry differences statistical test

This notebook uses the Kruskal-Wallis test to check whether ESG supply chain content differs across sub-sectors.

In [1]:
import pandas as pd
import numpy as np
import pickle
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [2]:
# Load the final company-level dataframe used by every test below.
# The path is relative to the notebook's working directory.
# NOTE(review): unpickling executes arbitrary code, so this file must come from a trusted source.
company_details_path = '../Data/Variable dataframes/company_details_pivoted.pkl'
data = pd.read_pickle(company_details_path)

### Hypothesis 1: Supply chain speaking space

In [3]:
# Build one list of non-missing 'Share_SC' values per sub-sector.
# Labels are matched exactly against data['main_industry']; the order here
# fixes which list is bound to which name in the unpacking below.
industry_labels = [
    'Consumer Durables and Apparel (Primary)',
    'Consumer Services (Primary)',
    'Retailing (Primary)',
    'Food and Staples Retailing (Primary)',
    'Automobiles and Components (Primary)',
    'Food, Beverage and Tobacco (Primary)',
    'Household and Personal Products (Primary)',
]

(cons_dur_apparel, cons_services, retail, food_staples_retail,
 auto, food_bev_tob, household_personal) = [
    data.loc[
        (data['main_industry'] == label) & (data['Share_SC'].notna()),
        'Share_SC',
    ].to_list()
    for label in industry_labels
]

In [4]:
# Omnibus Kruskal-Wallis test over all seven sub-sector lists at once.
# The result is the cell's last expression, so it is shown as the cell output.
omnibus_groups = (
    cons_dur_apparel,
    cons_services,
    retail,
    food_staples_retail,
    auto,
    food_bev_tob,
    household_personal,
)
stats.kruskal(*omnibus_groups)

KruskalResult(statistic=129.04323067349682, pvalue=2.043896223948096e-25)

In [5]:
# Pairwise follow-up to the omnibus test: run a two-sample Kruskal-Wallis test
# for every ordered pair of sub-sectors (including each group against itself)
# and tabulate the result as "statistic (p-value)".
# NOTE(review): the p-values are reported unadjusted although 21 distinct
# pairs are tested; consider a multiple-comparison correction before
# interpreting borderline cells.

ind_lists = [cons_dur_apparel, cons_services, retail,
             food_staples_retail, auto, food_bev_tob, household_personal]
names = ['Consumer Durables and Apparel','Consumer Services','Retailing',
         'Food and Staples Retailing','Automobiles and Components',
         'Food, Beverage and Tobacco','Household and Personal Products']

kruskal_dict = dict()

for ind1, name1 in zip(ind_lists, names):
    kruskal_dict[name1] = dict()
    for ind2, name2 in zip(ind_lists, names):
        stat, p = stats.kruskal(ind1, ind2)
        stat_text = f"{stat:.2f}"
        # A self-comparison can yield a tiny negative statistic from
        # floating-point round-off, which would be printed as "-0.00".
        # Only that exact rendering is normalised; all other values are untouched.
        if stat_text == "-0.00":
            stat_text = "0.00"
        kruskal_dict[name1][name2] = f"{stat_text} ({p:.3f})"

# Outer dict keys become the columns, inner keys the row index.
kruskal_df = pd.DataFrame(kruskal_dict)
display(kruskal_df)

Unnamed: 0,Consumer Durables and Apparel,Consumer Services,Retailing,Food and Staples Retailing,Automobiles and Components,"Food, Beverage and Tobacco",Household and Personal Products
Consumer Durables and Apparel,0.00 (1.000),18.21 (0.000),3.55 (0.060),5.15 (0.023),45.80 (0.000),0.60 (0.439),0.02 (0.886)
Consumer Services,18.21 (0.000),0.00 (1.000),27.21 (0.000),33.69 (0.000),1.21 (0.271),29.91 (0.000),18.54 (0.000)
Retailing,3.55 (0.060),27.21 (0.000),-0.00 (1.000),0.00 (0.968),55.85 (0.000),4.20 (0.040),3.59 (0.058)
Food and Staples Retailing,5.15 (0.023),33.69 (0.000),0.00 (0.968),0.00 (1.000),73.15 (0.000),4.93 (0.026),4.64 (0.031)
Automobiles and Components,45.80 (0.000),1.21 (0.271),55.85 (0.000),73.15 (0.000),0.00 (1.000),80.76 (0.000),51.91 (0.000)
"Food, Beverage and Tobacco",0.60 (0.439),29.91 (0.000),4.20 (0.040),4.93 (0.026),80.76 (0.000),0.00 (1.000),0.11 (0.741)
Household and Personal Products,0.02 (0.886),18.54 (0.000),3.59 (0.058),4.64 (0.031),51.91 (0.000),0.11 (0.741),0.00 (1.000)


### Hypothesis 2: Process-to-market speaking space index

In [6]:
# Build one list of non-missing 'Market_to_process_log' values per sub-sector.
# Labels are matched exactly against data['main_industry']; the order here
# fixes which list is bound to which name in the unpacking below.
# NOTE(review): the section heading says "process-to-market" while the column
# is named 'Market_to_process_log' - confirm the direction of the index.
industry_labels = [
    'Consumer Durables and Apparel (Primary)',
    'Consumer Services (Primary)',
    'Retailing (Primary)',
    'Food and Staples Retailing (Primary)',
    'Automobiles and Components (Primary)',
    'Food, Beverage and Tobacco (Primary)',
    'Household and Personal Products (Primary)',
]

(cons_dur_apparel, cons_services, retail, food_staples_retail,
 auto, food_bev_tob, household_personal) = [
    data.loc[
        (data['main_industry'] == label) & (data['Market_to_process_log'].notna()),
        'Market_to_process_log',
    ].to_list()
    for label in industry_labels
]

In [7]:
# Omnibus Kruskal-Wallis test over all seven sub-sector lists at once.
# The result is the cell's last expression, so it is shown as the cell output.
omnibus_groups = (
    cons_dur_apparel,
    cons_services,
    retail,
    food_staples_retail,
    auto,
    food_bev_tob,
    household_personal,
)
stats.kruskal(*omnibus_groups)

KruskalResult(statistic=76.7660215850911, pvalue=1.6608788794856263e-14)

In [8]:
# Pairwise follow-up to the omnibus test: run a two-sample Kruskal-Wallis test
# for every ordered pair of sub-sectors (including each group against itself)
# and tabulate the result as "statistic (p-value)".
# NOTE(review): the p-values are reported unadjusted although 21 distinct
# pairs are tested; consider a multiple-comparison correction before
# interpreting borderline cells.

ind_lists = [cons_dur_apparel, cons_services, retail,
             food_staples_retail, auto, food_bev_tob, household_personal]
names = ['Consumer Durables and Apparel','Consumer Services','Retailing',
         'Food and Staples Retailing','Automobiles and Components',
         'Food, Beverage and Tobacco','Household and Personal Products']

kruskal_dict = dict()

for ind1, name1 in zip(ind_lists, names):
    kruskal_dict[name1] = dict()
    for ind2, name2 in zip(ind_lists, names):
        stat, p = stats.kruskal(ind1, ind2)
        stat_text = f"{stat:.2f}"
        # A self-comparison can yield a tiny negative statistic from
        # floating-point round-off, which would be printed as "-0.00".
        # Only that exact rendering is normalised; all other values are untouched.
        if stat_text == "-0.00":
            stat_text = "0.00"
        kruskal_dict[name1][name2] = f"{stat_text} ({p:.3f})"

# Outer dict keys become the columns, inner keys the row index.
kruskal_df = pd.DataFrame(kruskal_dict)
display(kruskal_df)

Unnamed: 0,Consumer Durables and Apparel,Consumer Services,Retailing,Food and Staples Retailing,Automobiles and Components,"Food, Beverage and Tobacco",Household and Personal Products
Consumer Durables and Apparel,0.00 (1.000),0.51 (0.476),14.73 (0.000),15.73 (0.000),6.54 (0.011),7.68 (0.006),17.24 (0.000)
Consumer Services,0.51 (0.476),0.00 (1.000),13.88 (0.000),12.45 (0.000),1.52 (0.217),7.07 (0.008),13.11 (0.000)
Retailing,14.73 (0.000),13.88 (0.000),-0.00 (1.000),0.00 (0.958),37.07 (0.000),4.14 (0.042),0.04 (0.842)
Food and Staples Retailing,15.73 (0.000),12.45 (0.000),0.00 (0.958),0.00 (1.000),39.71 (0.000),4.76 (0.029),0.02 (0.875)
Automobiles and Components,6.54 (0.011),1.52 (0.217),37.07 (0.000),39.71 (0.000),0.00 (1.000),31.20 (0.000),44.88 (0.000)
"Food, Beverage and Tobacco",7.68 (0.006),7.07 (0.008),4.14 (0.042),4.76 (0.029),31.20 (0.000),0.00 (1.000),4.73 (0.030)
Household and Personal Products,17.24 (0.000),13.11 (0.000),0.04 (0.842),0.02 (0.875),44.88 (0.000),4.73 (0.030),0.00 (1.000)


### Hypothesis 3: Environment-to-social speaking space index

In [9]:
# Build one list of non-missing 'Env_to_social_log' values per sub-sector.
# Labels are matched exactly against data['main_industry']; the order here
# fixes which list is bound to which name in the unpacking below.
industry_labels = [
    'Consumer Durables and Apparel (Primary)',
    'Consumer Services (Primary)',
    'Retailing (Primary)',
    'Food and Staples Retailing (Primary)',
    'Automobiles and Components (Primary)',
    'Food, Beverage and Tobacco (Primary)',
    'Household and Personal Products (Primary)',
]

(cons_dur_apparel, cons_services, retail, food_staples_retail,
 auto, food_bev_tob, household_personal) = [
    data.loc[
        (data['main_industry'] == label) & (data['Env_to_social_log'].notna()),
        'Env_to_social_log',
    ].to_list()
    for label in industry_labels
]

In [10]:
# Omnibus Kruskal-Wallis test over all seven sub-sector lists at once.
# The result is the cell's last expression, so it is shown as the cell output.
omnibus_groups = (
    cons_dur_apparel,
    cons_services,
    retail,
    food_staples_retail,
    auto,
    food_bev_tob,
    household_personal,
)
stats.kruskal(*omnibus_groups)

KruskalResult(statistic=43.18094576951314, pvalue=1.0741071023682279e-07)

In [11]:
# Pairwise follow-up to the omnibus test: run a two-sample Kruskal-Wallis test
# for every ordered pair of sub-sectors (including each group against itself)
# and tabulate the result as "statistic (p-value)".
# NOTE(review): the p-values are reported unadjusted although 21 distinct
# pairs are tested; consider a multiple-comparison correction before
# interpreting borderline cells.

ind_lists = [cons_dur_apparel, cons_services, retail,
             food_staples_retail, auto, food_bev_tob, household_personal]
names = ['Consumer Durables and Apparel','Consumer Services','Retailing',
         'Food and Staples Retailing','Automobiles and Components',
         'Food, Beverage and Tobacco','Household and Personal Products']

kruskal_dict = dict()

for ind1, name1 in zip(ind_lists, names):
    kruskal_dict[name1] = dict()
    for ind2, name2 in zip(ind_lists, names):
        stat, p = stats.kruskal(ind1, ind2)
        stat_text = f"{stat:.2f}"
        # A self-comparison can yield a tiny negative statistic from
        # floating-point round-off, which would be printed as "-0.00".
        # Only that exact rendering is normalised; all other values are untouched.
        if stat_text == "-0.00":
            stat_text = "0.00"
        kruskal_dict[name1][name2] = f"{stat_text} ({p:.3f})"

# Outer dict keys become the columns, inner keys the row index.
kruskal_df = pd.DataFrame(kruskal_dict)
display(kruskal_df)

Unnamed: 0,Consumer Durables and Apparel,Consumer Services,Retailing,Food and Staples Retailing,Automobiles and Components,"Food, Beverage and Tobacco",Household and Personal Products
Consumer Durables and Apparel,0.00 (1.000),9.58 (0.002),1.17 (0.279),0.95 (0.330),1.51 (0.219),1.19 (0.275),11.92 (0.001)
Consumer Services,9.58 (0.002),0.00 (1.000),2.82 (0.093),5.00 (0.025),4.32 (0.038),6.92 (0.009),35.44 (0.000)
Retailing,1.17 (0.279),2.82 (0.093),-0.00 (1.000),0.18 (0.671),0.07 (0.796),0.26 (0.610),15.72 (0.000)
Food and Staples Retailing,0.95 (0.330),5.00 (0.025),0.18 (0.671),0.00 (1.000),0.04 (0.836),0.01 (0.915),21.31 (0.000)
Automobiles and Components,1.51 (0.219),4.32 (0.038),0.07 (0.796),0.04 (0.836),0.00 (1.000),0.04 (0.839),24.75 (0.000)
"Food, Beverage and Tobacco",1.19 (0.275),6.92 (0.009),0.26 (0.610),0.01 (0.915),0.04 (0.839),0.00 (1.000),25.22 (0.000)
Household and Personal Products,11.92 (0.001),35.44 (0.000),15.72 (0.000),21.31 (0.000),24.75 (0.000),25.22 (0.000),0.00 (1.000)


### Hypothesis 4: Supply chain topic breakdown

In [12]:
# Columns to test, one per supply-chain topic.
topic_list = [
    'Management systems_percent',
    'Deforestation_percent',
    'Human rights_percent',
    'Employee health & safety_percent',
    'Resource usage_percent',
    'Certifications & training_percent',
    'Collaborations & partnerships_percent',
    'Plans and progress_percent',
    'Governance & stakeholders_percent',
    'Policies_percent',
    'Product quality_percent',
    'Diversity & inclusion_percent',
    'Agriculture_percent',
    'Risk assessments_percent',
    'Chemicals_percent',
    'Transportation & logistics_percent',
    'Society_percent',
    'Store operations_percent',
    'Materials & packaging_percent',
]

# Sub-sector labels, matched exactly against data['main_industry'].
industry_labels = [
    'Consumer Durables and Apparel (Primary)',
    'Consumer Services (Primary)',
    'Retailing (Primary)',
    'Food and Staples Retailing (Primary)',
    'Automobiles and Components (Primary)',
    'Food, Beverage and Tobacco (Primary)',
    'Household and Personal Products (Primary)',
]

# The row masks do not depend on the topic, so build them once up front.
industry_masks = [data['main_industry'] == label for label in industry_labels]

topic_kruskal_dict = {}

for topic in topic_list:
    # Per-sub-sector values of this topic column. Missing values are not
    # filtered here; nan_policy='omit' below drops them inside the test.
    (cons_dur_apparel, cons_services, retail, food_staples_retail,
     auto, food_bev_tob, household_personal) = [
        data.loc[mask, topic].to_list() for mask in industry_masks
    ]

    # Omnibus Kruskal-Wallis test across the seven sub-sectors for this topic.
    # NOTE(review): one test per topic is run and the p-values are reported
    # unadjusted; consider a multiple-comparison correction.
    stat, p = stats.kruskal(
        cons_dur_apparel, cons_services, retail, food_staples_retail,
        auto, food_bev_tob, household_personal,
        nan_policy='omit',
    )

    topic_kruskal_dict[topic] = f"{stat:.2f} ({p:.3f})"

topic_kruskal_dict

{'Management systems_percent': '243.70 (0.000)',
 'Deforestation_percent': '267.72 (0.000)',
 'Human rights_percent': '30.17 (0.000)',
 'Employee health & safety_percent': '141.95 (0.000)',
 'Resource usage_percent': '42.20 (0.000)',
 'Certifications & training_percent': '54.43 (0.000)',
 'Collaborations & partnerships_percent': '85.28 (0.000)',
 'Plans and progress_percent': '41.81 (0.000)',
 'Governance & stakeholders_percent': '5.22 (0.516)',
 'Policies_percent': '18.47 (0.005)',
 'Product quality_percent': '138.08 (0.000)',
 'Diversity & inclusion_percent': '61.57 (0.000)',
 'Agriculture_percent': '297.43 (0.000)',
 'Risk assessments_percent': '43.70 (0.000)',
 'Chemicals_percent': '127.89 (0.000)',
 'Transportation & logistics_percent': '49.39 (0.000)',
 'Society_percent': '48.52 (0.000)',
 'Store operations_percent': '139.55 (0.000)',
 'Materials & packaging_percent': '118.93 (0.000)'}

In [13]:
# Show the per-topic "statistic (p-value)" strings as a Series, one row per topic.
topic_kruskal_series = pd.Series(topic_kruskal_dict)
topic_kruskal_series

Management systems_percent               243.70 (0.000)
Deforestation_percent                    267.72 (0.000)
Human rights_percent                      30.17 (0.000)
Employee health & safety_percent         141.95 (0.000)
Resource usage_percent                    42.20 (0.000)
Certifications & training_percent         54.43 (0.000)
Collaborations & partnerships_percent     85.28 (0.000)
Plans and progress_percent                41.81 (0.000)
Governance & stakeholders_percent          5.22 (0.516)
Policies_percent                          18.47 (0.005)
Product quality_percent                  138.08 (0.000)
Diversity & inclusion_percent             61.57 (0.000)
Agriculture_percent                      297.43 (0.000)
Risk assessments_percent                  43.70 (0.000)
Chemicals_percent                        127.89 (0.000)
Transportation & logistics_percent        49.39 (0.000)
Society_percent                           48.52 (0.000)
Store operations_percent                 139.55 