In [1]:
from pathlib import Path
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pandas as pd
from scipy.stats import chi2_contingency
from itertools import combinations

In [2]:
pd.options.plotting.backend = "plotly"

### Load results

In [4]:
dataf = pd.read_csv("data/classified-mapped.csv").assign(**{
    "Purpose Categories": lambda d: d["Purpose Categories"].apply(lambda x: eval(x))
})

### Analysis

#### Category distribution

In [5]:
dataf["Purpose Categories"].explode().value_counts()

Purpose Categories
Sustainability                             204
Innovation, Transformation & Technology    203
Customer centrism                          165
Positive impact                             88
Stakeholder value                           87
Leadership & Excellence                     67
Well-being                                  57
Efficiency & Improvement                    46
Growth                                      41
Community & Collaboration                   37
Principles                                  34
Corporate culture & Inclusivity             24
Corporate responsibility                    18
Accessibility                               18
Empowerment                                 17
Responsibility                              15
Affordability                               12
Uncategorized                                3
Name: count, dtype: int64

In [6]:
dataf["Purpose Categories"].explode().value_counts().plot.bar()

#### Category correlation

In [7]:
dataf

Unnamed: 0,Company name,Purpose Text,Purpose Categories
0,AB Skf,Condition monitoring and data collection will ...,"[Innovation, Transformation & Technology, Effi..."
1,Abb Ltd,ABB's purpose is to enable a more sustainable ...,"[Leadership & Excellence, Sustainability]"
2,Abrdn PLC,"At abrdn, our purpose is to enable our clients...","[Growth, Customer centrism, Empowerment]"
3,Acciona SA,Our Purpose: BUSINESS AS UNUSUAL\nThere is a d...,"[Growth, Positive impact, Leadership & Excelle..."
4,Accor SA,By staying focused on the future and on our\nv...,"[Leadership & Excellence, Innovation, Transfor..."
...,...,...,...
432,Yara International ASA,Vision: A collaborative society; a world witho...,"[Positive impact, Community & Collaboration, S..."
433,Zalando SE,"At Zalando, our purpose of reimagining fashion...","[Customer centrism, Innovation, Transformation..."
434,Zur Rose Group AG,"With its business model, the Zur Rose Group of...","[Customer centrism, Affordability, Innovation,..."
435,AAK AB (publ),"Purpose: Our purpose, and everything we do, is...","[Customer centrism, Innovation, Transformation..."


In [8]:
def get_dummies(df):
    
    # Create binary encoding for each label
    binary_encoded = pd.get_dummies(df['Purpose Categories'])
    binary_encoded['Company name'] = df['Company name']
    return binary_encoded.groupby('Company name').max()

# Explode the 'Purpose Categories' column
correlation_matrix = (
    dataf.explode('Purpose Categories')
    [lambda x: x['Purpose Categories'] != "Uncategorized"]
    .pipe(get_dummies)
    .corr()
    .round(3)
)

# Create a heatmap to visualize the correlation matrix
fig = px.imshow(
    correlation_matrix,
    text_auto=True,
    aspect="auto", 
    labels=dict(x="Label", y="Label", color="Correlation"),
    x=correlation_matrix.columns, 
    y=correlation_matrix.columns
)

fig.update_layout(
    title_text='Pearson Correlation Matrix of Purpose Categories',
    title_x=0.5,
    width=800,
    height=800,
    font=dict(
        size=10,
    )
)
fig.show()


#### Cluster analysis

In [9]:
# Explode 'Purpose Categories' and create a binary encoding
binary_encoded = (
    dataf.drop(columns=["Purpose Text"])
    .explode('Purpose Categories')
    .pipe(lambda df: pd.get_dummies(df, columns=['Purpose Categories']))
    .groupby('Company name')
    .sum()
)

#### Finding number of clusters
Calculating inertia for a range of k values

In [10]:

inertia = []
k_values = range(1, 20)
for k in k_values:
    kmeans = KMeans(
        n_clusters=k,
        random_state=1234,
        n_init="auto"
    )
    kmeans.fit(binary_encoded)
    inertia.append(kmeans.inertia_)

fig = px.line(
    x=k_values,
    y=inertia,
    markers=True,
    title="Elbow Method for Optimal k",
    labels={'x': 'Number of clusters (k)', 'y': 'Inertia'},
    width=800,
    height=500
)
fig.show()


In [11]:
def visualize_clusters(df, n_clusters):

    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init="auto")
    clusters = kmeans.fit_predict(df)

    # Applying PCA for dimensionality reduction
    principal_components = PCA(n_components=2).fit_transform(df)
    principal_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
    principal_df['Cluster'] = clusters
    principal_df['Company name'] = df.index

        # Plotting using Plotly
    return principal_df, px.scatter(
        principal_df,
        x='PC1',
        y='PC2',
        color='Cluster', 
        title="PCA of Companies with Clusters",
        labels={'Cluster': 'Cluster', 'PC1': 'Principal Component 1', 'PC2': 'Principal Component 2'},
        hover_data=['Company name', 'Cluster'],
        width=800,
        height=600
    )
    

In [12]:
clustered_df, fig = visualize_clusters(binary_encoded, 15)
fig.show()


In [13]:
_, fig = visualize_clusters(binary_encoded, 5)
fig.show()

#### Save clustered data

In [14]:
(
    dataf.merge(
        clustered_df[["Company name", "Cluster"]],
        on="Company name"
    )
    .to_csv("data/classified-clustered.csv", index=False)
)

#### Chi-squared test
Are the categories independent?

In [15]:
binary_encoded = (
    dataf.explode('Purpose Categories')
    .pipe(lambda df: pd.get_dummies(df, columns=['Purpose Categories']))
    .groupby('Company name')
    .sum()
)

# Function to perform Chi-Square Test on a pair of labels
def chi_square_test(df, label1, label2):
    table = pd.crosstab(df[label1], df[label2])
    chi2, p, dof, ex = chi2_contingency(table)
    return chi2, p

# Prepare a list to collect results
chi_square_results = []

# Iterating over each combination of labels and performing Chi-Square Test
alpha = 0.05  # Significance level
for label1, label2 in combinations(binary_encoded.columns, 2):
    chi2, p = chi_square_test(binary_encoded, label1, label2)
    independence = "Independence" if p > alpha else "Dependence"
    chi_square_results.append({
        'Label 1': label1, 
        'Label 2': label2, 
        'Chi-Square Statistic': chi2, 
        'P-Value': p, 
        'Result': independence
    })
    
results_df = (
    pd.DataFrame(chi_square_results)
    .replace({"Purpose Categories_": ""}, regex=True)
)

In [16]:
# Filter to show only significant results
results_df[results_df['P-Value'] < 0.05].sort_values(by='P-Value', ascending=True)

Unnamed: 0,Label 1,Label 2,Chi-Square Statistic,P-Value,Result
102,Customer centrism,Sustainability,18.12586,2.1e-05,Dependence
98,Customer centrism,Positive impact,16.939314,3.9e-05,Dependence
110,Efficiency & Improvement,Principles,11.871992,0.00057,Dependence
84,Corporate responsibility,"Innovation, Transformation & Technology",10.967083,0.000927,Dependence
165,Stakeholder value,Sustainability,9.575385,0.001972,Dependence
152,Positive impact,Stakeholder value,8.95853,0.002762,Dependence
147,Leadership & Excellence,Sustainability,8.225616,0.00413,Dependence
69,Corporate culture & Inclusivity,Empowerment,7.766413,0.005323,Dependence
123,Empowerment,Sustainability,7.266198,0.007026,Dependence
115,Efficiency & Improvement,Well-being,6.480025,0.010909,Dependence


In [17]:
(
    px.imshow(
        results_df.pivot(index="Label 1", columns="Label 2", values="P-Value").round(3),
        text_auto=True,
        aspect='auto',
        color_continuous_scale='Viridis',
        width=800,
        height=900
    )
    .update_layout(
        title='P-Value Heatmap for Label Pairs',
        xaxis_title='Label 2',
        yaxis_title='Label 1',
        title_x=0.3,
    )
)
