In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Apply matplotlib's 'dark_background' style sheet to figures created from here on.
plt.style.use('dark_background')

# Example of multi-label multi-class dataframe

In [13]:
# Toy multi-label dataset: one column per label, one row per sample.
# dtype='category' stores every column as a pandas categorical.
df = pd.DataFrame(
    dict(
        label1=['classA', 'classB', 'classB', 'classC', 'classC', 'classC'],
        label2=['classD', 'classD', 'classD', 'classE', 'classF', 'classG'],
        label3=['classA', 'classB', 'classC', 'classD', 'classE', 'classF'],
    ),
    dtype='category',
)
df

Unnamed: 0,label1,label2,label3
0,classA,classD,classA
1,classB,classD,classB
2,classB,classD,classC
3,classC,classE,classD
4,classC,classF,classE
5,classC,classG,classF


# Number of unique classes per label

In [14]:
# Count the distinct classes observed in each label column.
nunique_classes = df.nunique()
nunique_classes

label1    3
label2    4
label3    6
dtype: int64

# From labels to dummy variables

In [15]:
# One-hot encode every label column; the dummy columns are named '<label>_<class>'.
# The dtype is pinned so the dummies are 0/1 integers on every pandas version:
# pandas >= 2.0 returns booleans by default, while earlier versions returned uint8.
y = pd.get_dummies(df, dtype='uint8')
y

Unnamed: 0,label1_classA,label1_classB,label1_classC,label2_classD,label2_classE,label2_classF,label2_classG,label3_classA,label3_classB,label3_classC,label3_classD,label3_classE,label3_classF
0,1,0,0,1,0,0,0,1,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0,0,0,0
2,0,1,0,1,0,0,0,0,0,1,0,0,0
3,0,0,1,0,1,0,0,0,0,0,1,0,0
4,0,0,1,0,0,1,0,0,0,0,0,1,0
5,0,0,1,0,0,0,1,0,0,0,0,0,1


# Class indices to the dummy variables

In [16]:
# Running total of classes per label, i.e. the (exclusive) end offset of each
# label's block of dummy columns.
label_margins = nunique_classes.cumsum()
# A block starts `size` columns before its end offset.
class_indices = [
    list(range(end - size, end))
    for size, end in zip(nunique_classes, label_margins)
]
class_indices

[[0, 1, 2], [3, 4, 5, 6], [7, 8, 9, 10, 11, 12]]

# Accessing classes per label

Various ways to do that:

In [17]:
# Four views of the label -> classes mapping: keyed by label position or label
# name, holding either dummy-column positions or dummy-column names.
print('dict of label indices and class indices:')
print(dict(enumerate(class_indices)))
print('\ndict of labels and classes:')
print({name: y.columns[cols].tolist() for name, cols in zip(df.columns, class_indices)})
print('\ndict of labels and class indices:')
print(dict(zip(df.columns, class_indices)))
print('\ndict of label indices and classes:')
print({pos: y.columns[cols].tolist() for pos, cols in enumerate(class_indices)})

dict of label indices and class indices:
{0: [0, 1, 2], 1: [3, 4, 5, 6], 2: [7, 8, 9, 10, 11, 12]}

dict of labels and classes:
{'label1': ['label1_classA', 'label1_classB', 'label1_classC'], 'label2': ['label2_classD', 'label2_classE', 'label2_classF', 'label2_classG'], 'label3': ['label3_classA', 'label3_classB', 'label3_classC', 'label3_classD', 'label3_classE', 'label3_classF']}

dict of labels and class indices:
{'label1': [0, 1, 2], 'label2': [3, 4, 5, 6], 'label3': [7, 8, 9, 10, 11, 12]}

dict of label indices and classes:
{0: ['label1_classA', 'label1_classB', 'label1_classC'], 1: ['label2_classD', 'label2_classE', 'label2_classF', 'label2_classG'], 2: ['label3_classA', 'label3_classB', 'label3_classC', 'label3_classD', 'label3_classE', 'label3_classF']}


In [18]:
# Show each label's block of dummy columns under the label's name.
for label_name, label_cols in zip(df.columns, class_indices):
    print('Label:', label_name)
    display(y.iloc[:, label_cols])

Label: label1


Unnamed: 0,label1_classA,label1_classB,label1_classC
0,1,0,0
1,0,1,0
2,0,1,0
3,0,0,1
4,0,0,1
5,0,0,1


Label: label2


Unnamed: 0,label2_classD,label2_classE,label2_classF,label2_classG
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,0,1,0,0
4,0,0,1,0
5,0,0,0,1


Label: label3


Unnamed: 0,label3_classA,label3_classB,label3_classC,label3_classD,label3_classE,label3_classF
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,1,0,0,0
3,0,0,0,1,0,0
4,0,0,0,0,1,0
5,0,0,0,0,0,1


# Dummy sums

## Column sums: class frequency

In [19]:
# Sum down the rows: how many samples carry each class.
y.sum(axis=0)

label1_classA    1
label1_classB    2
label1_classC    3
label2_classD    3
label2_classE    1
label2_classF    1
label2_classG    1
label3_classA    1
label3_classB    1
label3_classC    1
label3_classD    1
label3_classE    1
label3_classF    1
dtype: int64

## Row sums: always sum up to the number of labels

In [20]:
# Sum across the dummy columns of each sample.
y.sum(axis='columns')

0    3
1    3
2    3
3    3
4    3
5    3
dtype: int64

## Column sums by label: label frequency = number of rows

In [21]:
# Total number of ones inside each label's block of dummy columns.
{name: y.iloc[:, cols].sum().sum() for name, cols in zip(df.columns, class_indices)}

{'label1': 6, 'label2': 6, 'label3': 6}

# Dummy averages

## Column averages: class probabilities 

In [22]:
# Average down the rows: the share of samples carrying each class.
y.mean(axis=0)

label1_classA    0.166667
label1_classB    0.333333
label1_classC    0.500000
label2_classD    0.500000
label2_classE    0.166667
label2_classF    0.166667
label2_classG    0.166667
label3_classA    0.166667
label3_classB    0.166667
label3_classC    0.166667
label3_classD    0.166667
label3_classE    0.166667
label3_classF    0.166667
dtype: float64

## Products of class probabilities

In [23]:
# Multiply the averages of all dummy columns together and print with 20 decimals.
print(format(y.mean().prod(), ',.20f'))

0.00000000137818097399


Factors of the overall product above, one per label:

In [24]:
# For each label, multiply the column averages of that label's dummy columns.
[y.iloc[:, cols].mean(axis=0).prod() for cols in class_indices]

[0.027777777777777776, 0.0023148148148148147, 2.1433470507544577e-05]

## Column averages by label: label probabilities

In [25]:
# Average over every cell of each label's block of dummy columns.
[y.iloc[:, cols].to_numpy().mean() for cols in class_indices]

[0.3333333333333333, 0.25, 0.16666666666666666]

This equals the reciprocal of the number of unique classes per label, because every row contains exactly one 1 within each label's block of dummy columns:

In [29]:
# Reciprocal of the number of unique classes per label.
1 / nunique_classes.to_numpy()

array([0.33333333, 0.25      , 0.16666667])

## Product of column averages by label: product of label probabilities

In [30]:
# Multiply the per-label averages (mean over every cell of each label's block).
# np.prod replaces np.product, which was deprecated in NumPy 1.25 and removed
# in NumPy 2.0.
np.prod([y.iloc[:, cols].values.mean() for cols in class_indices])

0.013888888888888888

## Row averages: always equal to the labels / classes ratio

In [31]:
# Average across the dummy columns of each sample.
y.mean(axis='columns')

0    0.230769
1    0.230769
2    0.230769
3    0.230769
4    0.230769
5    0.230769
dtype: float64

In [32]:
# Number of label columns divided by the number of dummy (class) columns.
df.shape[1] / y.shape[1]

0.23076923076923078

## Products of row averages

In [33]:
# Multiply the per-sample averages of all rows together.
y.mean(axis='columns').prod()

0.00015103145784305954

In [34]:
# Per label: average each sample over that label's dummy columns, then
# multiply those per-sample averages together.
[y.iloc[:, cols].mean(axis='columns').prod() for cols in class_indices]

[0.001371742112482853, 0.000244140625, 2.1433470507544577e-05]