In [39]:
import ast
import numpy as np
import pandas as pd
from tabulate import tabulate
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay

In [40]:
def apply_threshold(array, threshold):
    """Binarise ``array``: 1 where a value is at least ``threshold``, otherwise 0.

    Args:
        array: Values to compare against ``threshold`` (anything supporting
            ``>=`` with it, e.g. a NumPy array).
        threshold: Inclusive cut-off.

    Returns:
        The result of ``np.where`` over the comparison mask, holding 1s and 0s.
    """
    meets_threshold = array >= threshold
    return np.where(meets_threshold, 1, 0)


def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated with NumPy.

    Args:
        x: Scalar or array-like supporting unary minus.

    Returns:
        The logistic transform of ``x``, element-wise for arrays.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def multilabel_confusion_matrix_plot(decoded_labels, predictions, targets):
    """Plot one confusion matrix per label on a near-square grid of subplots.

    Args:
        decoded_labels: Sequence of label names; entry ``i`` is used as the
            title of the ``i``-th confusion matrix.
        predictions: Predicted label indicators, forwarded to
            ``multilabel_confusion_matrix`` as ``y_pred``.
        targets: Ground-truth label indicators, forwarded as ``y_true``.

    Raises:
        ValueError: If ``decoded_labels`` is empty.
    """
    num_labels = len(decoded_labels)
    if num_labels == 0:
        raise ValueError("decoded_labels must contain at least one label")

    # Round the grid *up* so every label gets a subplot. Flooring the square
    # root (e.g. 18 labels -> 4 x 4 = 16 axes) silently dropped the trailing
    # labels whenever the label count was not a perfect square.
    num_cols = int(np.ceil(np.sqrt(num_labels)))
    num_rows = int(np.ceil(num_labels / num_cols))

    cm = multilabel_confusion_matrix(targets, predictions)

    # squeeze=False keeps `axes` a 2-D array even for a 1 x 1 grid, so
    # `.flatten()` below is always valid.
    _, axes = plt.subplots(
        nrows=num_rows, ncols=num_cols, figsize=(24, 24), squeeze=False
    )

    # Fill the grid label by label; leftover cells are hidden.
    for i, ax in enumerate(axes.flatten()):
        if i < num_labels:
            disp = ConfusionMatrixDisplay(confusion_matrix=cm[i])
            disp.plot(ax=ax, colorbar=True)
            disp.ax_.set_title(f"{decoded_labels[i]}")
        else:
            ax.axis("off")
    plt.tight_layout()
    plt.show()

In [41]:
# Read the train / validation / test splits, in that order, from their CSV files.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/internal/splits/final/train.csv",
        "data/internal/splits/final/val.csv",
        "data/internal/splits/final/test.csv",
    )
)

In [42]:
def get_freq_table(df):
    """Build a text table of per-behaviour annotation counts and percentages.

    Every column except the first and the last is treated as a behaviour
    column. A behaviour's count is the sum of its column, and its percentage
    is that count relative to the sum over all behaviour columns.
    NOTE(review): this assumes the behaviour columns are 0/1 indicators, so
    that a column sum equals the number of positive rows — confirm against
    the CSV schema.

    Args:
        df: DataFrame whose columns ``[1:-1]`` are the behaviour columns.

    Returns:
        The ``tabulate`` rendering (``orgtbl`` format) of the rows
        ``(behaviour, count, percentage)``, sorted by descending percentage.
    """
    behaviour_columns = df.columns[1:-1]

    # Take the counts from the column sums, consistently with the total.
    # Unpacking `value_counts()` into two names yields the counts in
    # descending-frequency order: it raised ValueError for a column with a
    # single distinct value (e.g. a behaviour absent from a split) and
    # reported the number of zeros whenever positives were the majority.
    counts = df[behaviour_columns].sum(axis=0)
    total_annotations = counts.sum()

    rows = [
        (
            behaviour,
            int(n_positive),
            # Guard against an empty / all-zero frame instead of dividing by 0.
            (n_positive / total_annotations) * 100 if total_annotations else 0.0,
        )
        for behaviour, n_positive in counts.items()
    ]
    # Stable sort: behaviours with equal percentages keep their column order.
    rows.sort(key=lambda row: row[2], reverse=True)

    return tabulate(
        rows,
        headers=["Behaviour", "Count", "Percentage"],
        tablefmt="orgtbl",
    )

In [43]:
# Print the behaviour frequency table built from the training split.
print(get_freq_table(train_df))

| Behaviour                   |   Count |   Percentage |
|-----------------------------+---------+--------------|
| p_feeding                   |    4412 |    19.263    |
| p_travel                    |    3655 |    15.9579   |
| p_no_behaviour              |    3091 |    13.4955   |
| p_resting                   |    3013 |    13.1549   |
| p_camera_reaction           |    1561 |     6.8154   |
| p_climbing                  |    1430 |     6.24345  |
| p_social_interaction        |    1359 |     5.93346  |
| p_chimp_carrying            |    1332 |     5.81558  |
| p_tool_use                  |     807 |     3.5234   |
| p_vocalisation              |     722 |     3.15229  |
| p_object_carrying           |     458 |     1.99965  |
| p_grooming                  |     455 |     1.98655  |
| p_display                   |     160 |     0.698568 |
| p_aggression                |     122 |     0.532658 |
| p_sex                       |     116 |     0.506462 |
| p_bipedal                   |

In [27]:
# Most-frequent baseline: parse the `label` column of the training split
# (stored as Python-literal strings) into an array, pair it with constant
# placeholder features of the same shape, and fit a DummyClassifier on them.
labels = np.array(
    [ast.literal_eval(raw_label) for raw_label in train_df["label"].tolist()]
)
X = np.ones_like(labels, dtype=float)
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X=X, y=labels)

In [29]:
# Score the fitted dummy classifier on the same X / labels it was fitted on.
# NOTE(review): for a 2-D multilabel target, scikit-learn's `score` should be
# subset accuracy (every label of a sample must match) — confirm; that would
# explain a very low value here.
dummy_clf.score(X=X, y=labels)

0.0040208352371378965

In [31]:
# Show only the shape of the baseline predictions instead of dumping the whole
# array; the recorded output of this cell is a shape tuple.
dummy_clf.predict(X=X).shape

(10943, 18)