In [None]:
# Copyright (C) 2022 Mila - Institut québécois d'intelligence artificielle
# SPDX-License-Identifier: Apache-2.0

In [None]:
# This notebook visualizes the labels and their statistics for
# the tight crop and tight crop imagette datasets.

In [None]:
import os

from collections import Counter

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from anomalib.data.utils import read_image

%matplotlib inline

In [None]:
# To adapt: directory that contains the "CableInspect-AD" dataset folder.
# `expanduser` honors $HOME when it is set and otherwise falls back to the
# platform's own lookup, so the cell does not raise KeyError where HOME is undefined.
root_directory = os.path.expanduser("~")

In [None]:
# Read the labels table from the dataset folder
data_dir = os.path.join(root_directory, "CableInspect-AD")
labels_csv = os.path.join(data_dir, "labels.csv")
df = pd.read_csv(labels_csv)
# Composite keys: "<cable_id>-<side_id>" and "<cable_id>-<side_id>-<pass_id>"
df["cable_side_ids"] = df["cable_id"] + "-" + df["side_id"]
df["cable_side_pass_ids"] = df["cable_side_ids"] + "-" + df["pass_id"].astype("str")

In [None]:
# Columns that describe an anomaly annotation.
# The sanity checks in this notebook expect them to be all-NaN on nominal rows
# and free of NaN on anomalous rows.
anomaly_columns = [
    "anomaly_type_id",
    "anomaly_type",
    "bbox_area",
    "primary_identification",
    "secondary_identification",
    "anomaly_grade",
    "identification",
    "mask_path",
]

In [None]:
# Summarize the labels table:
# - its dimensions (rows, columns);
# - for every column, the number of distinct values and the values themselves.
nb_rows, nb_columns = df.shape
print(f"# of rows: {nb_rows}, # of columns: {nb_columns}\n")
for col in df.columns:
    column_values = df[col]
    underline = "-" * len(col)
    print(col)
    print(underline)
    print(f"# unique values: {column_values.nunique()}")
    print(f"unique values: {column_values.unique()}")
    print("\n")

In [None]:
# Show the first five rows of the labels dataframe
# (the bare last expression is what the cell displays).
df.head()

In [None]:
# Nominal samples are the rows whose label_index equals 0
is_nominal = df["label_index"] == 0
df_nominal = df.loc[is_nominal].copy()
nb_nominal_images = df_nominal["image_path"].nunique()
print(f"Number of nominal images: {nb_nominal_images}")

In [None]:
# Sanity check: on nominal rows every anomaly column must be NaN
nominal_anomaly_info = df_nominal[anomaly_columns]
assert nominal_anomaly_info.isna().all().all()

In [None]:
# Print number of nominal images per cable.
# Images are counted through their unique paths, like the per-side and per-pass
# summaries, so the figure remains an image count even if a path is listed twice.
for cable in df_nominal["cable_id"].unique():
    df_cable = df_nominal[df_nominal["cable_id"] == cable]
    nb_images = df_cable["image_path"].nunique()
    print(f"{cable} - total # of nominal images: {nb_images}")

In [None]:
# Print number of nominal images per cable side
# (the anomaly count is not computed here: it was never printed for nominal rows).
for cable_side in df_nominal["cable_side_ids"].unique():
    df_cable = df_nominal[df_nominal["cable_side_ids"] == cable_side]
    nb_images = df_cable["image_path"].nunique()
    print(f"{cable_side} - # nominal images: {nb_images}")

In [None]:
# Print number of nominal images per cable side pass, keys in sorted order
images_per_pass = df_nominal.groupby("cable_side_pass_ids")["image_path"].nunique()
for cable_side_pass, nb_images in images_per_pass.items():
    print(f"{cable_side_pass} - # nominal images: {nb_images}")

In [None]:
# Bar plot: nominal frame count for each cable
group = df_nominal["cable_id"].value_counts()
group.plot(kind="bar", xlabel="Cable id.", ylabel="Number of nominal frames")
plt.xticks(rotation="horizontal")
# The x label is blanked after plotting; the tick labels already name the cables
plt.xlabel("")
plt.show()

In [None]:
# Bar plot: nominal frame count for each cable, one bar per side
side_counts = df_nominal.groupby("cable_id")["side_id"].value_counts()
group = side_counts.unstack()
group.plot(kind="bar", xlabel="Cable id.", ylabel="Number of nominal frames")
plt.xticks(rotation="horizontal")
plt.xlabel("")
plt.legend(title="Side ID")
plt.show()

In [None]:
# Bar plot: nominal frame count for each cable, one bar per pass
pass_counts = df_nominal.groupby(["cable_id"])["pass_id"].value_counts()
group = pass_counts.unstack()
group.plot(kind="bar", xlabel="Cable id.", ylabel="Number of nominal frames")
plt.xticks(rotation="horizontal")
plt.xlabel("")
plt.legend(title="Pass ID")
plt.show()

In [None]:
# Bar plot: nominal frame count for each (cable, side), one bar per pass
pass_counts = df_nominal.groupby(["cable_id", "side_id"])["pass_id"].value_counts()
group = pass_counts.unstack()
ax = group.plot(kind="bar", xlabel="(Cable id., Side id.)", ylabel="Number of nominal frames")
xtickspos = ax.get_xticks()
# Rewrite each tick text: ", " becomes "-" and the first and last characters are dropped
xtickslabels = []
for tick in ax.get_xticklabels():
    xtickslabels.append(tick.get_text().replace(", ", "-")[1:-1])
plt.xticks(rotation="horizontal", ticks=xtickspos, labels=xtickslabels)
plt.xlabel("")
plt.legend(title="Pass ID")
plt.show()

In [None]:
# Anomalous samples are the rows whose label_index equals 1
is_anomalous = df["label_index"] == 1
df_anomalous = df.loc[is_anomalous].copy()
nb_anomalous_images = df_anomalous["image_path"].nunique()
print(f"Number of anomalous images: {nb_anomalous_images}")

In [None]:
# Count the distinct anomaly identifiers among the anomalous rows
nb_unique_anomalies = df_anomalous["identification"].nunique()
print(f"Number of unique anomalies: {nb_unique_anomalies}")

In [None]:
# Distinct primary identifications, minus one.
# NOTE(review): presumably one value marks the anomalies already present on the cable - confirm.
text = "Number of anomalies fabricated (different from those that were already there on the cable)"
nb_fabricated = df_anomalous["primary_identification"].nunique() - 1
print(f"{text}: {nb_fabricated}")

In [None]:
# Sanity check: no anomaly column may hold a missing value on anomalous rows
anomalous_info = df_anomalous[anomaly_columns]
assert anomalous_info.notna().all().all()

In [None]:
# Per cable (in order of first appearance): anomalous images and unique anomalies
for cable, df_cable in df_anomalous.groupby("cable_id", sort=False):
    nb_images = df_cable["image_path"].nunique()
    nb_unique_anomalies = df_cable["identification"].nunique()
    print(f"Cable: {cable} - # anomalous images {nb_images}, # unique anomalies {nb_unique_anomalies}")

In [None]:
# Per cable side (in order of first appearance): anomalous images and unique anomalies
for cable_side, df_cable in df_anomalous.groupby("cable_side_ids", sort=False):
    nb_images = df_cable["image_path"].nunique()
    nb_unique_anomalies = df_cable["identification"].nunique()
    print(f"{cable_side} - # anomalous images: {nb_images}; # unique anomalies: {nb_unique_anomalies}")

In [None]:
# Per cable side pass (sorted keys): anomalous images and unique anomalies
for cable_side_pass, df_cable in df_anomalous.groupby("cable_side_pass_ids"):
    nb_images = df_cable["image_path"].nunique()
    nb_unique_anomalies = df_cable["identification"].nunique()
    print(f"{cable_side_pass} - # anomalous images: {nb_images}; # unique anomalies: {nb_unique_anomalies}")

In [None]:
# Bar plot: number of frames for each anomaly type
group = df_anomalous["anomaly_type"].value_counts()
group.plot(kind="bar", xlabel="Anomaly types", ylabel="Number of frames")
plt.show()

In [None]:
# Bar plot: number of frames for each anomaly type, one bar per cable
cable_counts = df_anomalous.groupby("anomaly_type")["cable_id"].value_counts()
group = cable_counts.unstack()
group.plot(kind="bar", xlabel="Anomaly types", ylabel="Number of frames")
plt.show()

In [None]:
# One figure per anomaly type (in order of first appearance):
# number of frames for each anomaly grade, one bar per cable
for anomaly_type, temp_df in df_anomalous.groupby("anomaly_type", sort=False):
    cable_counts = temp_df.groupby("anomaly_grade")["cable_id"].value_counts()
    group = cable_counts.unstack()
    group.plot(kind="bar", xlabel=anomaly_type, ylabel="Number of frames")
    plt.show()

In [None]:
# Show summary statistics of the bounding box areas over the whole labels table
df["bbox_area"].describe()

In [None]:
# Visualize masks with anomaly on the border:
# a row is reported when its mask has no positive pixel once a frame of
# `nb_border_pixel` pixels has been removed on every side.
nb_border_pixel = 20  # To adapt: number of pixel to include in the border
for _, row in df_anomalous.iterrows():
    img_path = os.path.join(data_dir, row["image_path"])
    img = read_image(img_path)
    # Swap the folder name on the dataset-relative path only, so that an
    # "images" component in the user's root directory is never rewritten.
    mask_path = os.path.join(data_dir, row["image_path"].replace("images", "masks"))
    mask = read_image(mask_path)
    # NOTE(review): the checks below assume read_image returns 0..255 values - confirm
    # against the installed anomalib version.
    mask_h, mask_w = mask.shape[:2]

    # The mask must be binary: its maximum is 255, and its minimum is 0 unless
    # every value is 255 (fully positive mask).
    if mask.sum() != mask.size * 255:
        assert mask.min() == 0
    else:
        assert mask.min() == 255
    assert mask.max() == 255

    # Explicit end indices: a `-nb_border_pixel` end would yield an empty slice
    # (and report every mask) when the border width is set to 0.
    inner_mask = mask[nb_border_pixel : mask_h - nb_border_pixel, nb_border_pixel : mask_w - nb_border_pixel]
    if inner_mask.sum() == 0:
        print(img_path)
        print(mask_path)
        print(mask.shape)
        # Fraction of positive mask values
        print(round(mask.sum() / 255 / mask.size, 3))
        f, axarr = plt.subplots(1, 2)
        axarr[0].imshow(img)
        axarr[1].imshow(mask)
        plt.title(f"Bbox area: {row['bbox_area']}")
        plt.show()

In [None]:
# One histogram of bounding box areas per anomaly type
# Y-axis: Count
# X-axis: Bounding box area (% of the image)
# NOTE(review): bbox_area is presumably already normalized by the image size - confirm.
bbox_areas = df_anomalous["bbox_area"]
bbox_areas.hist(by=df_anomalous["anomaly_type"], figsize=(15, 15))
plt.show()

In [None]:
# Histogram bin edges for the bounding box areas: 100 equal-width bins over [0, 1]
nb_bins = 100
bins = np.linspace(0, 1, num=nb_bins + 1, endpoint=True)

In [None]:
# Overlay the bounding box area histograms of all anomaly types on one figure
fig, ax = plt.subplots(figsize=(8, 6))
for anomaly_type in sorted(df_anomalous["anomaly_type"].unique()):
    data = df_anomalous[df_anomalous["anomaly_type"] == anomaly_type]
    ax.hist(data["bbox_area"], bins=bins, alpha=0.5, label=anomaly_type)

ax.set_xlabel("Bounding box area (% of the image)")
ax.set_ylabel("Count")
ax.legend(title="Anomaly type:")

# Grid drawn behind the bars: solid major lines, dotted minor lines
ax.set_axisbelow(True)
ax.minorticks_on()
for which, linestyle in (("major", "-"), ("minor", ":")):
    ax.grid(which=which, linestyle=linestyle, linewidth="0.5", color="grey")

ax.set_xlim(-0.01, 1.01)

plt.show()

In [None]:
def _find_nearest(lst, value):
    """Find nearest value in a list."""
    array = np.asarray(lst)
    idx = (np.abs(array - value)).argmin()
    return array[idx]

In [None]:
# One figure per anomaly type: overlaid bounding box area histograms, one per anomaly grade
for anomaly_type in sorted(df_anomalous["anomaly_type"].unique()):
    data = df_anomalous[df_anomalous["anomaly_type"] == anomaly_type]
    fig, ax = plt.subplots(figsize=(8, 6))
    for anomaly_grade in data["anomaly_grade"].unique():
        data1 = data[data["anomaly_grade"] == anomaly_grade]
        ax.hist(data1["bbox_area"], bins=bins, alpha=0.5, label=anomaly_grade)
    ax.set_xlabel("Bounding box area (% of the image)")
    ax.set_ylabel("Count")
    ax.legend(title="Anomaly grade:")
    ax.set_title(anomaly_type)

    # Grid drawn behind the bars: solid major lines, dotted minor lines
    ax.set_axisbelow(True)
    ax.minorticks_on()
    for which, linestyle in (("major", "-"), ("minor", ":")):
        ax.grid(which=which, linestyle=linestyle, linewidth="0.5", color="grey")

    # Crop the x axis slightly past the bin edge nearest to the largest area of this type
    max_bbox_area = data["bbox_area"].max()
    xlim = _find_nearest(bins, max_bbox_area)
    ax.set_xlim(-0.01, xlim + 0.02)

    plt.show()

In [None]:
# One figure per (anomaly type, anomaly grade): overlaid bounding box area
# histograms, one per cable side
cable_side_ids = ["C01-A", "C01-B", "C02-A", "C02-B", "C03-A", "C03-B"]
for anomaly_type in sorted(df_anomalous["anomaly_type"].unique()):
    data = df_anomalous[df_anomalous["anomaly_type"] == anomaly_type]
    for anomaly_grade in data["anomaly_grade"].unique():
        data1 = data[data["anomaly_grade"] == anomaly_grade]
        fig, ax = plt.subplots(figsize=(8, 6))
        for cable_side in cable_side_ids:
            data2 = data1[data1["cable_side_ids"] == cable_side]
            ax.hist(data2["bbox_area"], bins=bins, alpha=0.5, label=cable_side)
        ax.set_xlabel("Bounding box area (% of the image)")
        ax.set_ylabel("Count")
        ax.legend(title="Cable side ID:")
        ax.set_title(f"{anomaly_type} - {anomaly_grade}")

        # Grid drawn behind the bars: solid major lines, dotted minor lines
        ax.set_axisbelow(True)
        ax.minorticks_on()
        for which, linestyle in (("major", "-"), ("minor", ":")):
            ax.grid(which=which, linestyle=linestyle, linewidth="0.5", color="grey")

        # Crop the x axis slightly past the bin edge nearest to the largest area of this grade
        max_bbox_area = data1["bbox_area"].max()
        xlim = _find_nearest(bins, max_bbox_area)
        ax.set_xlim(-0.01, xlim + 0.02)

        plt.show()

In [None]:
# List the distinct anomaly identifiers and how many there are
unique_ids = df_anomalous["identification"].unique()
identification = unique_ids.tolist()
nb_identifications = len(identification)
print(f"# of unique anomalies: {nb_identifications}\n")
print("Anomalies unique identification:")
print(sorted(identification))

In [None]:
# For each unique anomaly print:
# Id - cable - cable side - passes in which the anomaly appears -
# anomaly type(s) - anomaly grade(s) - number of frames in which the anomaly appears
#
# Notes:
# - An anomaly can have more than one part and thus can be tagged with different types and grades;
# - An anomaly can be seen under different views, which can make it look like it has different grades.
summary_columns = ("cable_id", "side_id", "pass_id", "anomaly_type", "anomaly_grade")
for i in sorted(identification):
    test = df_anomalous[df_anomalous["identification"] == i]
    cable_id, side_id, pass_id, anomaly_type, anomaly_grade = (
        test[column].unique().tolist() for column in summary_columns
    )
    nb_img = len(test["image_path"].unique())
    print(f"Id: {i} - {cable_id} - {side_id} - {pass_id} - {anomaly_type} - {anomaly_grade} - {nb_img}")

In [None]:
# Split the anomaly identifiers on their last two characters: "00" or anything else
nb_created = sum(1 for i in identification if i[-2:] == "00")
nb_already_there = sum(1 for i in identification if i[-2:] != "00")
print(f"# of created anomalies: {nb_created}")
print(f"# of already there anomalies: {nb_already_there}")

In [None]:
# Keep one row per (side, anomaly id, anomaly type, anomaly grade) combination
unique_anomaly_keys = ["side_id", "identification", "anomaly_type", "anomaly_grade"]
unique_anomalies = df_anomalous.drop_duplicates(subset=unique_anomaly_keys)
unique_anomalies.shape

In [None]:
# Bar plot of the number of unique anomalies per anomaly type
# Note that there can be some duplicates since a unique anomaly ID can have more than one type and grade.
# Also, in a few cases the anomaly can appear on both sides of the cable.
anomalies_per_type = unique_anomalies["anomaly_type"].value_counts()
anomalies_per_type.plot(kind="bar", xlabel="Anomaly types", ylabel="# of unique anomalies")
plt.show()

In [None]:
# Bar plot of the number of unique anomalies per anomaly type and cable side
# Note that there can be some duplicates since a unique anomaly ID can have more than one type and grade.
# Also, in a few cases the anomaly can appear on both sides of the cable.
side_counts = unique_anomalies.groupby("anomaly_type")["cable_side_ids"].value_counts()
anomalies_per_type_side = side_counts.unstack()
ax = anomalies_per_type_side.plot(
    kind="bar", width=0.8, xlabel="Anomaly types", ylabel="# of unique anomalies"
)

# Grid drawn behind the bars: solid major lines, dotted minor lines
ax.set_axisbelow(True)
ax.minorticks_on()
for which, linestyle in (("major", "-"), ("minor", ":")):
    ax.grid(which=which, linestyle=linestyle, linewidth="0.5", color="black")

plt.legend(title="Cable side ID")
plt.show()

In [None]:
def add_line(ax, xpos, ypos):
    """Draw a short vertical black segment on ``ax`` in axes-fraction coordinates.

    The segment joins ``(xpos, ypos + 0.1)`` and ``(xpos, ypos)``. Clipping is
    disabled so it remains visible when it lies outside the axes rectangle.
    """
    x_coords = [xpos, xpos]
    y_coords = [ypos + 0.1, ypos]
    segment = plt.Line2D(x_coords, y_coords, transform=ax.transAxes, color="black", linewidth=0.9)
    segment.set_clip_on(False)
    ax.add_line(segment)


font_size = 30
# NOTE: rcParams are global, so this font choice also applies to figures drawn by later cells.
# NOTE(review): "font.serif" is only consulted when "font.family" is the generic
# "serif" family; with a concrete family name it presumably has no effect - confirm intent.
plt.rcParams["font.family"] = "DeJavu Serif"
plt.rcParams["font.serif"] = ["Times New Roman"]

# Bar plot of the number of unique anomalies per anomaly type grade and cable side
# Note that there can be some duplicates since a unique anomaly ID can have more than one type and grade.
# Also, in a few cases the anomaly can appear on both sides of the cable.
#
# The layout below (x limits, separator ticks, type captions) is hard-coded for
# 16 (anomaly type, anomaly grade) bar groups.
ax = (
    unique_anomalies.groupby(["anomaly_type", "anomaly_grade"])
    .cable_side_ids.value_counts()
    .unstack()
    .plot.bar(width=0.8, xlabel="", figsize=(25, 8), fontsize=font_size)
)
ax.set_xlim(-0.5, 15.5)


xtickspos = ax.get_xticks()
# Tick label: upper-cased first letter of the second ", "-separated field of the tick text
xtickslabels_first = [i.get_text().split(", ")[1][0].upper() for i in ax.get_xticklabels()]
plt.xticks(rotation="horizontal", ticks=xtickspos, labels=xtickslabels_first, fontsize=font_size)
plt.ylabel(ylabel="# of unique anomalies", fontsize=font_size)

# Dashed separator between consecutive bar groups.
# ymin / ymax are axes fractions, so 0 -> 1 spans the full height of the plot.
for pos in xtickspos:
    plt.axvline(pos + 0.5, color="red", ymin=0, ymax=1, linestyle="--", alpha=0.3)

# Short ticks under the axis at every group boundary (17 boundaries for 16 groups)
scale = 1.0 / 16
for i in range(17):
    add_line(ax, i * scale, -0.1)

# Longer ticks and captions delimiting each anomaly type (axes-fraction positions)
add_line(ax, 0.0, -0.2)
ax.text(0.0625, -0.25, "Bent \nstrand", ha="center", transform=ax.transAxes, fontsize=font_size)
add_line(ax, 0.125, -0.2)

ax.text(
    0.1875 - 0.5 * (0.1875 - 0.25),
    -0.25,
    "Broken \nstrands",
    ha="center",
    transform=ax.transAxes,
    fontsize=font_size,
)
add_line(ax, 0.3125, -0.2)

ax.text(0.375, -0.25, "Crushed", ha="center", transform=ax.transAxes, fontsize=font_size)
add_line(ax, 0.4375, -0.2)

ax.text(0.5, -0.25, "Deposit", ha="center", transform=ax.transAxes, fontsize=font_size)
add_line(ax, 0.5625, -0.2)

ax.text(0.625, -0.25, "Long \nscratches", ha="center", transform=ax.transAxes, fontsize=font_size)
add_line(ax, 0.6875, -0.2)

ax.text(0.75, -0.25, "Spaced \nstrands", ha="center", transform=ax.transAxes, fontsize=font_size)
add_line(ax, 0.8125, -0.2)

ax.text(
    0.9375 - 0.5 * (1.0 - 0.9375),
    -0.25,
    "Welded \nstrands",
    ha="center",
    transform=ax.transAxes,
    fontsize=font_size,
)
add_line(ax, 1.0, -0.2)

# Horizontal grid only, drawn behind the bars; minor ticks are hidden on the x axis
ax.set_axisbelow(True)
ax.minorticks_on()
ax.tick_params(axis="x", which="minor", bottom=False)
ax.yaxis.grid(which="major", linestyle="-", linewidth="0.5", color="grey")
ax.yaxis.grid(which="minor", linestyle=":", linewidth="0.5", color="grey")

plt.legend(title="Cable side ID", fontsize=font_size, title_fontsize=font_size)

# Uncomment to save figure.
# plt.savefig("anomalies_types_grades.png", bbox_inches="tight")
plt.show()

In [None]:
# Bar plot of the number of unique anomalies per cable, each anomaly id counted once per cable
img_data = df_anomalous[["cable_id", "identification"]].drop_duplicates()
anomalies_per_cable = img_data.groupby("cable_id").size()
anomalies_per_cable.plot(kind="bar")
plt.xticks(rotation=360)
plt.ylabel("# of unique anomalies")
plt.title("# of anomalies per cable")
plt.show()

In [None]:
# Bar plot of the number of unique anomalies per anomaly type
# Note that there can be some duplicates since a unique anomaly ID can have more than one type.
img_data = df_anomalous[["identification", "anomaly_type"]].drop_duplicates()
img_data.groupby("anomaly_type").size().plot.bar()
plt.xticks(rotation=45)
plt.ylabel("# of unique anomalies")
plt.title("# of anomalies per type")
plt.show()

In [None]:
# Plot number of frames with "x" anomalies
# First count the rows of each image path, then count how many paths share each row count
rows_per_frame = Counter(df_anomalous["image_path"].tolist())
D = Counter(rows_per_frame.values())
keys = D.keys()
values = D.values()
plt.bar(keys, values)
plt.ylabel("# of frames")
plt.xlabel("# of anomalies in the frame")
plt.title("# of frames with 'x' anomalies")
plt.show()

In [None]:
# Reduce the table to one row per frame: "localization" columns plus the label index,
# ordered by cable, side, pass and frame.
# Note: this rebinds `df`; the anomaly columns are no longer available from it afterwards.
localization_columns = ["image_path", "cable_id", "side_id", "pass_id", "frame_id", "label_index"]
sort_keys = ["cable_id", "side_id", "pass_id", "frame_id"]
df = df[localization_columns].drop_duplicates()
df = df.sort_values(by=sort_keys)
df.head()

In [None]:
# Group the frames by cable side pass and show each group's label index sequence as a list
group_keys = ["cable_id", "side_id", "pass_id"]
groups = df.groupby(group_keys)
groups["label_index"].apply(list)

In [None]:
# Plot the frames sequence of label index per cable side pass
# The peaks represent anomalous frames

# Build the per-group sequences once instead of on every loop iteration.
# Both Series are indexed by the (cable_id, side_id, pass_id) group key, in the same order.
label_sequences = groups["label_index"].apply(list)
frame_sequences = groups["frame_id"].apply(list)
# Labels come from the same index as the data, so label order and curve order cannot diverge
labels = [f"{cable}-{side}0{pass_id}" for cable, side, pass_id in label_sequences.index]

# One color per cable side, repeated for its 3 passes
# (the layout assumes 6 cable sides x 3 passes = 18 groups).
side_colors = ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple", "tab:brown"]
side_labels = ["C01-A", "C01-B", "C02-A", "C02-B", "C03-A", "C03-B"]
colors = [color for color in side_colors for _ in range(3)]

plt.figure(figsize=(15, 7))
labels_pos = []
for idx, label in enumerate(labels):
    # .iloc: explicit positional access (an integer key on a non-integer index
    # relies on a deprecated positional fallback).
    # Each sequence is drawn on its own row: baseline at idx, anomalous frames at idx + 0.5.
    y = np.array(label_sequences.iloc[idx]) * 0.5 + idx
    x = frame_sequences.iloc[idx]
    plt.plot(x, y, label=label, color=colors[idx])
    labels_pos.append(0.25 + idx)

legend_handles = [mpatches.Patch(color=color, label=side) for color, side in zip(side_colors, side_labels)]
plt.legend(handles=legend_handles, title="Cable side ID")

plt.yticks(labels_pos, labels)
plt.xlabel("Frame ID")
plt.show()

In [None]:
# For each anomaly identification
# Plot the frames sequence of label index per cable side pass
# The peaks represent anomalous frames; black dots mark the frames showing the anomaly
df_ids = df_anomalous[["cable_id", "side_id", "pass_id", "frame_id", "identification"]].drop_duplicates()

# The per-group sequences do not depend on the anomaly: compute them once.
# Both Series are indexed by the (cable_id, side_id, pass_id) group key, in the same order.
label_sequences = groups["label_index"].apply(list)
frame_sequences = groups["frame_id"].apply(list)
group_keys = list(label_sequences.index)
labels = [f"{cable}-{side}0{pass_id}" for cable, side, pass_id in group_keys]

# One color per cable side, repeated for its 3 passes
# (the layout assumes 6 cable sides x 3 passes = 18 groups).
side_colors = ["tab:blue", "tab:orange", "tab:green", "tab:red", "tab:purple", "tab:brown"]
side_labels = ["C01-A", "C01-B", "C02-A", "C02-B", "C03-A", "C03-B"]
colors = [color for color in side_colors for _ in range(3)]
legend_handles = [mpatches.Patch(color=color, label=side) for color, side in zip(side_colors, side_labels)]

# The loop variable is deliberately not called `identification`, so the list of
# anomaly ids built earlier in the notebook is not overwritten.
for anomaly_identification in sorted(df_ids["identification"].unique()):
    df_anomaly = df_ids[df_ids["identification"] == anomaly_identification]

    plt.figure(figsize=(15, 7))
    labels_pos = []
    for idx, (cable, side, pass_id) in enumerate(group_keys):
        # .iloc: explicit positional access (an integer key on a non-integer index
        # relies on a deprecated positional fallback).
        y = np.array(label_sequences.iloc[idx]) * 0.5 + idx
        x = frame_sequences.iloc[idx]
        plt.plot(x, y, label=labels[idx], color=colors[idx])
        labels_pos.append(0.25 + idx)

        # Match on the group key itself rather than on characters sliced out of the label
        in_group = (
            (df_anomaly["cable_id"] == cable) & (df_anomaly["side_id"] == side) & (df_anomaly["pass_id"] == pass_id)
        )
        anomaly_id = list(df_anomaly.loc[in_group, "frame_id"])
        if anomaly_id:
            plt.scatter(anomaly_id, [0.5 + idx] * len(anomaly_id), color="black", alpha=0.3)

    plt.legend(handles=legend_handles, title="Cable side ID")

    plt.yticks(labels_pos, labels)
    plt.xlabel("Frame ID")
    plt.title(f"Anomaly id: {anomaly_identification}")
    plt.show()