In [1]:
#| default_exp metadata
# all_slow

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/marcomatteo/steel_segmentation/blob/master/nbs/01_metadata.ipynb)

In [2]:
#| include: false
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
#| include: false
from nbdev.showdoc import *

In [4]:
#| export
import warnings
warnings.filterwarnings("ignore")

from fastai.vision.all import *
import pandas as pd
import os

## Data structure

In [5]:
#| exports
curr_dir_name = Path(os.getcwd()).name
path = Path('data/')

if not path.is_dir():
    path = Path("../data/")

The `path` variable is a `os.pathlib.Path` object that points to the competition data.

To print all the files this directory use the `print_competition_data` function.

In [6]:
#| export
def print_competition_data(p: Path = path):
    for elem in p.ls():
        print(elem)

In [7]:
print_competition_data()

FileNotFoundError: [Errno 2] No such file or directory: '../data'

The competition files: 

- **train_images/** - folder of training images (12.5k images)
- **test_images/** - folder of test images to segment and classify (5506 images)
- **train.csv** - training annotations which provide segments for defects (`ClassId = [1, 2, 3, 4]`)
- **sample_submission.csv** - a sample submission file in the correct format; note, each `ImageId` 4 rows, one for each of the 4 defect classes


In [None]:
#| exports
# train images
train_path = path/"train_images"
train_pfiles = get_image_files(train_path) if train_path.is_dir() else L([])
# test images
test_path = path/"test_images"
test_pfiles = get_image_files(test_path) if train_path.is_dir() else L([])
# dl models
models_dir = path.parent / "models"
# mask predictions
pred_path = path / "predictions"
pred_path.mkdir(parents=True, exist_ok=True)
# submissions
sub_path = path / "submissions"
sub_path.mkdir(parents=True, exist_ok=True)

## Training metadata

The training data includes:

- faulty images: images that have at least one defect

- hard negative images: images with no defects

In [None]:
train_pfiles

In [None]:
#| export
def get_train_df(only_faulty=False):
    """
    Get training DataFrame with all the images in data/train_images.
    Returns only the faulty images if `only_faulty`.
    """
    train = pd.read_csv(path/"train.csv")
    train["ImageId_ClassId"] = train["ImageId"] + \
        "_" + train["ClassId"].astype('str')
    if only_faulty:
        return train

    img_names = [img_path.name for img_path in train_pfiles]
    df_all = pd.DataFrame({'ImageId': img_names})
    train_all = pd.merge(df_all, train, on="ImageId",
                         how="outer", indicator=True)
    # Renaming and fillna
    train_all.rename(columns={'_merge': 'status'}, inplace=True)
    rename_dict = {"both": "faulty", "left_only": "no_faulty"}
    train_all["status"] = train_all["status"].cat.rename_categories(
        rename_dict)
    train_all.ClassId.fillna(0, inplace=True)
    train_all.ClassId = train_all.ClassId.astype('int64')
    train_all.EncodedPixels.fillna(-1, inplace=True)

    return train_all

The `get_train_df` function returns the DataFrame from the `train.csv` file, only faulty image names if `only_faulty`, with the training images metadata:
- **ImageId**: image name

- **ClassId**: the class type

- **EncodedPixels**: the encoded pixels follows a run-length encoding rule, a sequence of pair values that contains a start position and a run length with the space as the delimiter. E.g. `1 3 10 5` means pixels `(1,2,3)` and `(10,11,12,13,14)`.  

Each Image may have no defects, a single defect, or multiple defects.

In [None]:
#| export
class SteelMeta:
    
    def __init__(self): pass
    
    @property
    def hard_neg_with_patterns(self):
        try:
            df = pd.read_csv(path/"hard_negatives_patterns.txt", 
                             header=None, names=["ImageId"])
        except:
            df = pd.DataFrame()
        return df
    
    @property
    def train(self):
        try:
            df = get_train_df(only_faulty=True)
        except:
            df = pd.DataFrame()
        return df
    
    @property
    def train_all(self):
        try:
            df = get_train_df()
        except:
            df = pd.DataFrame()
        return df
        
    @property
    def train_pivot(self):
        def get_train_pivot():
            """
            Summarize the training csv with ClassId as columns and values EncodedPixels
            """
            train_pivot = self.train_all.pivot(
                index="ImageId", columns="ClassId", values="EncodedPixels")
            train_pivot["n"] = train_pivot.notnull().sum(1)
            return train_pivot
        
        try:
            df = get_train_pivot()
        except:
            df = pd.DataFrame()
        return df
    
    @property
    def train_multi(self):
        def get_classification_df():
            """
            Get the DataFrame for the multiclass classification model
            """

            def assign_multi_ClassId(x):
                """Returns a string with multi ClassId sep with a blank space (' ')"""
                def fill_cols(c):
                    return c.fillna(5).astype('int64').astype(str)

                cols = [fill_cols(x[i]) for i in range(5)]
                cols = [col.replace('5', '') for col in cols]
                ClassId_multi = cols[0] + " " + cols[1] + " " + \
                    cols[2] + " " + cols[3] + " " + cols[4]
                ClassId_multi = ClassId_multi.str.strip()
                ClassId_multi = ClassId_multi.str.replace('  ', ' ')

                return ClassId_multi.str.strip()

            train_multi = self.train_all.pivot(
                index="ImageId", columns="ClassId", values="ClassId")
            train_multi = train_multi.assign(
                ClassId_multi=lambda x: assign_multi_ClassId(x))
            return train_multi.reset_index()[["ImageId", "ClassId_multi"]]
        try:
            df = get_classification_df()
        except:
            df = pd.DataFrame()
        return df
    
    @property
    def test_df(self):
        try:
            df = pd.read_csv(path / 'sample_submission.csv')
        except:
            df = pd.DataFrame()
        return df

steel_meta = SteelMeta()

### Hard negatives with patterns

In [None]:
#| export
hard_neg_patterns = steel_meta.hard_neg_with_patterns

In [None]:
hard_neg_patterns.head()

In `pattern_img_list` there are the `ImageId`s of training images with some patterns (from this [kernel](https://www.kaggle.com/ateplyuk/detector-steels-with-texture/data)).

### Faulty images

In [None]:
#| exports
train = steel_meta.train

Only the faulty images are listed in `train` with the defected pixels encoded.

In [None]:
assert isinstance(train, pd.DataFrame)

In [None]:
print(train.shape)
train.head(2)

Every row of the DataFrame is a unique `(ImageId, ClassId)`.

In [None]:
assert not train.ImageId_ClassId.duplicated().any()

In [None]:
train.dtypes

In [None]:
#| include: false
def count_pct(df, column="ClassId"):
    class_count = df[column].value_counts().sort_index()
    class_count.index.set_names(column, inplace=True)
    class_count = class_count.to_frame()
    class_count.rename(columns={column: "num"}, inplace=True)
    return class_count.assign(freq=lambda df: df["num"] / df["num"].sum())    

The `ClassId` column values from `train` are:

In [None]:
class_count = count_pct(train)
class_count

In [None]:
class_count["num"].plot.bar(title="Defects by ClassId count");

In [None]:
counts = train["ImageId"].value_counts()
hist_counts, _ = np.histogram(counts.values, bins=3)
nums = ['1', '2', '3']
plt.bar(x=nums, height=hist_counts)
plt.title("Num of defects per images")
plt.show()

{i+1: c for i, c in enumerate(hist_counts)}

Images have at least one defect and there's a small number of images with two or three defects. 

In [None]:
print("Multiple defects:")
imgs_class_agg = train.groupby("ImageId").apply(
    lambda x: x["ClassId"].values.tolist())

pairs = imgs_class_agg[imgs_class_agg.map(
    lambda v: len(v) == 2)].values.tolist()
triples = imgs_class_agg[imgs_class_agg.map(
    lambda v: len(v) == 3)].values.tolist()

p = Counter(map(lambda l: (l[0], l[1]), pairs))
t = Counter(map(lambda l: (l[0], l[1], l[2]), triples))

print("\n".join(
    [f"ClassId {ids} counts {count} images" for ids, count in p.most_common()]))
print("\n".join(
    [f"ClassId {ids} counts {count} images" for ids, count in t.most_common()]))

As we can see below, in `train` there're 7095 defects in 6666 images but there're 12568 images in `train_pfiles`. 

In [None]:
unique_imgs = train.describe(include='all')["ImageId"].T[:2]
unique_imgs

In [None]:
assert unique_imgs[0] == 7095
assert unique_imgs[1] == 6666

### All training images

In [None]:
#| exports
train_all = steel_meta.train_all

In `train_all` are stored all the training images in `train` plus the hard negatives. The latter have:
- `EncodedPixels`: with -1, 
- `ClassId`: with 0 class,
- `status`: with "no_faulty" string

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(7, 6.5))
ax.set_title("Count imgs", pad=30, fontdict={'fontsize': 14})
ax.xaxis.tick_top()  # Display x-axis ticks on top

(
    train_all[['ImageId', 'status']]
    .drop_duplicates()
    .status.value_counts().iloc[:-1]
    .plot.barh(table=True, ax=ax)
)
plt.show()

In [None]:
train_all.dtypes

In [None]:
train_all.describe(include='all')[:4]

In [None]:
count_pct(train_all)

## Train transforms

Loading the images for models requires some transformations to the `train` or `train_all` DataFrames.

### Pivot RLE encodings

The `get_train_pivot` is the pivoted version of `train_all`. All the images are in the index, for each image the `ClassId` encoding values are in the columns.

In [None]:
#| exports
train_pivot = steel_meta.train_pivot

In [None]:
train_pivot.head(2)

In [None]:
count_pct(train_pivot, column='n')

### Multi class defects

In [None]:
#| export
def get_classification_df(df: pd.DataFrame = None):
    """
    Get the DataFrame for the multiclass classification model
    """

    def assign_multi_ClassId(x):
        """Returns a string with multi ClassId sep with a blank space (' ')"""
        def fill_cols(c):
            return c.fillna(5).astype('int64').astype(str)

        cols = [fill_cols(x[i]) for i in range(5)]
        cols = [col.replace('5', '') for col in cols]
        ClassId_multi = cols[0] + " " + cols[1] + " " + \
            cols[2] + " " + cols[3] + " " + cols[4]
        ClassId_multi = ClassId_multi.str.strip()
        ClassId_multi = ClassId_multi.str.replace('  ', ' ')

        return ClassId_multi.str.strip()

    if not df:
        df = train_all
    train_multi = df.pivot(
        index="ImageId", columns="ClassId", values="ClassId")
    train_multi = train_multi.assign(
        ClassId_multi=lambda x: assign_multi_ClassId(x))
    return train_multi.reset_index()[["ImageId", "ClassId_multi"]]

The `get_classification_df` allows to build a DataFrame to classification models. In `ClassId_multi` are listed the `ClassId`s separated by a space.

In [None]:
#| exports
train_multi = steel_meta.train_multi

In [None]:
train_multi.head()

In [None]:
count_pct(train_multi, column='ClassId_multi').sort_values("freq", ascending=False)

In [None]:
train_multi.describe(include='all')

## Test data

In [None]:
test_pfiles

In [None]:
#| exports
test_df = steel_meta.test_df

In [None]:
test_df.head()

In [None]:
rows, cols = test_df.shape
test_eq(rows, 5506)
test_eq(cols, 3)

In [None]:
#| include: false
from nbdev import nbdev_export
nbdev_export()