In [None]:
!pip install pandas

In [None]:
import os
from functools import lru_cache

import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
!pip install seaborn

In [None]:
# Load the training annotations (later cells use the columns ImageId, ClassId and EncodedPixels)
df = pd.read_csv('data/train.csv')

---

### Expansion of Variables for EDA

In the following we expand our data frame to generate additional insights about our data. Looking at the `train.csv` file, it becomes clear that there is a separate line for every defect (`ClassId`). This also means that if there is more than one defect on an image, there will be separate lines for the respective `ImageId`.

In [None]:
# Count occurrences of `ImageId` in df.
# `value_counts()` is computed once and mapped onto the rows; calling it inside
# a row-wise `apply` would rescan the whole column for every single row.
df['count'] = df['ImageId'].map(df['ImageId'].value_counts())

In [None]:
df.head(7)

The `EncodedPixels` are encoded in the following way:

- columns are scanned from top to bottom (256 pixels per column)
- pixels are numbered from 1 to 409,600 (256 x 1600)
- `EncodedPixels` is a sequence of number pairs: the first number of each pair is the pixel at which a run of defect pixels starts
- the second number of each pair is the number of consecutive pixels that belong to that run

In the following we look at the size of the defects in an image, measured as the total number of pixels that belong to the `ClassId` per `ImageId`.

In [None]:
def count_defect_pixels(EncodedPixels):
    """Return the total number of defect pixels described by a run-length string.

    `EncodedPixels` is a whitespace-separated sequence of alternating
    ``start length`` values; only the lengths (every second value) are summed.

    Parameters
    ----------
    EncodedPixels : str
        Run-length encoded mask, e.g. ``"1 3 10 5"``.

    Returns
    -------
    int
        Sum of all run lengths. ``0`` for an empty string or for a non-string
        value such as ``None`` or the float ``NaN`` of a missing cell.

    Raises
    ------
    ValueError
        If a run-length token cannot be parsed as an integer.
    """
    # Missing cells are not strings and therefore have no `.split()`.
    if not isinstance(EncodedPixels, str):
        return 0

    # Tokens at odd positions (1, 3, 5, ...) are the run lengths.
    run_lengths = EncodedPixels.split()[1::2]
    return sum(int(length) for length in run_lengths)

In [None]:
# Total defect area in pixels for every row (one row = one defect class on one image)
df['DefectSizePixel'] = df['EncodedPixels'].apply(count_defect_pixels)
df.head()

In [None]:
# Average defect size in pixels per defect class
df.groupby('ClassId', as_index=False)['DefectSizePixel'].mean()

It becomes clear that, on average, defects of `ClassId` 3 and 4 are much bigger in size.

In [None]:
!pip3 install opencv-python

In [None]:
pip install --upgrade pip

In [None]:
# Calculate fraction of defect relative to the picture size
import cv2

@lru_cache(maxsize=None)
def _read_image_shape(image, image_dir='data/train_images/'):
    """Read an image file and return its ``shape`` tuple (height, width, depth).

    The shape is cached per ``(image, image_dir)`` so that querying height,
    width and depth of the same file decodes the image only once.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read the file (missing, unreadable or not an image).
    """
    path = os.path.join(image_dir, image)
    pixels = cv2.imread(path)
    # cv2.imread signals failure by returning None rather than raising.
    if pixels is None:
        raise FileNotFoundError(f"Could not read image '{path}'")
    return pixels.shape


def get_image_height(image, image_dir='data/train_images/'):
    """Return the height (``shape[0]``) of the image file `image` in `image_dir`."""
    return _read_image_shape(image, image_dir)[0]


def get_image_width(image, image_dir='data/train_images/'):
    """Return the width (``shape[1]``) of the image file `image` in `image_dir`."""
    return _read_image_shape(image, image_dir)[1]


def get_image_depth(image, image_dir='data/train_images/'):
    """Return the depth (``shape[2]``) of the image file `image` in `image_dir`."""
    return _read_image_shape(image, image_dir)[2]

In [None]:
# Add the image dimensions (`Height`, `Width`, `Depth`) for every row
df['Height'] = df['ImageId'].apply(get_image_height)
df['Width'] = df['ImageId'].apply(get_image_width)
df['Depth'] = df['ImageId'].apply(get_image_depth)

# Fraction of the image area (height x width) that the defect covers
df['DefectSizePixelRelative'] = df['DefectSizePixel'] / (df['Height'] * df['Width'])

In [None]:
df.head()

In [None]:
# Check whether we have different dimensions for pictures
df.describe()

In [None]:
# Average share of the image covered by a defect, per defect class
df.groupby('ClassId', as_index=False)['DefectSizePixelRelative'].mean()

Relative defect size on average reaches up to 8.4% for defect class 4. As we can see above, however, the maximum coverage of a defect may reach **up to 89.9%** of the respective image.

Let's have a look at TOP5 and BOTTOM5 for `DefectSizePixelRelative`:

In [None]:
# Five largest defects among those that cover more than 70 % of their image
TOP5 = (
    df.query('DefectSizePixelRelative > 0.7')
    .loc[:, ['ImageId', 'ClassId', 'DefectSizePixelRelative', 'count']]
    .sort_values(by='DefectSizePixelRelative', ascending=False)
    .head(5)
)

# keep the image names of the TOP5 for later use
TOP5_names = TOP5['ImageId'].tolist()

In [None]:
# Cross-check whether TOP5 belongs to single defects or if there are mixed defects as well.
# Only the two needed columns are aggregated. Every row of an image already
# carries that image's row count in `count`, so it is taken once ('first');
# summing it would report the count squared (e.g. 4 for an image with 2 defects).
(
    df.groupby('ImageId')
    .agg({'DefectSizePixelRelative': 'sum', 'count': 'first'})
    .reset_index()
    .sort_values(by='DefectSizePixelRelative', ascending=False)[:30]
)

We see that mostly defects of a single `ClassId` reach high scores. There is one interesting case in the TOP30 where more than one `ClassId` contributed to a combined relative defect size of **0.586** (i.e. 58.6% of the image). The `ImageId` is "fb9558035.jpg".

In [None]:
# Rows of images whose `ImageId` occurs more than once in df
df_duplicates = df[df['count'] > 1]
df_duplicates.info()

In [None]:
df.groupby('ClassId').count()

--> there is a strong **imbalance** in the representation of the different defect types (share of all images that show the respective defect):
- error 1 = 13.5 %
- error 2 = 3.7 %
- error 3 = 77.3 %
- error 4 = 12.0 %

In [None]:
# Dimension of our pictures
1600*256

In [None]:
print(f"The training data has {df.ImageId.nunique()} unique image IDs")

### To-Dos Daniela

In [None]:
# For every image with more than one row, add up the ClassIds of its rows
df_classes = (
    df_duplicates.groupby('ImageId')['ClassId']
    .sum()
    .reset_index(name='SumClasses')
)
df_classes.head()

In [None]:
# Distribution of the summed ClassIds per multi-defect image.
# The plotted column is named explicitly instead of passing the whole frame in
# wide form, and `discrete=True` gives every integer sum its own bar.
ax = sns.histplot(data=df_classes, x='SumClasses', discrete=True)
ax.set(xlabel='Sum of ClassIds per image', ylabel='Number of images');

In [None]:
# Split the multi-defect rows: `count` strictly between 1 and 3, and `count` above 2
df_duplicates2 = df[(df['count'] > 1) & (df['count'] < 3)]
df_duplicates3 = df[df['count'] > 2]
df_duplicates2.info()
df_duplicates3.info()

In [None]:
# Pivot the 2-class images: one row per ImageId, one column per ClassId,
# each cell holding the number of rows for that pair (0 where there is none)
pivot1 = df_duplicates2.pivot_table(
    index='ImageId',
    columns='ClassId',
    values='count',
    aggfunc='count',
    fill_value=0,
)
pivot1

In [None]:
# Attach the per-class columns of pivot1 to the summed ClassIds (inner join on ImageId)
result2 = df_classes.merge(pivot1, how='inner', on="ImageId")
result2.head()

In [None]:
# create interaction matrix for defect classes (2 classes per Image)
# The class columns are taken from `pivot1` rather than a hard-coded list, so the
# cell does not raise a KeyError when a class is absent from this subset.
class_labels = {class_id: str(class_id) for class_id in pivot1.columns}
result2.rename(columns=class_labels, inplace=True)
temp = result2[list(class_labels.values())]
# Entry (i, j) of temp.T.dot(temp) sums, over all images, the product of the
# per-image values of class i and class j.
a = temp.T.dot(temp)
a

In [None]:
# Pivot the images from df_duplicates3: one row per ImageId, one column per ClassId,
# each cell holding the number of rows for that pair (0 where there is none)
pivot2 = df_duplicates3.pivot_table(
    index='ImageId',
    columns='ClassId',
    values='count',
    aggfunc='count',
    fill_value=0,
)
pivot2

In [None]:
# Attach the per-class columns of pivot2 to the summed ClassIds (inner join on ImageId)
result3 = df_classes.merge(pivot2, how='inner', on="ImageId")
result3.head()

In [None]:
# create interaction matrix for defect classes (3 classes per Image)
# The class columns are taken from `pivot2` rather than a hard-coded '1'..'3', so a
# class 4 occurring in this subset is not silently dropped and an absent class
# does not raise a KeyError.
class_labels = {class_id: str(class_id) for class_id in pivot2.columns}
result3.rename(columns=class_labels, inplace=True)
temp = result3[list(class_labels.values())]
# Entry (i, j) of temp.T.dot(temp) sums, over all images, the product of the
# per-image values of class i and class j.
a = temp.T.dot(temp)
a