In [None]:
import fnmatch
import os
import shutil
from collections import Counter
from fractions import Fraction
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
from PIL import Image

In [None]:
# Root folder that receives the exported CSV and one sub-folder of images per damage label.
BASE_WORKDIR = Path("work_dataset")

In [None]:
# Cap the number of rows pandas shows when a frame or series is displayed.
pd.set_option("display.max_rows", 20)

### Read the file

In [None]:
# Load the source workbook; the path is relative to the notebook's working directory.
data = pd.read_excel("bmtfiles/Zestawienie ekspertyz 8.xlsx")

In [None]:
data.head()

In [None]:
# Narrow the frame to the four columns of interest.
data = data[["ExpertiseId", "Element", "Damage", "ImagePath"]]

In [None]:
data.head()

### Unique elements of car parts that can be analyzed

In [None]:
data["Element"].unique()

In [None]:
data["Element"].value_counts()

In [None]:
# Car parts (values of the "Element" column) to keep; also used to build the CSV file name.
ELEMENT = ["zderzak", "maska silnika"]

In [None]:
# Rows whose "Element" is one of the selected car parts.
filtered = data[data["Element"].isin(ELEMENT)]

In [None]:
filtered.head()

In [None]:
# Column names handed to the helper functions further down.
filter_col = "Damage"  # damage label; its values become the per-class folder names
image_col = "ImagePath"  # path of the source image file

### Types of damage

In [None]:
filtered[["Element", "Damage"]].value_counts()

### Get damage class

In [None]:
def get_damage_class(df: DataFrame, col: str, damages: list[str]) -> DataFrame:
    """Return the rows of ``df`` whose ``col`` value is one of ``damages``.

    Args:
        df: Frame to filter.
        col: Name of the column holding the damage label.
        damages: Labels to keep.

    Returns:
        A new frame with only the matching rows; the original index is kept.
    """
    # Filter the frame and column that were passed in, not notebook globals,
    # so the function works for any input and after a kernel restart.
    return df[df[col].isin(damages)]

In [None]:
# Damage labels (values of the "Damage" column) that form the dataset classes.
DAMAGES = ["ubytek lakieru", "uszkodzenie", "wżery na lakierze", "wgniecenie 1-2"]

In [None]:
# Restrict the selected car parts to the chosen damage labels.
df = get_damage_class(filtered, filter_col, DAMAGES)

In [None]:
df.head()

In [None]:
df.shape

#### Save dmg info to csv

In [None]:
# A single file name is limited to 255 bytes/characters on Windows, macOS and
# Linux. The limit applies to the whole name, so the stem (elements + damages)
# is truncated to leave room for the extension. Truncation is done on the UTF-8
# bytes because non-ASCII letters take more than one byte; errors="ignore"
# drops a character that would otherwise be cut in half.
MAX_FILENAME_BYTES = 255
CSV_SUFFIX = ".csv"

csv_stem = "_".join(ELEMENT) + "-" + "_".join(DAMAGES)
csv_stem = csv_stem.encode("utf-8")[: MAX_FILENAME_BYTES - len(CSV_SUFFIX)].decode("utf-8", errors="ignore")
csv_name = csv_stem + CSV_SUFFIX

In [None]:
csv_name

In [None]:
def save_dmg_to_csv(df: DataFrame, workdir: Path, csv_name: str):
    """Write ``df`` as a CSV file named ``csv_name`` inside ``workdir``.

    ``workdir`` (including missing parents) is created first if it does not
    exist. The frame's index is written as the first column.

    Args:
        df: Frame to export.
        workdir: Destination directory.
        csv_name: File name of the CSV inside ``workdir``.
    """
    os.makedirs(workdir, exist_ok=True)
    destination = f"{workdir}/{csv_name}"
    df.to_csv(destination)

In [None]:
# Write the selected rows to BASE_WORKDIR so the copy step below can read them back.
save_dmg_to_csv(df, BASE_WORKDIR, csv_name)

### Read csv, extract filepaths and save photos to work_dataset

In [None]:
def prepare_dataset_folders(workdir: Path, csv_name: str, img_col: str, filter_col: str):
    """Copy the images listed in a CSV into one sub-folder per class.

    Reads ``workdir/csv_name``, creates ``workdir/<class>`` for every distinct
    value of ``filter_col`` and copies each file referenced in ``img_col`` into
    the folder of its class. A copy is named
    ``<name of the source's parent folder>_<source file name>``, which keeps
    equally named files from different source folders apart.

    Rows with a missing class or a missing image path are skipped. A file that
    cannot be copied is reported with ``print`` and does not stop the run.

    Args:
        workdir: Directory holding the CSV; class folders are created inside it.
        csv_name: File name of the CSV inside ``workdir``.
        img_col: CSV column with the source image paths.
        filter_col: CSV column with the class label used as folder name.
    """
    work_csv = pd.read_csv(workdir.joinpath(csv_name))

    # dropna(): an empty label is read back as NaN and cannot be a folder name.
    for folder in work_csv[filter_col].dropna().unique():
        # str(): labels that look numeric are parsed as numbers by read_csv.
        dmg_folder = workdir.joinpath(str(folder))
        dmg_folder.mkdir(parents=True, exist_ok=True)

        class_images = work_csv.loc[work_csv[filter_col] == folder, img_col].dropna()

        for f in class_images:
            src = Path(str(f))

            # parent.name is "" for a bare file name or a file directly under
            # the filesystem root; in that case no prefix is added instead of
            # failing or producing an absolute destination.
            src_fldr = src.parent.name
            dest_name = f"{src_fldr}_{src.name}" if src_fldr else src.name
            dest = dmg_folder.joinpath(dest_name)

            try:
                shutil.copy(src, dest)
            except (OSError, ValueError) as e:
                # Best effort: missing/unreadable sources are expected, report and go on.
                print(e)

In [None]:
# Create one folder per damage label under BASE_WORKDIR and copy the listed images into it.
prepare_dataset_folders(BASE_WORKDIR, csv_name, image_col, filter_col)

In [None]:
# One folder path per damage label, in the order of DAMAGES.
dmg_paths = [BASE_WORKDIR / dmg for dmg in DAMAGES]

#### Difference between theoretically available files and physically copied ones:

In [None]:
def count_types_and_other(workdir: Path, pattern: str):
    """Count files under ``workdir`` that match / do not match a name pattern.

    Walks ``workdir`` recursively and tests every regular file's name against
    ``pattern`` with ``fnmatch``. Directories are ignored, so they are neither
    counted nor returned. Prints the directory and both counts.

    Args:
        workdir: Directory to scan recursively.
        pattern: Shell-style file-name pattern, e.g. ``"*.jp*g"``.

    Returns:
        List of paths of the files whose name does not match ``pattern``.
    """
    match = []
    not_match = []

    # Single pass so that every file lands in exactly one of the two lists.
    for path in workdir.rglob("*"):
        if not path.is_file():
            continue
        if fnmatch.fnmatch(path.name, pattern):
            match.append(path)
        else:
            not_match.append(path)

    print(f"{workdir}")
    print(f"Number of {pattern}: ", len(match))
    print(f"Number of not {pattern}: ", len(not_match))
    print()
    return not_match

In [None]:
# Report, per damage folder, how many entries match the JPEG name pattern;
# the returned list is not needed at this point.
for dmg in dmg_paths:
    _ = count_types_and_other(dmg, "*.jp*g")

In [None]:
def diff_avail_copied(df: DataFrame, folder: str):
    """Print how many rows ``df`` has versus how many entries ``folder`` holds.

    A folder that does not exist is treated as holding zero entries, which is
    the case for a class that had no rows and therefore no folder.

    Args:
        df: Rows of one class (one row per expected file).
        folder: Directory the files of that class were copied to.

    Returns:
        ``len(df)`` minus the number of directory entries in ``folder``.
    """
    loc = len(df)
    try:
        lwd = len(os.listdir(folder))
    except FileNotFoundError:
        lwd = 0

    print("number of files in class: ", loc)
    print("number of files copied: ", lwd)
    print("Difference: ", loc - lwd)
    print()
    return loc - lwd

In [None]:
# Compare, per damage label, the number of rows in df with the number of entries in its folder.
for dmg in DAMAGES:
    diff_avail_copied(df[df[filter_col] == dmg], BASE_WORKDIR.joinpath(dmg))

### Check files in all work_dataset folders

#### Delete non-JPEG files

In [None]:
def delete_bad_types(workdir: Path, type_header: bytes, header_len: int = 10):
    """Delete every file under ``workdir`` whose header lacks ``type_header``.

    Each regular file found recursively is opened in binary mode and its first
    ``max(header_len, len(type_header))`` bytes are searched for
    ``type_header``. Files without the marker are removed. Directories are
    left untouched. Prints ``workdir`` and the number of deleted files.

    Args:
        workdir: Directory to clean recursively.
        type_header: Byte marker expected in the file header, e.g. ``b"JFIF"``.
        header_len: Number of leading bytes to inspect.

    Returns:
        Number of files that were deleted.
    """
    num_deleted = 0
    read_len = max(header_len, len(type_header))

    # Materialise the listing first so deleting does not disturb the traversal.
    for path in list(workdir.rglob("*")):
        if not path.is_file():
            # open() on a directory raises; only regular files are checked.
            continue
        with open(path, "rb") as file:
            # read() returns at most read_len bytes; peek() may hand back a whole
            # buffer, which would match the marker far beyond the header.
            is_type = type_header in file.read(read_len)
        if not is_type:
            num_deleted += 1
            os.remove(path)

    print(f"{workdir} ", num_deleted)
    return num_deleted

In [None]:
# Delete, in every damage folder, the files whose inspected leading bytes lack the b"JFIF" marker.
# NOTE(review): JPEG files without a JFIF marker (e.g. Exif-only camera files) would be
# deleted as well — confirm this is intended.
for dmg in dmg_paths:
    delete_bad_types(dmg, b"JFIF")

#### Count how many we have so far

In [None]:
# Gather, across all damage folders, the paths that do not match the JPEG name pattern.
not_jpgs = [
    path
    for dmg in dmg_paths
    for path in count_types_and_other(dmg, "*.jp*g")
]

#### Add proper suffix

In [None]:
def add_suffix(workdirs: list[Path], suffix: str):
    """Rename each given file so that it carries ``suffix``.

    An existing suffix is replaced (``a.png`` -> ``a.jpg``); a name without
    suffix gets one appended (``b`` -> ``b.jpg``). A file is skipped, with a
    printed message, when a different file already exists under the new name,
    because ``os.rename`` would silently overwrite it on POSIX systems.

    Args:
        workdirs: Paths of the files to rename.
        suffix: New suffix including the leading dot, e.g. ``".jpg"``.
    """
    for f in workdirs:
        target = f.with_suffix(suffix)
        if target == f:
            # Already has the requested suffix.
            continue
        # samefile() lets a pure case change through on case-insensitive filesystems.
        if target.exists() and not target.samefile(f):
            print(f"Skipping {f}: {target} already exists")
            continue
        os.rename(f, target)

In [None]:
# Give every path collected in not_jpgs a ".jpg" suffix (an existing suffix is replaced).
add_suffix(not_jpgs, ".jpg")

#### Verify using PIL.Image

In [None]:
def verify_images(directory):
    """Run Pillow's integrity check on every entry of ``directory``.

    Each name returned by ``os.listdir`` is opened with ``Image.open`` and
    checked with ``verify()``. Any exception is caught and reported with
    ``print``; nothing is returned.
    """
    for entry in os.listdir(directory):
        candidate = os.path.join(directory, entry)
        try:
            with Image.open(candidate) as picture:
                picture.verify()
        except Exception as err:
            print(f"Error in {entry}: {err}")

#### Convert to proper RGB

In [None]:
def convert_images(directory):
    """Convert every non-RGB image in ``directory`` to RGB JPEG, in place.

    Each entry returned by ``os.listdir`` is opened with Pillow. Images whose
    mode is not ``"RGB"`` are converted and written back to the same path as
    JPEG. Failures are reported with ``print`` and counted; they do not stop
    the run. Finally prints ``directory`` and the set of modes encountered.

    Args:
        directory: Folder whose entries are processed (not recursive).

    Returns:
        Number of entries that could not be opened or converted.
    """
    err_cnt = 0
    modes = set()
    for filename in os.listdir(directory):
        path = os.path.join(directory, filename)
        try:
            # The context manager closes the source file handle before the
            # file is overwritten below.
            with Image.open(path) as img:
                modes.add(img.mode)
                rgb_img = img.convert("RGB") if img.mode != "RGB" else None
            if rgb_img is not None:
                # Pillow registers the format as "JPEG"; "jpg" is not a known
                # format name and made every save fail.
                rgb_img.save(path, format="JPEG")
        except Exception as e:
            # Best effort: anything Pillow cannot handle is reported and skipped.
            err_cnt += 1
            print(f"Cannot convert {filename}: {e}")
    print(directory, modes)
    return err_cnt

In [None]:
# Print a message for each entry that Pillow fails to open or verify.
for dmg in dmg_paths:
    verify_images(dmg)

In [None]:
# Attempt to convert non-RGB images to RGB; prints the set of image modes seen per folder.
for dmg in dmg_paths:
    convert_images(dmg)

### Visualize image properties: resolution and aspect ratio

In [None]:
# Tally image resolutions (width, height) and reduced aspect ratios of the copied images.
sizes_dict = Counter()
ratio_dict = Counter()

# "*.jp*g" is the name pattern used for the counts earlier in the notebook;
# unlike "*.jpg" it also covers files that kept a ".jpeg" suffix.
for j in BASE_WORKDIR.rglob("*.jp*g"):
    try:
        # Only the header is needed for the size; the context manager closes
        # the file handle so that many images do not exhaust open files.
        with Image.open(j) as img:
            w, h = img.size
    except OSError as e:
        # Unreadable image: report it and keep going with the rest.
        print(f"Cannot read {j}: {e}")
        continue
    img_size = (w, h)
    sizes_dict[img_size] += 1
    # Fraction reduces w/h, so e.g. 1920x1080 and 1280x720 both count as (16, 9).
    ratio = Fraction(w, h).as_integer_ratio()
    ratio_dict[ratio] += 1

In [None]:
# One line per distinct (width, height) with the number of images of that size.
for k, v in sizes_dict.items():
    print(f"{k}: {v}")

In [None]:
# One line per distinct reduced aspect ratio with the number of images having it.
for k, v in ratio_dict.items():
    print(f"{k}: {v}")

In [None]:
# Bar chart of how many images exist per resolution, most frequent first.
if sizes_dict:
    x, y = zip(*[(str(k), v) for k, v in sizes_dict.most_common()])
    plt.figure(figsize=(12, 6))  # Increase figure size if needed
    plt.bar(x, y)
    plt.title("Number of images per resolution")
    plt.xlabel("Resolution (width, height) in pixels")
    plt.ylabel("Number of images")
    plt.xticks(rotation=45, ha='right')  # Rotate labels and align right
    plt.tight_layout()  # Adjust layout to prevent clipping
    plt.show()
else:
    # zip(*[]) cannot be unpacked into x and y, so an empty tally is reported instead.
    print("No images found - nothing to plot.")