In [None]:
import pandas as pd

In [None]:
# Cap DataFrame/Series previews at 20 rows for the whole notebook.
pd.options.display.max_rows = 20

### Read the file

In [None]:
# Load the source spreadsheet into a DataFrame.
data = pd.read_excel(io="bmtfiles/work.xlsx")

In [None]:
# Preview the first five rows of the loaded table.
data.head(n=5)

### Unique elements of car parts that can be analyzed

In [None]:
# Distinct values found in the "Element" column.
data.loc[:, "Element"].unique()

In [None]:
# Print the complete per-element frequency table; the row limit is lifted
# only for the duration of the `with` block.
element_counts = data["Element"].value_counts()
with pd.option_context("display.max_rows", None):
    print(element_counts)

In [None]:
# Car part to analyse; rows are selected by exact equality with the "Element" column.
# ("zderzak" is Polish for "bumper".)
ELEMENT = "zderzak"

In [None]:
# Keep only the rows whose "Element" equals the selected car part.
filtered = data.loc[data["Element"].eq(ELEMENT)]

In [None]:
# Preview the first five rows for the selected element.
filtered.head(n=5)

### Types of damage

In [None]:
# How often each damage label occurs for the selected element.
filtered.loc[:, "Damage"].value_counts()

### Get one damage class

In [None]:
# Damage label to extract; rows are selected by exact equality with the "Damage" column.
# (Polish; roughly "scratch with paint damage" - translation to be confirmed.)
DAMAGE = "rysa z uszkodz. lakieru"

In [None]:
# Narrow the element subset down to the single selected damage label.
one_class = filtered.loc[filtered["Damage"].eq(DAMAGE)]

In [None]:
# (rows, columns) of the subset selected for this element/damage pair.
one_class.shape

In [None]:
import os

# Target CSV for the selected subset: work_dataset/<ELEMENT>_<DAMAGE>.csv
one_class_file = f"work_dataset/{ELEMENT}_{DAMAGE}.csv"

# The output folder has to exist before pandas can write into it.
os.makedirs("work_dataset", exist_ok=True)
one_class.to_csv(path_or_buf=one_class_file)

### Read csv, extract filepaths and save photos to work_dataset

In [None]:
# Read back the CSV that was just written for the selected class.
work_csv = pd.read_csv(filepath_or_buffer=one_class_file)

In [None]:
# Series of source image locations, taken from the "ImagePath" column.
filepaths = work_csv.loc[:, "ImagePath"]

In [None]:
import shutil
from pathlib import Path

# Destination folder for the photos of the selected element/damage class.
workdir = Path(f"work_dataset/{ELEMENT}/{DAMAGE}/photos")
workdir.mkdir(parents=True, exist_ok=True)

# Empty cells come back from the CSV as NaN, and Path(NaN) raises TypeError,
# so rows without a path are dropped before iterating.
for image_path in filepaths.dropna():
    src = Path(str(image_path))

    # Prefix the file name with its parent folder name (presumably to keep
    # same-named photos from different folders apart - confirm).
    # `src.parent.name` is "" for a bare file name or a root-level file, where
    # `src.parts[-2]` would raise IndexError or yield "/" (which makes
    # joinpath() discard `workdir`).
    dest = workdir / f"{src.parent.name}_{src.name}"

    try:
        shutil.copy(src, dest)
    except OSError as e:
        # Best effort: report the file that could not be copied and move on.
        print(e)

In [None]:
# Compare the number of rows in the selected class (theoretically available
# files) with the number of entries actually present in the photo folder.
loc = one_class.shape[0]
lwd = len(os.listdir(workdir))

for label, value in (
    ("number of files in one: ", loc),
    ("number of files copied: ", lwd),
    ("Difference: ", loc - lwd),
):
    print(label, value)

### Check files

In [None]:
# Are all files proper jpgs?

import fnmatch

# Classify regular files only; rglob("*") also yields directories.
copied_files = [p for p in workdir.rglob("*") if p.is_file()]

# Both lists use the same test on the lower-cased file name, so an upper-case
# extension such as ".JPG" is counted as a jpg and never lands in both lists
# (fnmatch.fnmatch is case-sensitive on POSIX). This also avoids the
# `case_sensitive=` argument of rglob, which requires Python 3.12+.
jpgs = [p for p in copied_files if fnmatch.fnmatchcase(p.name.lower(), "*.jp*g")]
not_jpgs_files = [p for p in copied_files if not fnmatch.fnmatchcase(p.name.lower(), "*.jp*g")]

num_jpgs = len(jpgs)

print("Number of jpgs: ", num_jpgs)
print("Number of not_jpgs: ", len(not_jpgs_files))
print("Difference between workidr and counted jpgs: ", lwd - num_jpgs)

In [None]:
num_skipped = 0

# Take a snapshot of the regular files first: entries are deleted inside the
# loop, and rglob("*") would otherwise also hand directories to open().
candidates = [p for p in workdir.rglob("*") if p.is_file()]

for path in candidates:
    with open(path, "rb") as file:
        # peek() may return more than the 10 bytes asked for, so this checks
        # for the "JFIF" marker anywhere in the first buffered chunk.
        # NOTE(review): JPEGs written with an Exif header instead of a JFIF
        # header would also be deleted here - confirm this is intended.
        is_jfif = b"JFIF" in file.peek(10)
    if not is_jfif:
        num_skipped += 1
        print(path)
        os.remove(path)

print("No JFIF file: ", num_skipped)


In [None]:
# Add proper suffixes

for f in not_jpgs_files:
    # The list was built before the JFIF check, which may have deleted some of
    # these files in the meantime; skip anything that is no longer there.
    if not f.is_file():
        continue

    target = f.with_suffix(".jpg")

    # os.rename silently replaces an existing file on POSIX; never overwrite
    # a photo that already carries the target name.
    if target.exists():
        print(f"skipped (target exists): {f}")
        continue

    print(f)
    os.rename(f, target)

In [None]:
# Re-scan the folder after the clean-up and renaming steps.
# Only regular files are classified, and both lists apply the same test to
# the lower-cased file name, so upper-case extensions such as ".JPG" count as
# jpgs (fnmatch.fnmatch is case-sensitive on POSIX and would also list them
# as non-jpgs).
remaining_files = [p for p in workdir.rglob("*") if p.is_file()]
jpgs = [p for p in remaining_files if fnmatch.fnmatchcase(p.name.lower(), "*.jp*g")]
not_jpgs_files = [p for p in remaining_files if not fnmatch.fnmatchcase(p.name.lower(), "*.jp*g")]

print("Number of jpgs: ", len(jpgs))
print("Number of not_jpgs: ", len(not_jpgs_files))

Check whether all images have the same resolution.

What are their aspect ratios?

In [None]:
import PIL.Image
from fractions import Fraction
from collections import Counter

# sizes_dict: (width, height) -> number of images with that resolution
# ratio_dict: reduced (width, height) aspect ratio -> number of images
sizes_dict = Counter()
ratio_dict = Counter()

for j in jpgs:
    try:
        # The context manager closes the file handle that PIL keeps open;
        # without it every image in the loop leaks a descriptor.
        with PIL.Image.open(j) as img:
            w, h = img.size
    except OSError as e:
        # Unreadable or missing image: report it and keep counting the rest.
        print(f"{j}: {e}")
        continue

    sizes_dict[(w, h)] += 1
    # Fraction reduces w/h to lowest terms, e.g. 1920x1080 -> (16, 9).
    ratio_dict[Fraction(w, h).as_integer_ratio()] += 1

In [None]:
# One line per resolution: "(width, height): count".
for size, count in sizes_dict.items():
    print(size, count, sep=": ")

In [None]:
# One line per aspect ratio: "(numerator, denominator): count".
for ratio_key, count in ratio_dict.items():
    print(ratio_key, count, sep=": ")

In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many images exist per resolution.
# Guarded because unpacking zip(*[]) raises ValueError when no image was counted.
if sizes_dict:
    x, y = zip(*[(str(k), v) for k, v in sizes_dict.items()])
    plt.figure(figsize=(12, 6))  # Increase figure size if needed
    plt.bar(x, y)
    plt.xlabel("Image size (width, height)")
    plt.ylabel("Number of images")
    plt.title("Image resolutions in the dataset")
    plt.xticks(rotation=45, ha='right')  # Rotate labels and align right
    plt.tight_layout()  # Adjust layout to prevent clipping
    plt.show()
else:
    print("No images found - nothing to plot.")