In [1]:
# import the libraries
# %pip install scikit-learn

import os
import glob
import pickle

import tensorflow as tf    # used for creating tensors out of the dataset and training the model
import tensorflow_hub as hub    # downloading and using pre-trained models
import numpy as np
import h5py   # to manipulate the dataset as it's in `.h5` format
import pandas as pd
import cv2
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report   # essential for model's evaluation
from sklearn.metrics import f1_score
from tqdm import tqdm   # getting a progress bar for an iterable object




In [4]:
# Load the metadata tables that accompany the train and test image sets.
train_df64 = pd.read_csv(r"E:\KPDL-CK\data\raw\textiledefectdetection\train64.csv")
test_df64 = pd.read_csv(r"E:\KPDL-CK\data\raw\textiledefectdetection\test64.csv")
# Column notes (translated from the original Vietnamese comments):
#   angle            - the angle at which the image was captured
#   indication_type  - the original note calls this the "type of fabric"; the
#                      values shown by head()/value_counts() are the class names
#                      listed below
#   indication_value - numeric code for the class, in the range 0-5, following
#                      this list:
#       good                (normal, no defect)
#       color               (colour defect)
#       cut                 (cut in the fabric)
#       hole                (hole)
#       thread              (thread defect)
#       metal_contamination (contaminated with metal impurities)
# NOTE(review): the list order is presumably the 0-5 code order; only
# good -> 0 is visible in the head() preview - confirm against the CSV.

In [5]:
# Preview the first rows of both metadata tables: train first, then test.
print(train_df64.head(), test_df64.head(), sep="\n")


   index  angle indication_type  indication_value  split
0  48000      0            good                 0  train
1  48001     80            good                 0  train
2  48002    100            good                 0  train
3  48003    100            good                 0  train
4  48004      0            good                 0  train
   index  angle indication_type  indication_value split
0      0    100            good                 0  test
1      1     40            good                 0  test
2      2     40            good                 0  test
3      3    120            good                 0  test
4      4     20            good                 0  test


In [6]:
# Relative frequency of each class in the training table, rounded to 2 decimals.
train_df64["indication_type"].value_counts(normalize=True).round(2)

good                   0.17
color                  0.17
cut                    0.17
hole                   0.17
metal_contamination    0.17
thread                 0.17
Name: indication_type, dtype: float64

### Preprocessing

In [14]:
class H5ToStorage:
    """Export the images of an HDF5 dataset to per-class JPEG folders.

    The HDF5 file is traversed as ``file[class][angle] -> iterable of images``.
    Every class found in the file is exported except those listed in
    ``EXCLUDED_CLASSES``.

    Parameters
    ----------
    hdf_path : str
        Path of the ``.h5`` file to read.
    ds_name : str, default "train"
        Name of the split. It selects the export routine in ``to_storage``
        ("train" or "test") and is used as the sub-folder of the output root.
    output_root : str, optional
        Folder under which ``<ds_name>/<class>/`` directories are created.
        Defaults to ``DEFAULT_OUTPUT_ROOT`` (the location that was previously
        hard-coded).
    """

    # Classes that are never exported.
    EXCLUDED_CLASSES = ("color",)
    # Output location used when the caller does not supply one.
    DEFAULT_OUTPUT_ROOT = r"E:\KPDL-CK\data\raw\dataset"

    def __init__(self, hdf_path, ds_name="train", output_root=None):
        self.path = hdf_path
        self.name = ds_name
        self.output_root = (
            self.DEFAULT_OUTPUT_ROOT if output_root is None else output_root
        )
        # Read the class names once; the file is closed again right away.
        with h5py.File(self.path, 'r') as hf:
            self.classes = [
                class_ for class_ in hf if class_ not in self.EXCLUDED_CLASSES
            ]

    def generate_img_arr(self):
        """Yield ``(img, class_name, angle)`` for every image of the kept classes.

        The file is opened once and stays open while the generator is being
        consumed.
        """
        with h5py.File(self.path, 'r') as hf:
            for class_ in self.classes:
                class_group = hf[class_]
                for angle in class_group:
                    for img in class_group[angle]:
                        yield img, class_, angle

    def _export_images(self):
        """Write every image as a JPEG and return ``{image_path: angle}``."""
        # os.path.join reproduces the former "<root>\<name>" layout on Windows.
        path = os.path.join(self.output_root, self.name)
        for cl in self.classes:
            os.makedirs(f"{path}/{cl}/", exist_ok=True)

        metadata = {}
        # The running index spans all classes, which keeps file names unique.
        for i, (img, label, angle) in enumerate(self.generate_img_arr()):
            img_path = f"{path}/{label}/{i}_{angle}.jpeg"
            # NOTE(review): without vmin/vmax, imsave appears to rescale each
            # image to its own min/max, so the 255 factor would not change the
            # saved pixels - confirm whether a fixed intensity scale is wanted.
            plt.imsave(img_path, np.squeeze(img) * 255., cmap='gray')
            metadata[img_path] = angle

        return metadata

    def generate_train_dirs(self):
        """Create the class folders for the split and export its images.

        Returns
        -------
        dict
            Maps each written image path to the angle key it was read from.
        """
        return self._export_images()

    def generate_test_dirs(self):
        """Create the class folders for the split and export its images.

        Same procedure as ``generate_train_dirs``; kept as a separate method so
        existing callers and overrides continue to work.
        """
        return self._export_images()

    def to_storage(self):
        """Run the export that matches ``self.name`` and return its metadata.

        Returns
        -------
        dict
            ``{image_path: angle}`` for the written images. An empty dict is
            returned when ``self.name`` is neither "train" nor "test", in which
            case nothing is written.
        """
        if self.name == "train":
            return self.generate_train_dirs()
        if self.name == "test":
            return self.generate_test_dirs()
        return {}




In [15]:
# Export the training split to disk; train_dict keeps whatever to_storage() returns.
train_gen = H5ToStorage(
    hdf_path=r"E:\KPDL-CK\data\raw\textiledefectdetection\matchingtDATASET_train_64.h5",
    ds_name="train",
)
train_dict = train_gen.to_storage()

In [16]:
# Export the test split to disk; test_dict keeps whatever to_storage() returns.
test_gen = H5ToStorage(
    hdf_path=r"E:\KPDL-CK\data\raw\textiledefectdetection\matchingtDATASET_test_64.h5",
    ds_name="test",
)
test_dict = test_gen.to_storage()