In [1]:
import os
import time
import random
import timm
import torch
import albumentations as A
import pandas as pd
import numpy as np
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from augraphy import *
from sklearn.model_selection import KFold
from glob import glob
import cv2

In [16]:
df = pd.read_csv('merged_df.csv')


In [29]:
raw = df[(~df.ID.str.contains('_'))]

In [31]:
raw.target2.value_counts()

target2
7     102
10    102
0     100
12    100
8     100
2     100
16    100
9     100
15    100
5     100
4     100
6     100
11     98
3      98
13     74
14     50
1      46
Name: count, dtype: int64

In [27]:
df_train

Unnamed: 0,ID,target
0,002f99746285dfdd.jpg,16
1,008ccd231e1fea5d.jpg,10
2,008f5911bfda7695.jpg,10
3,009235e4c9c07af5.jpg,4
4,00b2f44967580c74.jpg,16
...,...,...
1565,fed9e9ec4a77bc06.jpg,4
1566,feeade617aa68c45.jpg,7
1567,ff51dd281a8423f1.jpg,11
1568,ff8a6a251ce51c95.jpg,5


In [11]:
df[~df.ID.str.contains('aug')].target2.value_counts()

target2
7     102
10    102
0     100
12    100
8     100
2     100
16    100
9     100
15    100
5     100
4     100
6     100
11     98
3      98
13     74
14     50
1      46
Name: count, dtype: int64

In [6]:
image_folder = 'data/train/'


input_folder = "data/train_copy/"
output_folder = "aug_add_25000"
original_csv_file = "new_aug_add.csv"
image_files = glob(os.path.join(input_folder, '*.jpg'))
print(image_files)


In [39]:
# 1번 처리 : augraphy
paper_phase = [

    ColorPaper(
        hue_range=(0, 255),
        saturation_range=(10, 40),
        p=0.33,
    ),
            OneOf(
        [
            DelaunayTessellation(
                n_points_range=(500, 800),
                n_horizontal_points_range=(500, 800),
                n_vertical_points_range=(500, 800),
                noise_type="random",
                color_list="default",
                color_list_alternate="default",
            ),
            PatternGenerator(
                imgx=random.randint(256, 512),
                imgy=random.randint(256, 512),
                n_rotation_range=(10, 15),
                color="random",
                alpha_range=(0.25, 0.5),
            ),
            VoronoiTessellation(
                mult_range=(50, 80),
                seed=19829813472,
                num_cells_range=(500, 1000),
                noise_type="random",
                background_value=(200, 255),
            ),
        ],
        p=0.5,
    ),
    AugmentationSequence(
        [
            NoiseTexturize(
                sigma_range=(3, 10),
                turbulence_range=(2, 5),
            ),
            BrightnessTexturize(
                texturize_range=(0.9, 0.99),
                deviation=0.03,
            ),
        ],
    ),
]

post_phase = [
    VoronoiTessellation(p=0.5),
]
pipeline0 = AugraphyPipeline(paper_phase=paper_phase, post_phase=post_phase)


# albumentation
alb0 = A.Compose([
    A.HorizontalFlip(p=0.8),
    A.VerticalFlip(p=0.8),
    A.Rotate(p=0.8, limit=(-179, 179),),
])



In [40]:
#2번 처리



# augraphy
ink_phase1 = [
    Folding(p=0.3)
    ]

paper_phase1 = [
    ColorPaper(p=0.4),
    ColorShift(p=0.4),
    ReflectedLight(p=0.4)
]

post_phase1 = [
    VoronoiTessellation(p=0.6),
]
pipeline1 = AugraphyPipeline(ink_phase=ink_phase1, paper_phase=paper_phase1, post_phase=post_phase1)



In [41]:

# albumentation
pipeline2 = A.Compose([
    A.HorizontalFlip(p=0.6),
    A.VerticalFlip(p=0.6),
    A.Rotate(p=0.6),
    A.GaussianBlur(p=0.6),
    A.RandomBrightnessContrast(p=0.6),
    A.HueSaturationValue(p=0.6),
    A.RandomGamma(p=0.6),
    A.ColorJitter(p=0.6),
    A.CoarseDropout(p=0.6),
    A.GaussNoise(p=0.6),
    A.Resize(224, 224),
    #ToTensorV2()
])






In [53]:
#10개씩 만들되


pipeline3 = A.Compose([
A.RandomRotate90(),
A.Flip(),
A.GaussNoise(p=0.3),
A.OneOf([A.MotionBlur(p=.2), A.MedianBlur(blur_limit=3, p=0.1), A.Blur(blur_limit=3, p=0.1),], p=0.3),
A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.2, rotate_limit=45, p=0.2, border_mode=cv2.BORDER_CONSTANT, value=[255, 255, 255]),
A.OneOf([A.OpticalDistortion(p=0.3), A.GridDistortion(p=.1), A.PiecewiseAffine(p=0.3), ], p=0.15),
A.OneOf([A.CLAHE(clip_limit=2), A.Sharpen(), A.Emboss(),], p=0.3),
A.RandomBrightnessContrast(p=0.3),
A.HueSaturationValue(p=0.3)
])

In [37]:
img = cv2.imread('data/train/002f99746285dfdd.jpg')

In [49]:
type(alb0(image = img)['image'])

numpy.ndarray

In [51]:
def augment_data_and_update_csv1(input_folder, output_folder, pipeline0, alb0, pipeline1, pipeline2, pipeline3, csv_file, num_augmented_per_image=4):
    original_df = pd.read_csv(csv_file)
    augmented_df = pd.DataFrame(columns=original_df.columns)

    image_files = glob(os.path.join(input_folder, '*.jpg'))

    for img_path in image_files:
        # Read the image
        img = cv2.imread(img_path)

        # Apply augmentation multiple times
        if original_df[original_df['ID'] == os.path.basename(img_path)]['target'].values[0] == 13:
            num_images_to_generate = 6  # Target이 13인 경우 5개의 이미지만 생성
            # Apply a 처리
            for i in tqdm(range(num_images_to_generate)):
                augraphy = pipeline0(img)
                augmented = alb0(image=augraphy)
                augmented_img = augmented['image']

                # Save augmented image
                output_path = os.path.join(output_folder, f"add_25000_{i}_{os.path.basename(img_path)}_a.jpg")
                cv2.imwrite(output_path, augmented_img)

                # Add entry to the augmented DataFrame
                augmented_entry = {
                    'ID': os.path.basename(output_path),
                    'target': 13  # Target 값은 13으로 설정
                }
                augmented_df = pd.concat([augmented_df, pd.DataFrame([augmented_entry])], ignore_index=True)

                #1번 처리
                augraphy1 = pipeline1(img)
                augmented_img1 = augraphy1

                # Save augmented image
                output_path1 = os.path.join(output_folder, f"add_25000_{i}_{os.path.basename(img_path)}_b.jpg")
                cv2.imwrite(output_path1, augmented_img1)

                # Add entry to the augmented DataFrame
                augmented_entry1 = {
                    'ID': os.path.basename(output_path1),
                    'target': 13  # Target 값은 13으로 설정
                }
                augmented_df = pd.concat([augmented_df, pd.DataFrame([augmented_entry1])], ignore_index=True)

                #2번 처리
                augmented2 = pipeline2(image=img)
                augmented_img2 = augmented2['image']

                # Save augmented image
                output_path2 = os.path.join(output_folder, f"add_25000_{i}_{os.path.basename(img_path)}_c.jpg")
                cv2.imwrite(output_path2, augmented_img2)

                # Add entry to the augmented DataFrame
                augmented_entry2 = {
                    'ID': os.path.basename(output_path2),
                    'target': 13  # Target 값은 13으로 설정
                }
                augmented_df = pd.concat([augmented_df, pd.DataFrame([augmented_entry2])], ignore_index=True)

                #3번 처리
                augmented3 = pipeline3(image=img)
                augmented_img3 = augmented3['image']

                # Save augmented image
                output_path3 = os.path.join(output_folder, f"add_25000_{i}_{os.path.basename(img_path)}_d.jpg")
                cv2.imwrite(output_path3, augmented_img3)

                # Add entry to the augmented DataFrame
                augmented_entry3 = {
                    'ID': os.path.basename(output_path3),
                    'target': 13  # Target 값은 13으로 설정
                }
                augmented_df = pd.concat([augmented_df, pd.DataFrame([augmented_entry3])], ignore_index=True)

        elif original_df[original_df['ID'] == os.path.basename(img_path)]['target'].values[0] in [1,14]:
            num_images_to_generate = 8  # Target이 13인 경우 5개의 이미지만 생성
            # Apply a 처리
            for i in tqdm(range(num_images_to_generate)):
                augraphy = pipeline0(img)
                augmented = alb0(image=augraphy)
                augmented_img = augmented['image']

                # Save augmented image
                output_path = os.path.join(output_folder, f"add_25000_{i}_{os.path.basename(img_path)}_a.jpg")
                cv2.imwrite(output_path, augmented_img)

                # Add entry to the augmented DataFrame
                augmented_entry = {
                    'ID': os.path.basename(output_path),
                    'target': original_df[original_df['ID'] == os.path.basename(img_path)]['target'].values[0]  # Target 값은 13으로 설정
                }
                augmented_df = pd.concat([augmented_df, pd.DataFrame([augmented_entry])], ignore_index=True)

                #1번 처리
                augraphy1 = pipeline1(img)
                augmented_img1 = augraphy1

                # Save augmented image
                output_path1 = os.path.join(output_folder, f"add_25000_{i}_{os.path.basename(img_path)}_b.jpg")
                cv2.imwrite(output_path1, augmented_img1)

                # Add entry to the augmented DataFrame
                augmented_entry1 = {
                    'ID': os.path.basename(output_path1),
                    'target': original_df[original_df['ID'] == os.path.basename(img_path)]['target'].values[0]  # Target 값은 13으로 설정
                }
                augmented_df = pd.concat([augmented_df, pd.DataFrame([augmented_entry1])], ignore_index=True)

                #2번 처리
                augmented2 = pipeline2(image=img)
                augmented_img2 = augmented2['image']

                # Save augmented image
                output_path2 = os.path.join(output_folder, f"add_25000_{i}_{os.path.basename(img_path)}_c.jpg")
                cv2.imwrite(output_path2, augmented_img2)

                # Add entry to the augmented DataFrame
                augmented_entry2 = {
                    'ID': os.path.basename(output_path2),
                    'target': original_df[original_df['ID'] == os.path.basename(img_path)]['target'].values[0]  # Target 값은 13으로 설정
                }
                augmented_df = pd.concat([augmented_df, pd.DataFrame([augmented_entry2])], ignore_index=True)

                #3번 처리
                augmented3 = pipeline3(image=img)
                augmented_img3 = augmented3['image']

                # Save augmented image
                output_path3 = os.path.join(output_folder, f"add_25000_{i}_{os.path.basename(img_path)}_d.jpg")
                cv2.imwrite(output_path3, augmented_img3)

                # Add entry to the augmented DataFrame
                augmented_entry3 = {
                    'ID': os.path.basename(output_path3),
                    'target': original_df[original_df['ID'] == os.path.basename(img_path)]['target'].values[0]  # Target 값은 13으로 설정
                }
                augmented_df = pd.concat([augmented_df, pd.DataFrame([augmented_entry3])], ignore_index=True)


                
        else:
            num_images_to_generate = 4  # Target이 13이 아닌 경우
            # Apply b 처리
            for i in tqdm(range(num_images_to_generate)):
                augraphy = pipeline0(img)
                augmented = alb0(image=augraphy)
                augmented_img = augmented['image']

                # Save augmented image
                output_path = os.path.join(output_folder, f"add_25000_{i}_{os.path.basename(img_path)}_a.jpg")
                cv2.imwrite(output_path, augmented_img)

                # Add entry to the augmented DataFrame
                augmented_entry = {
                    'ID': os.path.basename(output_path),
                    'target': original_df[original_df['ID'] == os.path.basename(img_path)]['target'].values[0]  # Target 값은 13으로 설정
                }
                augmented_df = pd.concat([augmented_df, pd.DataFrame([augmented_entry])], ignore_index=True)

                #1번 처리
                augraphy1 = pipeline1(img)
                augmented_img1 = augraphy1

                # Save augmented image
                output_path1 = os.path.join(output_folder, f"add_25000_{i}_{os.path.basename(img_path)}_b.jpg")
                cv2.imwrite(output_path1, augmented_img1)

                # Add entry to the augmented DataFrame
                augmented_entry1 = {
                    'ID': os.path.basename(output_path1),
                    'target': original_df[original_df['ID'] == os.path.basename(img_path)]['target'].values[0]  # Target 값은 13으로 설정
                }
                augmented_df = pd.concat([augmented_df, pd.DataFrame([augmented_entry1])], ignore_index=True)

                #2번 처리
                augmented2 = pipeline2(image=img)
                augmented_img2 = augmented2['image']

                # Save augmented image
                output_path2 = os.path.join(output_folder, f"add_25000_{i}_{os.path.basename(img_path)}_c.jpg")
                cv2.imwrite(output_path2, augmented_img2)

                # Add entry to the augmented DataFrame
                augmented_entry2 = {
                    'ID': os.path.basename(output_path2),
                    'target': original_df[original_df['ID'] == os.path.basename(img_path)]['target'].values[0]  # Target 값은 13으로 설정
                }
                augmented_df = pd.concat([augmented_df, pd.DataFrame([augmented_entry2])], ignore_index=True)

                #3번 처리
                augmented3 = pipeline3(image=img)
                augmented_img3 = augmented3['image']

                # Save augmented image
                output_path3 = os.path.join(output_folder, f"add_25000_{i}_{os.path.basename(img_path)}_d.jpg")
                cv2.imwrite(output_path3, augmented_img3)

                # Add entry to the augmented DataFrame
                augmented_entry3 = {
                    'ID': os.path.basename(output_path3),
                    'target': original_df[original_df['ID'] == os.path.basename(img_path)]['target'].values[0]  # Target 값은 13으로 설정
                }
                augmented_df = pd.concat([augmented_df, pd.DataFrame([augmented_entry3])], ignore_index=True)


                
    return augmented_df
    # # Concatenate original and augmented DataFrames
    # combined_df = pd.concat([original_df, augmented_df], ignore_index=True)

    # # Save the new CSV file
    # combined_df.to_csv(os.path.join(output_folder, csv_name), index=False)



In [60]:
# 예시로 사용할 변수
input_folder = "data/train/"
output_folder = "aug_25000"
original_csv_file = "train_copy.csv"

# 함수 호출
aug_df = augment_data_and_update_csv1(input_folder, output_folder,  pipeline0, alb0, pipeline1, pipeline2, pipeline3, original_csv_file, num_augmented_per_image=4)


  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:02<00:00,  1.75it/s]
100%|██████████| 4/4 [00:03<00:00,  1.19it/s]
100%|██████████| 4/4 [00:02<00:00,  1.41it/s]
100%|██████████| 4/4 [00:01<00:00,  2.40it/s]
100%|██████████| 4/4 [00:01<00:00,  2.05it/s]
100%|██████████| 4/4 [00:01<00:00,  2.03it/s]
100%|██████████| 4/4 [00:01<00:00,  2.49it/s]
100%|██████████| 4/4 [00:02<00:00,  1.48it/s]
100%|██████████| 4/4 [00:01<00:00,  2.49it/s]
100%|██████████| 4/4 [00:02<00:00,  1.39it/s]
100%|██████████| 4/4 [00:02<00:00,  1.60it/s]
100%|██████████| 4/4 [00:02<00:00,  1.60it/s]
100%|██████████| 4/4 [00:03<00:00,  1.25it/s]
100%|██████████| 4/4 [00:01<00:00,  2.11it/s]
100%|██████████| 4/4 [00:02<00:00,  1.67it/s]
100%|██████████| 4/4 [00:02<00:00,  1.43it/s]
100%|██████████| 4/4 [00:02<00:00,  1.35it/s]
100%|██████████| 6/6 [00:04<00:00,  1.37it/s]
100%|██████████| 4/4 [00:02<00:00,  1.97it/s]
100%|██████████| 4/4 [00:01<00:00,  2.24it/s]
100%|██████████| 4/4 [00:02<00:00,  1.62it/s]
100%|██████████| 6/6 [00:03<00:00,

In [61]:
aug_df.shape

(27248, 2)

In [62]:
df = aug_df

search_terms = {'0583254a73b48ece' :10,'1ec14a14bbe633db':7  ,'38d1796b6ad99ddd':10,'45f0d2dfc7e47c03':7 ,'aec62dced7af97cd':14 }



for key, value in search_terms.items():
    df.loc[df['ID'].str.contains(key), 'target2' ] = value

     
df[~df.target2.isnull()]
df['target2'].fillna(df['target'], inplace=True)
df['target2'] = df['target2'].astype(int)


In [63]:
df

Unnamed: 0,ID,target,target2
0,add_25000_0_a1ab865095b2d312.jpg_a.jpg,2,2
1,add_25000_0_a1ab865095b2d312.jpg_b.jpg,2,2
2,add_25000_0_a1ab865095b2d312.jpg_c.jpg,2,2
3,add_25000_0_a1ab865095b2d312.jpg_d.jpg,2,2
4,add_25000_1_a1ab865095b2d312.jpg_a.jpg,2,2
...,...,...,...
27243,add_25000_2_6d41878ae3c7e40b.jpg_d.jpg,16,16
27244,add_25000_3_6d41878ae3c7e40b.jpg_a.jpg,16,16
27245,add_25000_3_6d41878ae3c7e40b.jpg_b.jpg,16,16
27246,add_25000_3_6d41878ae3c7e40b.jpg_c.jpg,16,16


In [65]:
merged_df = pd.read_csv('merged_df.csv')

merged_df.shape
final_df = pd.concat([merged_df, df], ignore_index=True)
print(merged_df.shape, df.shape, final_df.shape)



(25360, 3) (27248, 3) (52608, 3)


In [67]:
final_df.to_csv('final_df.csv', index = False)

In [58]:
dfa = pd.read_csv(original_csv_file)
dfa.shape

(1570, 2)

In [3]:
import os

def count_files_recursive(directory):
    # 지정된 디렉토리 및 모든 하위 디렉토리 내의 파일 수를 세는 함수
    total_files = sum(len(files) for _, _, files in os.walk(directory))
    print(f"The total number of files in '{directory}' and its subdirectories is: {total_files}")
    return total_files

# 사용 예:



In [5]:
source_dir = 'aug_25000'
count_files_recursive(source_dir)

The total number of files in 'aug_25000' and its subdirectories is: 27248


27248

In [75]:
pwd

'/data/ephemeral/home'

In [76]:
import os
import shutil

# 원본 디렉토리와 대상 디렉토리 설정
source_dir = 'aug_25000'
target_dir = 'data/dataset_50000/aug_2'


# 원본 디렉토리에서 파일 목록 가져오기
files = os.listdir(source_dir)

# 각 파일을 대상 디렉토리로 복사
for file in files:
    shutil.copy(os.path.join(source_dir, file), target_dir)

print(f'모든 파일이 {source_dir}에서 {target_dir}로 복사되었습니다.')


모든 파일이 aug_25000에서 data/dataset_50000/aug_2로 복사되었습니다.
