## 1. 라이브러리 불러오기

In [None]:
import sys
import glob
import cv2
import numpy as np

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision import transforms, utils, datasets, models
from torch.nn.modules.loss import BCEWithLogitsLoss
from torch.optim import lr_scheduler

from torch.autograd import Variable

from matplotlib import pyplot as plt
from time import time

import os
import time
import random

import timm
import torch
import albumentations as A
import pandas as pd
import numpy as np
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold

import wandb

from augraphy import *

In [None]:
# 이미지 데이터 추가 코드
import cv2
import os
import numpy as np
import pandas as pd
from glob import glob

def augment_data_and_update_csv(input_folder, output_folder, aug, alb, csv_file, csv_name, num_augmented_per_image=4):
    """Save augmented copies of each labelled image and write a combined CSV.

    Every ``*.jpg`` directly inside ``input_folder`` is passed through ``aug``
    and then ``alb`` ``num_augmented_per_image`` times. Each result is written
    to ``output_folder`` as ``augmented_{i}_{original file name}`` and is
    recorded with the label of its source image. Finally a CSV holding the
    original rows followed by the augmented rows is written to
    ``output_folder/csv_name``.

    Images that have no row in ``csv_file``, cannot be decoded, or cannot be
    written are skipped with a warning instead of aborting the whole run.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.jpg`` files.
        output_folder: Directory receiving the images and the CSV; it is
            created when it does not exist yet.
        aug: Callable applied as ``aug(img)`` on the decoded image.
        alb: Callable applied as ``alb(image=...)``; its result must expose
            the transformed image under the ``'image'`` key.
        csv_file: Path of the label CSV; it must contain ``ID`` and
            ``target`` columns.
        csv_name: File name of the combined CSV inside ``output_folder``.
        num_augmented_per_image: Number of augmented copies per source image.

    Returns:
        None. Results are written to disk.
    """
    import warnings

    original_df = pd.read_csv(csv_file)

    # Build the ID -> label lookup once instead of filtering the frame for
    # every augmented copy. The first row wins when an ID is duplicated.
    label_by_id = original_df.drop_duplicates(subset='ID').set_index('ID')['target'].to_dict()

    # Neither cv2.imwrite nor DataFrame.to_csv creates missing directories.
    os.makedirs(output_folder, exist_ok=True)

    augmented_rows = []
    # Sorted so that the row order of the CSV is reproducible.
    for img_path in sorted(glob(os.path.join(input_folder, '*.jpg'))):
        image_name = os.path.basename(img_path)

        if image_name not in label_by_id:
            warnings.warn(f"No label found for {image_name}; skipping it.")
            continue

        # cv2.imread signals an unreadable file by returning None.
        img = cv2.imread(img_path)
        if img is None:
            warnings.warn(f"Could not read {img_path}; skipping it.")
            continue

        for i in tqdm(range(num_augmented_per_image)):
            augmented_img = alb(image=aug(img))['image']

            output_name = f"augmented_{i}_{image_name}"
            # cv2.imwrite reports failure through its return value.
            if not cv2.imwrite(os.path.join(output_folder, output_name), augmented_img):
                warnings.warn(f"Could not write {output_name}; skipping it.")
                continue

            augmented_rows.append({'ID': output_name, 'target': label_by_id[image_name]})

    # Rows are collected in a list and turned into a frame once; growing a
    # DataFrame with pd.concat inside the loop is quadratic in the row count.
    if augmented_rows:
        augmented_df = pd.DataFrame(augmented_rows, columns=original_df.columns)
        combined_df = pd.concat([original_df, augmented_df], ignore_index=True)
    else:
        combined_df = original_df

    combined_df.to_csv(os.path.join(output_folder, csv_name), index=False)

# Folder holding the original training images that get augmented.
input_folder = "/data/ephemeral/home/upstage_cv/data/train"
# Output folder of an earlier ("lmj") experiment; left disabled here.
#output_folder = "/data/ephemeral/home/upstage_cv/data/aug_img/lmj"
# CSV with the 'ID' / 'target' labels of the original training images.
original_csv_file = "/data/ephemeral/home/upstage_cv/data/train.csv"

# NOTE(review): the disabled call below does not match the current signature
# of augment_data_and_update_csv (aug, alb and csv_name are missing).
# augment_data_and_update_csv(input_folder, output_folder, original_csv_file, num_augmented_per_image=4)

In [None]:
# Locations of the metadata, training-label and sample-submission CSV files.
meta_path = '/data/ephemeral/home/upstage_cv/data/meta.csv'
train_path = '/data/ephemeral/home/upstage_cv/data/train.csv'
submission_path = '/data/ephemeral/home/upstage_cv/data/sample_submission.csv'

# Load the three tables into DataFrames.
meta_data = pd.read_csv(meta_path)
df_train = pd.read_csv(train_path)
df_submission = pd.read_csv(submission_path)

# Inner join of the training labels with the metadata. No `on=` is given, so
# pandas joins on the columns the two frames share.
# NOTE(review): presumably the shared column is 'target' - confirm against meta.csv.
merge = pd.merge(df_train, meta_data, how='inner')

In [None]:
# Fix every random seed so that runs are repeatable.
SEED = 42
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# cuDNN autotuning (benchmark=True) may select different convolution
# algorithms from run to run, which defeats the seeding above. Disable it and
# request deterministic kernels instead; this can cost some training speed.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## 2. Custom Dataset

In [None]:
class ImageDataset(Dataset):
    """Dataset that yields ``(image, target)`` pairs listed in a CSV file.

    The CSV is expected to have exactly two columns per row: the image file
    name (relative to ``path``) followed by the target.

    Args:
        csv: Path (or file-like object) of the CSV, passed to ``pd.read_csv``.
        path: Directory that contains the image files.
        album_transform: Optional callable used as
            ``album_transform(image=img)['image']``.
        augraphy_transform: Optional callable used as
            ``augraphy_transform(img)``; applied before ``album_transform``.
    """

    def __init__(self, csv, path, album_transform=None, augraphy_transform=None):
        # Kept as a plain 2-D array so a row can be unpacked directly.
        self.df = pd.read_csv(csv).values
        self.path = path
        self.album_transform = album_transform
        self.augraphy_transform = augraphy_transform

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, transform and return the ``idx``-th ``(image, target)`` pair."""
        name, target = self.df[idx]

        # Force three channels: grayscale or RGBA files would otherwise give
        # arrays with a different channel count and break batching / the model.
        # The context manager closes the file handle right after decoding.
        with Image.open(os.path.join(self.path, name)) as pil_img:
            img = np.array(pil_img.convert('RGB'))

        # Compare against None explicitly so a transform object that happens
        # to be falsy (e.g. an empty composition) is not silently skipped.
        if self.augraphy_transform is not None:
            img = self.augraphy_transform(img)

        if self.album_transform is not None:
            img = self.album_transform(image=img)['image']

        return img, target

In [None]:
def training(model, dataloader, dataset, device, criterion, optimizer, epoch, num_epochs):
    """Train ``model`` for one epoch and return it with the epoch metrics.

    Args:
        model: Network to optimise; switched to training mode.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        dataset: Unused; kept so existing callers keep working.
        device: Device the batches are moved to.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepping the parameters of ``model``.
        epoch: Zero-based index of the current epoch (progress text only).
        num_epochs: Total number of epochs (progress text only).

    Returns:
        Tuple ``(model, metrics)`` where ``metrics`` holds ``'train_loss'``
        (mean of the per-batch losses), ``'train_acc'`` and ``'train_f1'``
        (macro-averaged F1).
    """
    model.train()
    train_loss = 0.0
    preds_list = []
    targets_list = []

    tbar = tqdm(dataloader)
    for images, labels in tbar:
        # Cast to float32 on the target device. The previous
        # `.type(torch.cuda.FloatTensor)` raised on machines without CUDA even
        # though `device` may legitimately be the CPU.
        images = images.to(device, dtype=torch.float32)
        labels = labels.to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        preds_list.extend(outputs.argmax(dim=1).detach().cpu().numpy())
        targets_list.extend(labels.detach().cpu().numpy())

        tbar.set_description(f"Epoch [{epoch+1}/{num_epochs}], Train Loss : {loss.item():.4f}")

    train_loss = train_loss / len(dataloader)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    train_acc = accuracy_score(targets_list, preds_list)
    train_f1 = f1_score(targets_list, preds_list, average='macro')

    metrics = {
        'train_loss': train_loss,
        'train_acc': train_acc,
        'train_f1': train_f1,
    }

    return model, metrics

def evaluation(model, dataloader, dataset, device, criterion, epoch, num_epochs):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        dataloader: Iterable yielding ``(images, labels)`` batches.
        dataset: Unused; kept so existing callers keep working.
        device: Device the batches are moved to.
        criterion: Loss called as ``criterion(outputs, labels)``.
        epoch: Zero-based index of the current epoch (progress text only).
        num_epochs: Total number of epochs (progress text only).

    Returns:
        Tuple ``(model, metrics)`` where ``metrics`` holds ``'valid_loss'``
        (mean of the per-batch losses), ``'valid_acc'`` and ``'valid_f1'``
        (macro-averaged F1).
    """
    model.eval()
    valid_loss = 0.0
    preds_list = []
    targets_list = []

    with torch.no_grad():
        tbar = tqdm(dataloader)
        for images, labels in tbar:
            # Device-agnostic float32 cast; `.type(torch.cuda.FloatTensor)`
            # failed whenever CUDA was unavailable.
            images = images.to(device, dtype=torch.float32)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            valid_loss += loss.item()
            preds_list.extend(outputs.argmax(dim=1).cpu().numpy())
            targets_list.extend(labels.cpu().numpy())

            # Show the loss of the current batch, as the training bar does;
            # the running sum that was shown before grew with every batch.
            tbar.set_description(f"Epoch [{epoch+1}/{num_epochs}] Valid Loss : {loss.item():.4f}")

    valid_loss /= len(dataloader)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    valid_acc = accuracy_score(targets_list, preds_list)
    valid_f1 = f1_score(targets_list, preds_list, average='macro')

    metrics = {
        'valid_loss': valid_loss,
        'valid_acc': valid_acc,
        'valid_f1': valid_f1,
    }

    return model, metrics

def training_loop(model, train_dataloader, valid_dataloader, train_dataset, valid_dataset, criterion, optimizer, device, num_epochs, model_path, model_name, patience, run):
    """Run training and validation for up to ``num_epochs`` epochs.

    After each epoch the metrics are logged to ``run``. The weights are saved
    to ``{model_path}/model_{model_name}.pt`` whenever the validation F1
    improves. Training stops early once the validation loss has failed to
    improve for ``patience`` consecutive epochs.

    Args:
        model: Network to train.
        train_dataloader: Batches used by ``training``.
        valid_dataloader: Batches used by ``evaluation``.
        train_dataset: Forwarded to ``training``.
        valid_dataset: Forwarded to ``evaluation``.
        criterion: Loss function.
        optimizer: Optimizer stepping the parameters of ``model``.
        device: Device the batches are moved to.
        num_epochs: Maximum number of epochs.
        model_path: Directory for the checkpoint; created when missing.
        model_name: Suffix used in the checkpoint file name.
        patience: Number of consecutive epochs without a better validation
            loss after which training stops.
        run: Experiment tracker exposing ``log(dict, step=...)`` and a
            ``summary`` mapping (a wandb run in this notebook). ``None``
            disables tracking.

    Returns:
        Tuple ``(model, valid_max_accuracy, valid_max_f1)``. ``model`` carries
        the weights of the last epoch that ran, not necessarily the saved
        best checkpoint.
    """

    def update_summary(values):
        # Record best-so-far values on the tracker, if one is attached.
        if run is not None:
            for key, value in values.items():
                run.summary[key] = value

    best_valid_loss = float('inf')
    valid_max_accuracy = -1
    valid_max_f1 = -1
    early_stop_counter = 0

    for epoch in range(num_epochs):
        model, train_metrics = training(model, train_dataloader, train_dataset, device, criterion, optimizer, epoch, num_epochs)
        model, valid_metrics = evaluation(model, valid_dataloader, valid_dataset, device, criterion, epoch, num_epochs)

        monitoring_value = {
            'train_loss': train_metrics['train_loss'],
            'train_accuracy': train_metrics['train_acc'],
            'train_f1': train_metrics['train_f1'],
            'valid_loss': valid_metrics['valid_loss'],
            'valid_accuracy': valid_metrics['valid_acc'],
            'valid_f1': valid_metrics['valid_f1'],
        }
        if run is not None:
            run.log(monitoring_value, step=epoch)

        if valid_max_accuracy < valid_metrics['valid_acc']:
            valid_max_accuracy = valid_metrics['valid_acc']
            update_summary({
                'best_train_acc': train_metrics['train_acc'],
                'best_valid_acc': valid_metrics['valid_acc'],
            })

        if valid_max_f1 < valid_metrics['valid_f1']:
            valid_max_f1 = valid_metrics['valid_f1']
            # torch.save does not create missing directories.
            os.makedirs(model_path, exist_ok=True)
            torch.save(model.state_dict(), os.path.join(model_path, f"model_{model_name}.pt"))
            update_summary({
                'best_train_f1': train_metrics['train_f1'],
                'best_valid_f1': valid_metrics['valid_f1'],
            })

        if best_valid_loss > valid_metrics['valid_loss']:
            best_valid_loss = valid_metrics['valid_loss']
            # Reset on improvement so `patience` counts consecutive bad
            # epochs; without the reset it counted every bad epoch since the
            # start of training.
            early_stop_counter = 0
            update_summary({
                'best_train_loss': train_metrics['train_loss'],
                'best_valid_loss': valid_metrics['valid_loss'],
            })
        else:
            early_stop_counter += 1

        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss : {train_metrics['train_loss']:.4f}, Train Acc : {train_metrics['train_acc']:.4f}, Train F1 : {train_metrics['train_f1']:.4f}, Valid Loss : {valid_metrics['valid_loss']:.4f}, Valid Acc : {valid_metrics['valid_acc']:.4f}, Valid F1 : {valid_metrics['valid_f1']:.4f}")

        if early_stop_counter >= patience:
            print('Early Stopping!')
            break

    return model, valid_max_accuracy, valid_max_f1


## 4. Hyper Parameter 정의

In [None]:
# Backbone: timm 'resnet34' created with pretrained=True and 17 output classes.
model = timm.create_model('resnet34', pretrained=True, num_classes=17)
class Cfg():
    # Experiment settings stored as class attributes and read as Cfg.<name>.
    # Every attribute below is evaluated once, when this class body executes.

    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # The right-hand `model` is the module-level model created above: the
    # class-level name is not bound until this assignment has finished.
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    # Built from the parameters of the model that was just moved to `device`.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    num_epochs = 100
    batch_size = 32
    # Directory in which training_loop stores the checkpoint.
    model_path = '/data/ephemeral/home/upstage_cv/models'

## 5. Experiment (num_augmented=8, p=0.6, epoch=100)

In [None]:
# Destination of the augmented images and of the combined CSV.
output_folder = "/data/ephemeral/home/upstage_cv/data/aug_img/sjy"

# augraphy
# Augraphy transforms grouped by pipeline phase; each is built with p=0.6.
ink_phase = [
    Folding(p=0.6)
    ]

paper_phase = [
    ColorPaper(p=0.6),
    ColorShift(p=0.6),
    ReflectedLight(p=0.6)
]

post_phase = [
    VoronoiTessellation(p=0.6),
]
pipeline = AugraphyPipeline(ink_phase=ink_phase, paper_phase=paper_phase, post_phase=post_phase)

# Alias passed to augment_data_and_update_csv as its `aug` argument.
aug_sjy = pipeline

# albumentation
# Albumentations transforms, each built with p=0.6, followed by a resize to
# 224x224. Passed to augment_data_and_update_csv as its `alb` argument.
alb_sjy = A.Compose([
    A.HorizontalFlip(p=0.6),
    A.VerticalFlip(p=0.6),
    A.Rotate(p=0.6),
    A.GaussianBlur(p=0.6),
    A.RandomBrightnessContrast(p=0.6),
    A.HueSaturationValue(p=0.6),
    A.RandomGamma(p=0.6),
    A.ColorJitter(p=0.6),
    A.CoarseDropout(p=0.6),
    A.GaussNoise(p=0.6),
    A.Resize(224, 224),
    # NOTE(review): presumably disabled because the result is saved with
    # cv2.imwrite rather than fed to a model - confirm.
    #ToTensorV2()
])

# Writes 8 augmented copies per training image into output_folder, together
# with a CSV named csv_name listing the original and the augmented rows.
csv_name = 'augmented_sjy.csv'
augment_data_and_update_csv(input_folder, output_folder, aug_sjy, alb_sjy, original_csv_file, csv_name=csv_name, num_augmented_per_image=8)

### 원본이미지 파일 옮겨주는 작업 해야함

In [None]:
# Combined (original + augmented) label CSV written by the augmentation step.
aug_sjy_path = '/data/ephemeral/home/upstage_cv/data/aug_img/sjy/augmented_sjy.csv'
df_sjy = pd.read_csv(aug_sjy_path)
# Show the first rows as the cell output.
df_sjy.head()

## 5.2 Load Dataset

In [None]:
# Image folders for the training (augmented) and the test data.
sjy_img_path = '/data/ephemeral/home/upstage_cv/data/aug_img/sjy'
test_img_path = '/data/ephemeral/home/upstage_cv/data/test/'
# Both pipelines are identical: resize to 224x224, then convert with ToTensorV2.
totensor_transform = A.Compose([A.Resize(224, 224), ToTensorV2()])
test_transform = A.Compose([
    A.Resize(224, 224),
    ToTensorV2()
])

# The combined CSV also lists the original image IDs, and ImageDataset joins
# every ID with sjy_img_path, so the original files must be present in that
# folder as well.
train_dataset = ImageDataset(aug_sjy_path, sjy_img_path, album_transform=totensor_transform, augraphy_transform=None)
# The sample-submission CSV provides the list of test image IDs.
test_dataset = ImageDataset(submission_path, test_img_path, album_transform=test_transform, augraphy_transform=None)

print(len(train_dataset), len(test_dataset))

In [None]:
# 80/20 train/validation split. The validation size is derived from the
# remainder: truncating both parts with int() can make them sum to less than
# the dataset length, and random_split raises when the lengths do not add up.
# NOTE(review): the split is made after augmentation, so augmented copies of
# one source image can land in both subsets - confirm this is intended.
train_num = int(len(train_dataset) * 0.8)
valid_num = len(train_dataset) - train_num
train_dataset, valid_dataset = torch.utils.data.random_split(train_dataset, [train_num, valid_num])

print(len(train_dataset), len(valid_dataset))

In [None]:
# Batch loaders; only the training loader shuffles. The test loader must keep
# the CSV order because predictions are later written back row by row.
train_dataloader = DataLoader(train_dataset, batch_size=Cfg.batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=Cfg.batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=Cfg.batch_size, shuffle=False)

## 5.3 Train Model

In [None]:
# Start the tracking run. It has to exist before training because `run` is
# passed to training_loop and finished below; with this line commented out
# the cell failed with a NameError on a fresh kernel.
run = wandb.init(project='AIStage-CV', name='resnet34-Aug-sjy')

# Pull the shared settings out of Cfg.
device = Cfg.device
model = Cfg.model
criterion = Cfg.criterion
optimizer = Cfg.optimizer
num_epochs = Cfg.num_epochs
model_name = 'resnet34-aug-sjy'
model_path = Cfg.model_path

# run.watch(model, criterion, log='all', log_graph=True)

# Train with an early-stopping patience of 20 epochs.
model, valid_max_accuracy, valid_max_f1 = training_loop(model, train_dataloader, valid_dataloader, train_dataset, valid_dataset, criterion, optimizer, device, num_epochs, model_path, model_name, 20, run)

run.finish()

## Test data Inference

In [None]:
# Rebuild the architecture and restore the fine-tuned "lmj" weights.
# pretrained=False: the checkpoint overwrites every weight, so downloading
# the pretrained weights first would be wasted work.
# NOTE(review): this checkpoint is not produced in this notebook - confirm
# the file exists.
model_lmj = timm.create_model('resnet34', pretrained=False, num_classes=17)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model_lmj.load_state_dict(torch.load('/data/ephemeral/home/upstage_cv/models/model_resnet34-aug-lmj.pt', map_location=device))
model_lmj = model_lmj.to(device)
model_lmj.eval()

In [None]:
# Predict a class index for every test image with the "lmj" model.
preds_list = []

# `labels` are the values read from the sample-submission CSV and are unused.
for images, labels in tqdm(test_dataloader):
    # Device-agnostic float32 cast; `.type(torch.cuda.FloatTensor)` failed
    # whenever CUDA was unavailable.
    images = images.to(device, dtype=torch.float32)

    with torch.no_grad():
        preds = model_lmj(images)
    preds_list.extend(preds.argmax(dim=1).detach().cpu().numpy())

In [None]:
# Rebuild the submission table from the test rows and overwrite the target
# column with the predicted class indices (same row order as test_dataset).
pred_df = pd.DataFrame(test_dataset.df, columns=['ID', 'target'])
pred_df['target'] = preds_list

In [None]:
# Sanity check: the IDs must match the sample submission row by row.
sample_submission_df = pd.read_csv(submission_path)
assert (sample_submission_df['ID'] == pred_df['ID']).all()

In [None]:
# Write the "lmj" predictions to the current working directory.
pred_df.to_csv('resnet34-aug-lmj.csv', index=False)

In [None]:
# Show the first predictions as the cell output.
pred_df.head()

In [None]:
# Rebuild the architecture and restore the fine-tuned "sjy" weights.
# pretrained=False: the checkpoint overwrites every weight, so downloading
# the pretrained weights first would be wasted work.
model_sjy = timm.create_model('resnet34', pretrained=False, num_classes=17)
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model_sjy.load_state_dict(torch.load('/data/ephemeral/home/upstage_cv/models/model_resnet34-aug-sjy.pt', map_location=device))

In [None]:
# Predict a class index for every test image with the "sjy" model.
model_sjy.to(device)
# Inference must run in evaluation mode; otherwise BatchNorm uses per-batch
# statistics and keeps updating its running averages.
model_sjy.eval()
preds_list = []

# `labels` are the values read from the sample-submission CSV and are unused.
for images, labels in tqdm(test_dataloader):
    # Device-agnostic float32 cast; `.type(torch.cuda.FloatTensor)` failed
    # whenever CUDA was unavailable.
    images = images.to(device, dtype=torch.float32)

    with torch.no_grad():
        preds = model_sjy(images)
    preds_list.extend(preds.argmax(dim=1).detach().cpu().numpy())

In [None]:
# Rebuild the submission table from the test rows and overwrite the target
# column with the predicted class indices (same row order as test_dataset).
pred_df = pd.DataFrame(test_dataset.df, columns=['ID', 'target'])
pred_df['target'] = preds_list

In [None]:
# Sanity check: the IDs must match the sample submission row by row.
sample_submission_df = pd.read_csv(submission_path)
assert (sample_submission_df['ID'] == pred_df['ID']).all()

In [None]:
# Write the "sjy" predictions to the current working directory.
pred_df.to_csv('resnet34-aug-sjy.csv', index=False)

In [None]:
# Show the first predictions as the cell output.
pred_df.head()