## 1. 사용할 패키지 불러오기

In [35]:
import pandas as pd
import cv2
import os
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from torch.nn import L1Loss
from data_gen.data_gen import DatasetGenerator
import torch
import torchvision.models as models
import torch.nn as nn
import numpy as np
from tqdm import tqdm


## 2. 데이터 불러오기

### (1) Table 데이터

In [36]:
table_data = pd.read_excel('data/final_data.xlsx')
table_data.head()

Unnamed: 0,작가생존여부_사망,작가생존여부_생존,작가생존여부_알수없음,판매계절_가을,판매계절_겨울,판매계절_봄,판매계절_여름,재료_견본채색,재료_기타,재료_브론즈,...,판매처_칸옥션,판매처_케이옥션,판매처_헤럴드아트데이,가로,세로,작품 판매 횟수,판매가격,작가명,제목,이미지 고유 번호
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0.065372,0.065372,0.0,600000,임상진 Lim SangChin (1935~2013),무제,380410
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0.026429,0.026429,0.0,400000,정술원 Jung SulWon (1885~1959),화조,380460
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0.018878,0.018878,0.0,100000,정주상 Jeong JuSang (1925~2012),심정흥장 (선면),380491
3,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0.025351,0.025351,0.0,360000,이양원 Lee YangWon (1944~),풍속도,380417
4,0,1,0,1,0,0,0,0,1,0,...,0,0,0,0.019957,0.019957,0.0,240000,이외수 Lee OiSoo (1946~),사람과 사람들,380391


In [37]:
remove_files = pd.read_excel('겹치는애들.xlsx')
table_data = table_data.loc[-table_data['이미지 고유 번호'].isin(remove_files['이미지 고유 번호']), :].reset_index(drop = True)
table_data.shape

(15197, 43)

### (2) 이미지 데이터

In [38]:
folder_list = sorted(os.listdir('data/image'))[3:] + sorted(os.listdir('data/image'))[:3]
folder_list

['1월_files',
 '2월_files',
 '3월_files',
 '4월_files',
 '5월_files',
 '6월_files',
 '7월_files',
 '8월_files',
 '9월_files',
 '10월_files',
 '11월_files',
 '12월_files']

In [39]:
serial = []
image_dir = []

for folder_name in folder_list:
    file_list = os.listdir(os.path.join('data/image', folder_name))
    for fname in file_list:
        if os.path.getsize(os.path.join(os.path.join('data/image', folder_name), fname)) > 3200:
            serial.append(fname[:-4])
            image_dir.append(os.path.join(os.path.join('data/image', folder_name), fname))

image_df = pd.DataFrame({'이미지 고유 번호': serial, '이미지경로': image_dir})
image_df.head()

Unnamed: 0,이미지 고유 번호,이미지경로
0,335298,data/image/1월_files/335298.jpg
1,340721,data/image/1월_files/340721.jpg
2,357485,data/image/1월_files/357485.jpg
3,357663,data/image/1월_files/357663.jpg
4,155,data/image/1월_files/155.jpg


### (3) 합치기

In [40]:
table_data['이미지 고유 번호'] = table_data['이미지 고유 번호'].map(str)
final_data = pd.merge(table_data, image_df)
final_data.head()

Unnamed: 0,작가생존여부_사망,작가생존여부_생존,작가생존여부_알수없음,판매계절_가을,판매계절_겨울,판매계절_봄,판매계절_여름,재료_견본채색,재료_기타,재료_브론즈,...,판매처_케이옥션,판매처_헤럴드아트데이,가로,세로,작품 판매 횟수,판매가격,작가명,제목,이미지 고유 번호,이미지경로
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0.065372,0.065372,0.0,600000,임상진 Lim SangChin (1935~2013),무제,380410,data/image/11월_files/380410.jpg
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0.026429,0.026429,0.0,400000,정술원 Jung SulWon (1885~1959),화조,380460,data/image/11월_files/380460.jpg
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0.018878,0.018878,0.0,100000,정주상 Jeong JuSang (1925~2012),심정흥장 (선면),380491,data/image/11월_files/380491.jpg
3,0,1,0,1,0,0,0,0,0,0,...,0,0,0.025351,0.025351,0.0,360000,이양원 Lee YangWon (1944~),풍속도,380417,data/image/11월_files/380417.jpg
4,0,1,0,1,0,0,0,0,1,0,...,0,0,0.019957,0.019957,0.0,240000,이외수 Lee OiSoo (1946~),사람과 사람들,380391,data/image/11월_files/380391.jpg


### (4) 최종 데이터 저장

In [41]:
final_data.to_excel('final_data.xlsx', index = False, encoding = 'euc-kr')

## 3. Image 모델링

### (1) Dataset 생성

In [42]:
image_dir = final_data['이미지경로']
target = final_data['판매가격']

In [43]:
train_image_dir, test_image_dir, train_target, test_target = train_test_split(image_dir, target, train_size = 0.8, random_state = 1004)

In [44]:
train_image_dataset_generator = DatasetGenerator(list(train_image_dir), list(train_target), batch_size = 16, phase = 'train', train_valid_split = False)
train_dataloader = train_image_dataset_generator.dataloader()

test_image_dataset_generator = DatasetGenerator(list(test_image_dir), list(test_target), batch_size = 1, phase = 'test', train_valid_split = False)
test_dataloader = test_image_dataset_generator.dataloader()

### (2) Model 생성

In [45]:
model_name = 'resnet50'

In [46]:
# build model
vision_model = models.resnet50(pretrained=True)
num_ftrs = vision_model.fc.in_features
vision_model.fc = nn.Linear(num_ftrs, 1)

### (3) 학습 파라미터 지정

In [47]:
epoch = 10
learning_rate = 0.01
weight_decay = 0.0001
result_dir = './result/'

### (4) Loss, Optimizer 생성

In [48]:
# get loss function from LossFactory
loss_fn = L1Loss()

# get optimizer from OptimizerFactory
optimizer = Adam(params = vision_model.parameters(),
                lr=learning_rate,
                weight_decay = weight_decay)

### (5) 학습

In [55]:
print("{} start training!".format(model_name))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
vision_model.to(device)
min_valid_loss = np.inf

# training
for e in range(epoch):
    train_loss = 0.0
    vision_model.train()     # Optional when not using Model Specific layer
    for data in tqdm(train_dataloader['train']):
        if torch.cuda.is_available():
            images, labels = data['image'].float().to(device), data['target'].float().to(device)
        
        optimizer.zero_grad()
        target = vision_model(images)
        loss = loss_fn(target,labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() / len(images)
            
    valid_loss = 0.0
    vision_model.eval()     # Optional when not using Model Specific layer
    for data in tqdm(test_dataloader['test']):
        if torch.cuda.is_available():
            data, labels = data['image'].float().to(device), data['target'].float().to(device)
        
        target = vision_model(data)
        loss = loss_fn(target,labels)
        valid_loss = loss.item() * len(data)

    print("Epoch: {}, Training Loss: {}, Test Loss: {}".format(e+1, train_loss / len(train_dataloader['train']), valid_loss))   
    if min_valid_loss > valid_loss:
        print(f'Validation Loss Decreased({min_valid_loss:.6f}--->{valid_loss:.6f}) \t Saving The Model')
        min_valid_loss = valid_loss
        # Saving State Dict
        torch.save(vision_model.state_dict(), result_dir + 'Best_model.pth')    

resnet50 start training!


  return F.l1_loss(input, target, reduction=self.reduction)
100%|██████████| 752/752 [02:23<00:00,  5.26it/s]
  return F.l1_loss(input, target, reduction=self.reduction)
100%|██████████| 3007/3007 [00:47<00:00, 63.19it/s]


Epoch: 1, Training Loss: 414309.85263048543, Test Loss: 593444.375
Validation Loss Decreased(inf--->593444.375000) 	 Saving The Model


100%|██████████| 752/752 [03:32<00:00,  3.54it/s]
100%|██████████| 3007/3007 [01:08<00:00, 44.00it/s]


Epoch: 2, Training Loss: 406888.6644780585, Test Loss: 555908.4375
Validation Loss Decreased(593444.375000--->555908.437500) 	 Saving The Model


100%|██████████| 752/752 [02:46<00:00,  4.52it/s]
100%|██████████| 3007/3007 [00:47<00:00, 63.35it/s]


Epoch: 3, Training Loss: 406806.1075797872, Test Loss: 535421.25
Validation Loss Decreased(555908.437500--->535421.250000) 	 Saving The Model


100%|██████████| 752/752 [02:24<00:00,  5.21it/s]
100%|██████████| 3007/3007 [00:43<00:00, 69.77it/s]


Epoch: 4, Training Loss: 406917.7557674327, Test Loss: 548890.625


100%|██████████| 752/752 [02:21<00:00,  5.30it/s]
100%|██████████| 3007/3007 [00:40<00:00, 74.68it/s]


Epoch: 5, Training Loss: 407005.51945385226, Test Loss: 504229.125
Validation Loss Decreased(535421.250000--->504229.125000) 	 Saving The Model


100%|██████████| 752/752 [02:24<00:00,  5.20it/s]
100%|██████████| 3007/3007 [00:46<00:00, 64.26it/s]


Epoch: 6, Training Loss: 406802.14358014043, Test Loss: 508328.4375


100%|██████████| 752/752 [02:23<00:00,  5.24it/s]
100%|██████████| 3007/3007 [00:47<00:00, 63.51it/s]


Epoch: 7, Training Loss: 417853.99824530416, Test Loss: 512093.1875


100%|██████████| 752/752 [02:23<00:00,  5.25it/s]
100%|██████████| 3007/3007 [00:48<00:00, 62.49it/s]


Epoch: 8, Training Loss: 407391.7421381524, Test Loss: 525854.0


100%|██████████| 752/752 [02:23<00:00,  5.22it/s]
100%|██████████| 3007/3007 [00:47<00:00, 63.25it/s]


Epoch: 9, Training Loss: 406849.31311087107, Test Loss: 501584.125
Validation Loss Decreased(504229.125000--->501584.125000) 	 Saving The Model


100%|██████████| 752/752 [02:23<00:00,  5.23it/s]
100%|██████████| 3007/3007 [00:45<00:00, 65.74it/s]

Epoch: 10, Training Loss: 406944.3276273687, Test Loss: 503394.9375





### (6) Load Best Model

In [56]:
vision_model.load_state_dict(torch.load('result/Best_model.pth'))

<All keys matched successfully>

### (7) 성능 평가

In [72]:
print('start prediction')
predictions = []
vision_model.to(device)

with torch.no_grad():  
    for data in test_dataloader['test']:
        images, labels = data['image'].float().to(device), data['target'].float().to(device)
        images = images.to(device)  
        labels = labels.to(device)  
        vision_model.eval()  
        yhat = vision_model(images)  
        pred = list(yhat.cpu().numpy())
        predictions.append(pred[0][0])

start prediction


In [73]:
from sklearn.metrics import mean_squared_error, r2_score

print("RMSE: {}".format(np.sqrt(mean_squared_error(test_target, predictions))))
print("R2 Score: {}".format(r2_score(test_target, predictions)))

RMSE: 42320947.60939045
R2 Score: -0.022448756793886027
