In [3]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast
import torch
from torch.optim import AdamW

  from tqdm.autonotebook import tqdm, trange


In [12]:
import os

# 현재 작업 디렉토리 확인
current_directory = os.getcwd()
print(f"현재 작업 디렉토리: {current_directory}")

# 현재 디렉토리의 파일 및 폴더 목록 확인
files_and_folders = os.listdir(current_directory)
print(f"현재 디렉토리의 파일 및 폴더: {files_and_folders}")


현재 작업 디렉토리: e:\OneDrive\MY CODE SPACE\2024_dacon_hansol
현재 디렉토리의 파일 및 폴더: ['open', 'ref_code.ipynb', 'test_code.ipynb']


In [None]:
#COLAB
from google.colab import drive
drive.mount('/content/drive')
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/2024_dacon_hansol/open/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/2024_dacon_hansol/open/test.csv')
train.info()

In [5]:
#local
train = pd.read_csv('./open/train.csv')
test = pd.read_csv('./open/test.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 644 entries, 0 to 643
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        644 non-null    object
 1   질문_1      644 non-null    object
 2   질문_2      644 non-null    object
 3   category  644 non-null    object
 4   답변_1      644 non-null    object
 5   답변_2      644 non-null    object
 6   답변_3      644 non-null    object
 7   답변_4      644 non-null    object
 8   답변_5      644 non-null    object
dtypes: object(9)
memory usage: 45.4+ KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      130 non-null    object
 1   질문      130 non-null    object
dtypes: object(2)
memory usage: 2.2+ KB


In [7]:
data = train
# Baseline 은 한국어 모델의 kogpt2
tokenizer = PreTrainedTokenizerFast.from_pretrained('skt/kogpt2-base-v2', eos_token='</s>')


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [8]:
#질문과 답변을 연결해주는 전처리
formatted_data = []
for _, row in tqdm(data.iterrows()):
    for q_col in ['질문_1', '질문_2']:
        for a_col in ['답변_1', '답변_2', '답변_3', '답변_4', '답변_5']:
            # 질문과 답변 쌍을 </s> token으로 연결
            input_text = row[q_col] + tokenizer.eos_token + row[a_col]
            input_ids = tokenizer.encode(input_text, return_tensors='pt')
            formatted_data.append(input_ids)
print('Done.')

644it [00:01, 467.60it/s]

Done.





In [9]:
import torch

torch.__version__


'2.3.1+cu121'

In [10]:
# GPU 설정 (가능한 경우 GPU 또는 MPS 사용)
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [12]:
# 모델 로드
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
model.to(device) # 모델을 GPU단으로 이동

# 모델 학습 하이퍼파라미터(Hyperparameter) 세팅
CFG = {
    'LR' : 2e-5, # Learning Rate
    'EPOCHS' : 10, # 학습 Epoch
}

# 모델 학습 설정
optimizer = AdamW(model.parameters(), lr=CFG['LR'])
model.train()

# 모델 학습
for epoch in range(CFG['EPOCHS']):
    total_loss = 0
    progress_bar = tqdm(enumerate(formatted_data), total=len(formatted_data))
    for batch_idx, batch in progress_bar:
        # 데이터를 GPU로 이동
        batch = batch.to(device)
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # 진행률 표시줄에 평균 손실 업데이트
        progress_bar.set_description(f"Epoch {epoch+1} - Avg Loss: {total_loss / (batch_idx+1):.4f}")

    # 에폭의 평균 손실을 출력
    print(f"Epoch {epoch+1}/{CFG['EPOCHS']}, Average Loss: {total_loss / len(formatted_data)}")

# 모델 저장
model.save_pretrained("./hansoldeco-kogpt2")
tokenizer.save_pretrained("./hansoldeco-kogpt2")

Epoch 1 - Avg Loss: 2.8393: 100%|██████████| 6440/6440 [11:12<00:00,  9.58it/s]


Epoch 1/10, Average Loss: 2.8392996145701557


Epoch 2 - Avg Loss: 1.7450: 100%|██████████| 6440/6440 [10:33<00:00, 10.17it/s]


Epoch 2/10, Average Loss: 1.7450216361963602


Epoch 3 - Avg Loss: 1.1245: 100%|██████████| 6440/6440 [21:08<00:00,  5.08it/s]


Epoch 3/10, Average Loss: 1.124494837826083


Epoch 4 - Avg Loss: 0.7624: 100%|██████████| 6440/6440 [19:30<00:00,  5.50it/s]


Epoch 4/10, Average Loss: 0.7624240437117608


Epoch 5 - Avg Loss: 0.5385: 100%|██████████| 6440/6440 [22:21<00:00,  4.80it/s]


Epoch 5/10, Average Loss: 0.5385020866562028


Epoch 6 - Avg Loss: 0.4000: 100%|██████████| 6440/6440 [17:20<00:00,  6.19it/s]


Epoch 6/10, Average Loss: 0.400041737689735


Epoch 7 - Avg Loss: 0.3180: 100%|██████████| 6440/6440 [23:26<00:00,  4.58it/s]


Epoch 7/10, Average Loss: 0.31795230985486084


Epoch 8 - Avg Loss: 0.2653: 100%|██████████| 6440/6440 [10:24<00:00, 10.32it/s]


Epoch 8/10, Average Loss: 0.2652712462990551


Epoch 9 - Avg Loss: 0.2301: 100%|██████████| 6440/6440 [09:20<00:00, 11.49it/s]


Epoch 9/10, Average Loss: 0.23014652467165025


Epoch 10 - Avg Loss: 0.2059: 100%|██████████| 6440/6440 [09:21<00:00, 11.46it/s]


Epoch 10/10, Average Loss: 0.2058892823518572


('./hansoldeco-kogpt2\\tokenizer_config.json',
 './hansoldeco-kogpt2\\special_tokens_map.json',
 './hansoldeco-kogpt2\\tokenizer.json')

In [13]:
# 저장된 Fine-tuned 모델과 토크나이저 불러오기
model_dir = "./hansoldeco-kogpt2"
model = GPT2LMHeadModel.from_pretrained(model_dir)
model.to(device)
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_dir)



# test.csv의 '질문'에 대한 '답변'을 저장할 리스트
preds = []

# '질문' 컬럼의 각 질문에 대해 답변 생성
for test_question in tqdm(test['질문']):
    # 입력 텍스트를 토큰화하고 모델 입력 형태로 변환
    input_ids = tokenizer.encode(test_question + tokenizer.eos_token, return_tensors='pt')

    # 답변 생성
    output_sequences = model.generate(
        input_ids=input_ids.to(device),
        max_length=300,
        temperature=0.9,
        top_k=1,
        top_p=0.9,
        repetition_penalty=1.2,
        do_sample=True,
        num_return_sequences=1
    )

    # 생성된 텍스트(답변) 저장
    for generated_sequence in output_sequences:
        full_text = tokenizer.decode(generated_sequence, skip_special_tokens=False)
        # 질문과 답변의 사이를 나타내는 eos_token (</s>)를 찾아, 이후부터 출력
        answer_start = full_text.find(tokenizer.eos_token) + len(tokenizer.eos_token)
        answer_only = full_text[answer_start:].strip()
        answer_only = answer_only.replace('\n', ' ')
        preds.append(answer_only)

100%|██████████| 130/130 [05:46<00:00,  2.67s/it]


In [14]:

# Embedding Vector 추출에 활용할 모델(distiluse-base-multilingual-cased-v1) 불러오기
model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# 생성한 모든 응답(답변)으로부터 Embedding Vector 추출
pred_embeddings = model.encode(preds)
print(pred_embeddings.shape)


(130, 512)


In [15]:
#코랩용
submit = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/2024_dacon_hansol/open/sample_submission.csv')
# 제출 양식 파일(sample_submission.csv)을 활용하여 Embedding Vector로 변환한 결과를 삽입
submit.iloc[:,1:] = pred_embeddings
submit.head()
# 리더보드 제출을 위한 csv파일 생성
submit.to_csv('/content/drive/MyDrive/Colab Notebooks/2024_dacon_hansol/open/baseline_submit_epoch20.csv', index=False)


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Colab Notebooks/2024_dacon_hansol/open/sample_submission.csv'

In [16]:
#로컬용
submit = pd.read_csv('open/sample_submission.csv')
# 제출 양식 파일(sample_submission.csv)을 활용하여 Embedding Vector로 변환한 결과를 삽입
submit.iloc[:,1:] = pred_embeddings
submit.head()
# 리더보드 제출을 위한 csv파일 생성
submit.to_csv('open/baseline_submit_epoch20.csv', index=False)


  0.00123959  0.03049644 -0.03606354  0.03419179  0.00671222  0.01496738
  0.02259492 -0.00737623 -0.01161897  0.04312626  0.01172191 -0.00900819
 -0.00025256  0.01236259  0.02048203 -0.02315157 -0.04601259 -0.0371641
 -0.01039002  0.01948163  0.02356949 -0.00234097  0.017515   -0.00101012
  0.00126625 -0.02976476  0.00319811  0.01154732 -0.01327816 -0.04512741
 -0.01795852  0.03158206  0.05358488  0.00321818  0.05591501 -0.0546479
  0.01701197  0.05293179 -0.00119491 -0.01939138  0.00974781 -0.0084662
  0.01623422  0.00324985 -0.01799241  0.00464211  0.01095351 -0.00105947
  0.06814542 -0.04384268  0.02478046  0.05616709  0.01891069  0.00928531
 -0.03993795  0.00323683  0.02681037 -0.0087958   0.02054332 -0.02289689
 -0.00302444  0.01331899 -0.01079819 -0.03447196  0.01204421  0.03895477
  0.05972609 -0.02540491  0.01501654 -0.01737982  0.01588927 -0.01534529
  0.00566559 -0.00852909 -0.03816444 -0.04718981  0.02703209 -0.00847814
 -0.07210767 -0.0105179   0.02694749  0.02648474 -0.00

In [21]:
# 샘플에 대한 Cosine Similarity 산식
def cosine_similarity(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b) if norm_a != 0 and norm_b != 0 else 0
gts = train['답변_1'].tolist()
questions = train['질문_1'].tolist()  # '질문_1'을 질문으로 사용

sample_scores = []
for question, pred, gt in zip(questions, preds, gts):
    # 생성된 답변 내용을 512 Embedding Vector로 변환
    pred_embed = model.encode(pred)
    gt_embed = model.encode(gt)
    
    sample_score = cosine_similarity(gt_embed, pred_embed)
    # Cosine Similarity Score가 0보다 작으면 0으로 간주
    sample_score = max(sample_score, 0)
    print('질문 : ', question)
    print('예측 : ', pred)
    print('정답 : ', gt)
    print('Cosine Similarity Score : ', sample_score)
    print('-'*20)
    sample_scores.append(sample_score)

print('전체 샘플의 Cosine Similarity Score 평균 : ', np.mean(sample_scores))

질문 :  면진장치가 뭐야?
예측 :  방청페인트는 주로 두 가지 종류가 있습니다. 첫 번째로, 건물의 구조를 신중하게 선택하여 부주의한 시공과 온도차를 고려해야 합니다. 둘째로, 기존 페인트에 비해 높은 품질감을 제공한다는 점을 고려해야 합니다. 이러한 특성으로 인해 방청페인트는 건물 내부의 온도와 습도를 안정적으로 유지하는 데 중요한 역할을 합니다. 따라서 신중한 시공이 필요합니다. 추가적으로, 각 요소들을 고려하여 적절한 대책을 마련하는 것이 중요합니다. 예를 들어, 건물의 특성과 상황에 맞게 적합한 페인트를 선택하는 것이 중요합니다. 마지막으로, 원하는 분위기와 스타일의 재료를 활용하여 보수 및 수정이 용이해야 합니다. 이러한 페인트와 관련된 조언들을 참고하여 계획을 세워야 합니다. 최종적으로 페인트가 계획되어 있지 않은 각종 덕트와 접한 부분이니 전문가의 조언을 잘 숙지하는 것을 추천드립니다. 올바른 페인트와 습도 조절과 온도차가 가능한 환경을 조성하기 위해서는 몇 가지 요령이 있습니다. 적절한 대책은 공간을 최대한 활용하면서도 부드럽고 안정적인 분위기를 조성할 수 있도록 신중히 고려해 보시는 것이 중요합니다. 합리적인 조건에서의 적절한 조치를 취하여 페인트를 사용하는 것의 품질을 높이고자 하는 것이 중요합니다. 올바른 페인트와 습도 관리 방법을 이해하고 적용하여 건강한 실내환경을 유지할 수 있도록 유의하시기 바랍니다. 올바른 페인트와 습도와 온도를 유지함으로써 무드를 최대한 예방할 수 있으니 주의하셔야 합니다. 또한, 필요한 경우 전문가의 조언을 듣는 것을 권장합니다. 올바른 페인트를 사용하여 적정 온도는 어느 정도입니다. 적절한 환기를 유지
정답 :  면진장치란 지반에서 오는 진동 에너지를 흡수하여 건물에 주는 진동을 줄여주는 진동 격리장치입니다.
Cosine Similarity Score :  0.16459677
--------------------
질문 :  내진설계의 종류 좀 알려줘
예측 :  실내 습도가 높은 경우, 먼저, 제습시설을 사용하