In [None]:
# 필요한 라이브러리 설치
!pip install transformers torch accelerate
!pip install transformers
!pip install huggingface_hub
!pip install -U sentence-transformers

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import csv
from tqdm import tqdm
from huggingface_hub import login

# Log in with a Hugging Face token.
# Do not hardcode a real token here; read it from the environment or getpass.
# login(token="")  # put the actual token here if authentication is needed

# Decide the device first so the weight dtype can follow it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the causal LM checkpoint and tokenizer stored at ./hansoldeco-kogpt2
# (an earlier comment called this "Gemma 2B", which did not match the path).
model_name = "./hansoldeco-kogpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Half precision only on GPU; fall back to float32 when running on CPU
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
    device_map="auto",
)

def generate_answer(question, max_new_tokens=150):
    """Generate an answer to a single question with the loaded causal LM.

    The question is wrapped in a fixed Korean prompt that ends with "답변:",
    and the model samples a continuation (top-k 50, top-p 0.95,
    temperature 0.7).

    Args:
        question: The question text inserted into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation as a stripped string, without the prompt.
    """
    prompt = f"당신은 실내 마감재 전문가입니다. 다음 질문에 대해 간결하고 정확하게 답변해주세요.\n\n질문: {question}\n\n답변:"

    # Move the inputs to wherever the model weights actually live
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7
        )

    # Decode only the tokens generated after the prompt. Splitting the full
    # text on "답변:" would cut the answer short whenever the model itself
    # emits that marker again.
    new_tokens = outputs[0][prompt_length:]
    answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return answer

def process_csv(input_file, output_file):
    """Answer every question in a CSV file and write question/answer pairs.

    The first row of the input is treated as a header and skipped. For each
    remaining row, the second column is passed to ``generate_answer`` and the
    question and its answer are written to the output file.

    Args:
        input_file: Path of the UTF-8 CSV whose second column holds questions.
        output_file: Path of the UTF-8 CSV to create (overwritten if present).

    Raises:
        IndexError: If a non-empty row has fewer than two columns.
    """
    # newline='' on both files lets the csv module handle line endings itself,
    # including newlines embedded inside quoted fields.
    with open(input_file, 'r', encoding='utf-8', newline='') as infile, \
         open(output_file, 'w', encoding='utf-8', newline='') as outfile:

        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

        # Write the output header
        writer.writerow(['질문', '답변'])

        # Show progress with tqdm
        for row in tqdm(reader, desc="처리 중"):
            # csv.reader yields [] for blank lines; there is nothing to answer
            if not row:
                continue
            question = row[1]  # the question is in the second column
            answer = generate_answer(question)
            writer.writerow([question, answer])

# Example run: answer every question in the test CSV and save the results
output_file = './open/answers_skt.csv'   # destination for the question/answer pairs
input_file = './open/test.csv'  # source CSV containing the questions

process_csv(input_file=input_file, output_file=output_file)
print(f"처리가 완료되었습니다. 결과가 {output_file}에 저장되었습니다.")


Using device: cuda


  attn_output = torch.nn.functional.scaled_dot_product_attention(
처리 중: 130it [03:49,  1.77s/it]

처리가 완료되었습니다. 결과가 ./open/answers_skt.csv에 저장되었습니다.





In [3]:
import pandas as pd

# Read the generated question/answer CSV back in and display it for inspection
df = pd.read_csv(output_file)
df

Unnamed: 0,질문,답변
0,"방청 페인트의 종류에는 어떤 것들이 있는지 알고 계신가요? 또한, 원목사이딩을 사용...","1. **. 높은 습도가 전혀도, 상대습도가 전혀도 되어핏 않아 실내 습도가 높아질..."
1,도배지에 녹은 자국이 발생하는 주된 원인과 그 해결 방법은 무엇인가요?,"1. Preven Timid, Prevenadding Prios. Adding Pr..."
2,"큐블럭의 단점을 알려주세요. 또한, 압출법 단열판을 사용하는 것의 장점은 무엇인가요?","습도가 높은 상태에서 실내 습도가 높아지면 결로가 생기고, 이로 인해 결로가 생기게..."
3,"철골구조를 사용하는 고층 건물에서, 단열 효과를 높이기 위한 시공 방법은 무엇이 있...",건축자재로 도장되어 있는 건축자재가 실내로 침투할 때 온도가 낮은 외벽이나 샤시를 ...
4,도배지의 완전한 건조를 위해 몇 주 동안 기다려야 하나요?,"1. 내부 온도가 낮은 반면, 실내 온도가 낮은 상태에서 창호나 통풍이 잘 이루어지..."
...,...,...
125,분말 소화기를 사용할 때 주의해야 할 사항은 무엇인가요? 그리고 아파트 도배 평수를...,"일반적으로, 저온 고온도의 영향을 받습니다. 내부의 온도가 외부로부터 제대로 전달되..."
126,"압출법 보온판의 가장 큰 장점은 무엇인가요?""","1. 습도가 높은 상태에서 실내 온도가 낮은 반면, 외부 환기가 충분히 이루어지지 ..."
127,평지붕의 누수 문제를 방지하기 위해 수성 벽체용 탄성 방수 도료를 사용하는 것이 어...,경량목구조물에 비해 상대적으로 습도가 높고 내부 온도가 낮은 것이 단점으로 꼽힙니다...
128,석고수정이 발생하는 가장 큰 원인은 무엇인가요? 그리고 이를 해결하는 방법에 대해 ...,"고형 환기가 축적되어 수분이 응축되어 실내공기가 손상되는 것을 말하며, 추가로, 도..."


In [6]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer # SentenceTransformer Version 2.2.2

import os

# Number of embedding columns (vec_0 .. vec_511) written to the submission
EMBEDDING_DIM = 512

# Load the model used to extract embedding vectors
# (distiluse-base-multilingual-cased-v1). It gets its own name so it does not
# overwrite the global `model` that generate_answer relies on.
embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# Load the provided CSV files
answers_path = './open/answers_skt.csv'
sample_submission_path = './open/sample_submission.csv'

answers = pd.read_csv(answers_path)
sample_submission = pd.read_csv(sample_submission_path)

# Fail early with a readable message if the two files are not row-aligned
if len(answers) != len(sample_submission):
    raise ValueError(
        f"Row count mismatch: {len(answers)} answers vs "
        f"{len(sample_submission)} submission rows"
    )

# Replace missing answers with an empty string BEFORE casting to str;
# astype(str) first would turn NaN into the literal text 'nan'.
answers['답변'] = answers['답변'].fillna('').astype(str)

# Function to convert text to embedding
def get_embedding(text):
    """Return the sentence embedding of ``text`` from the embedding model."""
    return embedding_model.encode(text)

# Convert each answer into a 512-dimensional embedding vector
answers['embedding'] = answers['답변'].apply(get_embedding)

# Create a DataFrame for the embeddings and ensure EMBEDDING_DIM columns
embedding_df = pd.DataFrame(answers['embedding'].to_list(), index=sample_submission.index)
embedding_df = embedding_df.reindex(columns=range(EMBEDDING_DIM), fill_value=0)

# Combine with the sample_submission template
for i in range(EMBEDDING_DIM):
    sample_submission[f'vec_{i}'] = embedding_df[i]

# Save the result to a new CSV file, creating the target directory if needed
output_path = './final/sample_submission_with_embeddings_trained.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
sample_submission.to_csv(output_path, index=False)

print("Embeddings have been successfully saved to:", output_path)


Embeddings have been successfully saved to: ./sample_submission_with_embeddings_trained.csv


In [7]:
# Display the submission frame with its vec_0 .. vec_511 embedding columns
sample_submission

Unnamed: 0,id,vec_0,vec_1,vec_2,vec_3,vec_4,vec_5,vec_6,vec_7,vec_8,...,vec_502,vec_503,vec_504,vec_505,vec_506,vec_507,vec_508,vec_509,vec_510,vec_511
0,TEST_000,-0.012852,-0.053077,-0.022830,-0.027986,0.069579,-0.029558,0.035734,0.013880,0.046029,...,0.006788,-0.001274,0.054151,-0.042950,-0.032432,0.041900,-0.005628,-0.015396,-0.061844,0.019704
1,TEST_001,0.015817,0.015109,0.015257,-0.014393,-0.018782,-0.022700,0.029730,0.056620,-0.001951,...,0.005766,-0.024597,-0.047899,-0.016512,-0.020101,0.033342,0.002495,-0.023403,-0.069121,0.012123
2,TEST_002,-0.004274,-0.003734,-0.040072,-0.004422,0.096414,0.008297,0.042701,0.015695,-0.002080,...,-0.030583,-0.007550,0.021147,-0.001045,-0.029016,0.070291,-0.039460,-0.015843,0.000111,0.007454
3,TEST_003,-0.000282,-0.002140,-0.032500,0.001613,0.100870,-0.056339,0.004033,-0.001380,-0.030309,...,-0.006670,-0.035567,0.012202,-0.028831,0.021443,0.063923,-0.022734,-0.038522,-0.020973,0.009925
4,TEST_004,-0.008516,-0.034382,-0.002047,-0.031631,0.076090,-0.078046,0.004191,0.054888,0.063646,...,-0.008363,-0.008381,0.025540,-0.030937,-0.006132,0.048394,-0.038231,-0.020651,-0.039662,0.021891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,TEST_125,0.011460,-0.048257,-0.023042,-0.020692,0.116127,-0.011912,0.020111,0.043532,0.047957,...,-0.050536,-0.033430,0.045998,-0.019503,0.025994,0.064866,-0.002009,-0.050631,0.040195,-0.003825
126,TEST_126,-0.010732,-0.046890,-0.028508,-0.034637,0.083614,-0.051073,0.036264,0.053852,0.067931,...,-0.010751,-0.011286,0.032434,-0.017250,-0.015150,0.072632,-0.015466,-0.038634,-0.029835,0.064603
127,TEST_127,0.018194,-0.043111,0.050815,-0.020248,0.108433,0.028595,-0.006617,0.009889,0.005032,...,-0.017238,0.000355,0.006720,-0.004657,0.001050,0.052912,-0.026430,0.005408,-0.026637,0.020385
128,TEST_128,-0.030450,-0.012355,-0.030267,-0.010882,0.099009,-0.014335,0.053318,0.007101,0.053885,...,-0.029538,-0.011564,0.063815,0.007177,-0.030805,0.039701,0.007489,0.036237,-0.001153,0.070288
