In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import random
import torch
import os
import re

def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU plus every visible CUDA device), and pins cuDNN to
    deterministic, non-benchmarking behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation is fixed when the interpreter starts, so this only
    # influences subprocesses launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one; a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode may pick a different convolution algorithm from run to run.
    torch.backends.cudnn.benchmark = False

# Single seed value for the notebook; applied once here so every later cell
# starts from the same RNG state after a fresh kernel run.
SEED = 42
reset_seeds(SEED)

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import pairwise_distances

# Shared SentenceTransformer instance. It is bound as the default `bert_model`
# argument of evaluate_similarity, so it must exist before that function is defined.
bert_model = SentenceTransformer('jhgan/ko-sroberta-multitask')

def evaluate_similarity(question, answer, bert_model=bert_model, alpha=0.95): # alpha tunes the weight of the cosine term
    """Score how similar two texts are as a blend of cosine and Jaccard similarity.

    Args:
        question: First text.
        answer: Second text.
        bert_model: Object whose ``encode(text)`` returns an embedding vector.
        alpha: Weight of the embedding cosine similarity; the token-level
            Jaccard similarity receives ``1 - alpha``.

    Returns:
        ``alpha * cosine + (1 - alpha) * jaccard``.
    """
    input_embedding = bert_model.encode(question)
    generated_embedding = bert_model.encode(answer)

    # Cosine similarity between the two sentence embeddings
    cosine_sim = 1 - pairwise_distances([input_embedding], [generated_embedding], metric='cosine')[0][0]

    # Jaccard similarity between the whitespace-separated token sets
    input_tokens = set(question.split())
    generated_tokens = set(answer.split())
    all_tokens = input_tokens | generated_tokens
    if all_tokens:
        jaccard_sim = len(input_tokens & generated_tokens) / len(all_tokens)
    else:
        # Both texts are empty or whitespace-only: there is no token evidence,
        # so score the overlap as 0 instead of dividing by zero.
        jaccard_sim = 0.0

    # Weighted average of the two measures
    weighted_sim = alpha*cosine_sim + (1 - alpha)*jaccard_sim

    return weighted_sim

In [None]:
# Load the training table that the rest of the notebook cleans in place
# (presumably the GPT-augmented data, judging by the file name).
train = pd.read_csv('train_gpt.csv')

# Print the column list, non-null counts and dtypes of the loaded frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 644 entries, 0 to 643
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        644 non-null    object
 1   category  644 non-null    object
 2   질문_1      644 non-null    object
 3   질문_2      644 non-null    object
 4   질문_3      644 non-null    object
 5   질문_4      644 non-null    object
 6   답변_1      644 non-null    object
 7   답변_2      644 non-null    object
 8   답변_3      644 non-null    object
 9   답변_4      644 non-null    object
 10  답변_5      644 non-null    object
 11  답변_6      644 non-null    object
 12  답변_7      644 non-null    object
 13  답변_8      644 non-null    object
 14  답변_9      644 non-null    object
 15  답변_10     644 non-null    object
dtypes: object(16)
memory usage: 80.6+ KB


#### 간단한 전처리 - 질문

In [None]:
def _ends_with_any(texts, suffixes):
    """Return a boolean mask that is True where a string ends with any of `suffixes`."""
    mask = texts.str.endswith(suffixes[0])
    for suffix in suffixes[1:]:
        mask = mask | texts.str.endswith(suffix)
    return mask


# Remove double and single quotes from the GPT-generated questions, then trim whitespace
for gpt_question_col in ('질문_3', '질문_4'):
    unquoted = train[gpt_question_col].str.replace('"', '')
    unquoted = unquoted.str.replace("'", '')
    train[gpt_question_col] = unquoted.str.strip()

# List GPT "questions" that end like a statement but not with one of the
# request-style endings, side by side with the question they were derived from
statement_endings = ('다.', '요.')
request_endings = ('주세요.', '싶습니다.', '궁금합니다.')

question_3 = train.loc[_ends_with_any(train['질문_3'], statement_endings), ['질문_1', '질문_3']]
question_3 = question_3.loc[~_ends_with_any(question_3['질문_3'], request_endings)]

question_4 = train.loc[_ends_with_any(train['질문_4'], statement_endings), ['질문_2', '질문_4']]
question_4 = question_4.loc[~_ends_with_any(question_4['질문_4'], request_endings)]

# Overwrite questions with versions regenerated by GPT. Each CSV supplies the
# target row label (`index`) and the replacement text (presumably for the
# statement-like sentences identified earlier in this cell).
replace_question_3 = pd.read_csv('질문_3_replace.csv')
replace_question_4 = pd.read_csv('질문_4_replace.csv')

# Assign with one .loc call. The chained form train[col][label] = value writes
# through an intermediate Series: it raises SettingWithCopyWarning and leaves
# `train` untouched when pandas copy-on-write is active.
for row_label, new_question in zip(replace_question_3['index'], replace_question_3['질문_3_replace']):
    train.loc[row_label, '질문_3'] = new_question

for row_label, new_question in zip(replace_question_4['index'], replace_question_4['질문_4_replace']):
    train.loc[row_label, '질문_4'] = new_question

# The pros/cons wording of rows 67, 139, 313 and 319 was corrected by hand.
# NOTE(review): they are skipped below on the assumption that those hand edits
# are already in the data and must not be rewritten automatically - confirm.
hand_fixed_rows = {67, 139, 313, 319}

# Fix questions that use the opposite word from 질문_1
# (장점 = advantage, 단점 = disadvantage).
# Values are read from and written to `train` directly: the row objects yielded
# by iterrows() may be copies, so assigning to them does not reliably change the frame.
# NOTE(review): '장단점' ("pros and cons") contains '단점' as a substring, so a
# 질문_1 that asks about both is treated as asking about 단점 only - confirm
# no such rows remain outside the hand-fixed ones.
for idx in train.index:
    if idx in hand_fixed_rows:
        continue
    reference_question = train.at[idx, '질문_1']
    for expected_word, wrong_word in (('장점', '단점'), ('단점', '장점')):
        # Re-read 질문_2 on each pass so the second check sees the first correction.
        question_2 = train.at[idx, '질문_2']
        if expected_word in reference_question and wrong_word in question_2:
            train.at[idx, '질문_2'] = question_2.replace(wrong_word, expected_word)
            # Correct 질문_4 in its own wording; the previous code replaced it
            # with the text of 질문_2, discarding the paraphrase.
            train.at[idx, '질문_4'] = train.at[idx, '질문_4'].replace(wrong_word, expected_word)

#### 질문_1과 다른 질문들과의 유사도 비교를 통해 0.6 미만인 질문 확인 후 수정

In [None]:
# Collect every question (질문_2..질문_4) whose similarity to 질문_1 is below the threshold
similarity_threshold = 0.6
low_sim_columns = ['index', 'question', 'score', '질문_1', '비교', '답변']

# Accumulate plain records and build the frame once; concatenating a one-row
# DataFrame per hit copies the whole frame each time (quadratic).
low_sim_records = []
for idx, row in tqdm(train.iterrows(), total=train.shape[0]):
    for i in range(2, 5):
        score = evaluate_similarity(row['질문_1'], row[f'질문_{i}'])
        if score < similarity_threshold:
            low_sim_records.append({'index': idx,
                                    'question': f'질문_{i}',
                                    'score': score,
                                    '질문_1': row['질문_1'],
                                    # Label prefix is stripped again when the edited sheet is applied
                                    '비교': f'질문_{i} : ' + row[f'질문_{i}'],
                                    # NOTE(review): always 답변_2, whichever question was compared - confirm
                                    '답변': row['답변_2']})

# Passing `columns` keeps the schema stable even when nothing falls below the threshold
tmp = pd.DataFrame(low_sim_records, columns=low_sim_columns)
# Export to CSV, then edit the questions by hand in Excel. Left commented out so
# a re-run does not overwrite the edited file.
# tmp.to_csv('row_sim_question.csv', index=False, encoding='utf-8-sig')

In [None]:
# Write the hand-edited questions from the review sheet back into `train`
row_sim_que_df = pd.read_csv('row_sim_question.csv')
for _, row in row_sim_que_df.iterrows():
    # '비교' holds '질문_<i> : <question>'. Remove that label by pattern rather
    # than with a fixed 7-character slice, so a label that was altered or
    # deleted during manual editing cannot cut into the question text.
    edited_question = re.sub(r'^\s*질문_\d+\s*:\s*', '', row['비교'])
    train.loc[row['index'], row['question']] = edited_question

#### 간단한 전처리 - 답변

In [None]:
# Remove double and single quotes from the answer columns 답변_6..답변_10, then trim whitespace
gpt_answer_cols = [f'답변_{n}' for n in range(6, 11)]
for answer_col in gpt_answer_cols:
    unquoted_answers = train[answer_col].str.replace('"', '')
    unquoted_answers = unquoted_answers.str.replace("'", '')
    train[answer_col] = unquoted_answers.str.strip()

In [None]:
def _strip_list_marker(line):
    """Remove leading bullets ('-') and list numbering ('1.' / '1)') from one answer line.

    Only markers at the start of the line are removed, so hyphens and numbers
    inside the sentence are preserved.
    """
    return re.sub(r'^\s*(?:-\s*|[0-9]+[.)]\s+)+', '', line).strip()


# Find generated answers (답변_6..답변_10) that contain several alternative
# answers: the source answer (답변_1..답변_5) is a single line, the generated one
# is multi-line, and its first two lines are near-duplicates of each other.
multi_answer_threshold = 0.9
multi_answer_records = []
for idx, row in tqdm(train.iterrows(), total=train.shape[0]):
    for i in range(1, 6):
        original_answer = row[f'답변_{i}']
        generated_answer = row[f'답변_{i+5}']
        if '\n' in original_answer or '\n' not in generated_answer:
            continue
        # A newline is present, so the split always yields at least two parts
        first_line, second_line = generated_answer.split('\n')[:2]
        if evaluate_similarity(first_line, second_line) >= multi_answer_threshold:
            multi_answer_records.append({'index': idx,
                                         'column': f'답변_{i+5}',
                                         '원본답변': original_answer,
                                         '답변': generated_answer})

# Build the frame once instead of concatenating a one-row frame per hit
tmp = pd.DataFrame(multi_answer_records, columns=['index', 'column', '원본답변', '답변'])

# From each multi-answer cell keep only the line most similar to the original answer
best_lst = []
for _, row in tmp.iterrows():
    best_score = float('-inf')
    # Fallback keeps the existing text, so an answer is never replaced by an empty string
    best_answer = row['답변']
    for line in row['답변'].split('\n'):
        candidate = _strip_list_marker(line)
        if not candidate:
            continue  # blank separator lines are not candidates
        score = evaluate_similarity(row['원본답변'], candidate)
        if score > best_score:
            best_score = score
            best_answer = candidate
    best_lst.append(best_answer)

tmp['수정답변'] = best_lst
for row_label, column, fixed_answer in zip(tmp['index'], tmp['column'], tmp['수정답변']):
    train.loc[row_label, column] = fixed_answer

  0%|          | 0/644 [00:00<?, ?it/s]

In [None]:
# Save the cleaned table without the row index. 'utf-8-sig' prepends a BOM
# (presumably so the Korean text opens correctly in Excel).
train.to_csv('train_final.csv', index=False, encoding='utf-8-sig')