In [1]:
## 모듈 로드
import pandas as pd
from konlpy.tag import Okt
from collections import Counter
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import optim
from reviewclassifiermodel import reviewClassifierModel
from torch import nn

import os
import json
import re
import string

import pickle

In [2]:
# 데이터 로드함수
def load_data(csvfile1,csvfile2):                           # csv 파일 읽기
    trainDF = pd.read_csv(csvfile1, usecols=[1, 2, 4])      # 필요한 컬럼 추출
    testDF = pd.read_csv(csvfile2, usecols=[1, 2, 4])
    return trainDF, testDF                                  # 리턴


In [3]:
# 데이터 인코딩 함수
def data_encoding(DF):
    labelCD = DF.Aspect.unique().tolist()                   # Aspect 컬럼의 유니크 값 리스트 
    DF['Aspect'] = DF['Aspect'].map(lambda x: labelCD.index(x))         # 다중 분류 라벨링 인코딩
    DF.loc[DF['SentimentPolarity'] == -1, 'SentimentPolarity'] = 0      # 2진 분류 인코딩
    return DF, labelCD


In [4]:
# 단어사전 만드는 함수
def build_vocab(corpus, n_vocab, special_tokens):
    counter = Counter()                                     # Counter 인스턴스 생성
    for tokens in corpus:                                   # 입력받은 corpus로 카운터 모델 초기화
        counter.update(tokens)
    vocab = special_tokens.copy()                           
    for token, count in counter.most_common(n_vocab):       # 상위 중복 언어 단어사전에 추가
        vocab.append(token)
    return vocab


In [5]:
# 패딩함수
def pad_sequences(sequences, max_length, pad_value):
    result = list()
    for sequence in sequences:                              
        sequence = sequence[:max_length]                    # max_length 만큼 자르기
        pad_length = max_length - len(sequence)             # max_length보다 단어가 적다면
        padded_sequence = sequence + [pad_value] * pad_length   # 정해진 수 채우기
        result.append(padded_sequence)
    return np.asarray(result)


In [6]:
# 자연어 인코딩 함수
def encoding_ids(token_to_id, tokens, unk_id):
    return [
        [token_to_id.get(token, unk_id) for token in review] for review in tokens
    ]   # 자연어 정수화


In [7]:
# 학습함수
def model_train(model, datasets, cl_criterion, bn_criterion, optimizer, device, interval):
    model.train()
    losses = []

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids.to(device)                    # 인풋데이터
        cl_labels = labels[:, 0].to(device)                 # 라벨 다중분류
        bn_labels = labels[:, 1].to(device).float()         # 라벨 2진분류  (float형)

        # Forward pass
        classesd, logits = model(input_ids)

        # Calculate losses
        loss_cl = cl_criterion(classesd, cl_labels)         # 
        loss_bn = bn_criterion(logits.squeeze(), bn_labels) # 
        loss = loss_cl + loss_bn
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % interval == 0:
            print(f'Train Loss {step} : {np.mean(losses)}')


In [8]:
def save_vocab(vocab,save_file):
    # vocab 저장
    with open(save_file, 'wb') as f:
        pickle.dump(vocab, f)

In [9]:
# test 함수

def model_test(model, datasets, cl_criterion, bn_criterion, device, epoch, results_df):
    model.eval()
    losses = []
    cl_score = []
    bn_score = []

    with torch.no_grad(): 
        for step, (input_ids, labels) in enumerate(datasets):
            input_ids = input_ids.to(device)
            cl_labels = labels[:, 0].to(device).long()
            bn_labels = labels[:, 1].to(device).float() 

            # Forward pass
            classesd, logits = model(input_ids)

            # Calculate losses
            loss_cl = cl_criterion(classesd, cl_labels)
            loss_bn = bn_criterion(logits.squeeze(), bn_labels)
            loss = loss_cl + loss_bn
            losses.append(loss.item())
            
            # Calculate class accuracy
            cl_predictions = torch.argmax(torch.softmax(classesd, dim=1), dim=1)  # 다중 클래스 예측
            cl_score.extend(cl_predictions.eq(cl_labels).cpu().numpy())  # 정확도 계산
            
            # Calculate binary accuracy
            bn_predictions = (torch.sigmoid(logits) > 0.5).int().squeeze()  # 이진 예측
            bn_score.extend(bn_predictions.eq(bn_labels.int()).cpu().numpy())  # 정확도 계산
        
        # 정확도 계산
        cl_accuracy = np.mean(cl_score)
        bn_accuracy = np.mean(bn_score)
        
        print(f'Epoch {epoch} - Val Loss: {np.mean(losses)}, bn_score Val Accuracy: {bn_accuracy}, cl_score Val Accuracy: {cl_accuracy}')

        # 결과를 DataFrame에 추가
        results_df.loc[epoch] = [np.mean(losses), bn_accuracy, cl_accuracy]  # 각 에포크의 결과를 DataFrame에 추가


In [16]:
# 실행함수
def main():
    N_VOCAB = 5000
    MAX_LENGTH = 38
    EPOCHS = 30
    INTERVAL = 500
    BATCH_SIZE = 8
    LR = 0.001
    special_tokens = ['<pad>', '<unk>']

    trainDF, testDF = load_data('./data/train.csv','./data/test.csv')

    trainDF, aspectCD = data_encoding(trainDF)
    testDF, _ = data_encoding(testDF)
    print(aspectCD)

    ## 토큰화 및 불용어 처리 ------------------------------------------------------------------------------------------------------------
    punc=string.punctuation

    for p in punc:
        trainDF['SentimentText'] = trainDF['SentimentText'].str.replace(p, '')
        testDF['SentimentText']=testDF['SentimentText'].str.replace(p,'')

    m=re.compile('[^ ㄱ-ㅣ가-힣]+')     # 한글만 남김

    trainDF['SentimentText']=trainDF['SentimentText'].apply(lambda x: m.sub(' ', x))
    testDF['SentimentText']=testDF['SentimentText'].apply(lambda x: m.sub(' ', x))

    stop_word='./data/stopwords.txt'    # 잘 사용하지 않는 단어 

    with open(stop_word, 'r', encoding='utf-8') as f:
        stop_words = [line.strip() for line in f]
    

    tokenizer = Okt()
    train_tokens = [[token for token in tokenizer.morphs(text) if token not in stop_words] for text in trainDF['SentimentText']]
    test_tokens = [[token for token in tokenizer.morphs(text) if token not in stop_words] for text in testDF['SentimentText']]
    # -------------------------------------------------------------------------------------------------------------------------------
    
    
    # -------------------------------------------------------------------------------------------------------------------------------


    vocab = build_vocab(train_tokens, N_VOCAB, special_tokens)
    token_to_id = {token: idx for idx, token in enumerate(vocab)}
    id_to_token = {idx: token for idx, token in enumerate(vocab)}
    save_vocab(vocab,'./data/cosmetic_vocab.pkl')


    # pad_id = token_to_id['<pad>']
    # unk_id = token_to_id['<unk>']
    # train_ids = encoding_ids(token_to_id, train_tokens, unk_id)     #정수화
    # test_ids = encoding_ids(token_to_id, test_tokens, unk_id)       #정수화
    # train_ids = pad_sequences(train_ids, MAX_LENGTH, pad_id)
    # test_ids = pad_sequences(test_ids, MAX_LENGTH, pad_id)

    # # 텐서화
    # train_ids = torch.tensor(train_ids, dtype=torch.long)
    # test_ids = torch.tensor(test_ids, dtype=torch.long)

    # # 레이블 텐서화
    # train_labels = torch.tensor(list(zip(trainDF['Aspect'].values, trainDF['SentimentPolarity'].values)), dtype=torch.long)
    # test_labels = torch.tensor(list(zip(testDF['Aspect'].values, testDF['SentimentPolarity'].values)), dtype=torch.float32)

    # # 데이터셋 생성
    # train_dataset = TensorDataset(train_ids, train_labels)
    # test_dataset = TensorDataset(test_ids, test_labels)

    # # 데이터 로더 생성
    # train_loader = DataLoader(train_dataset, BATCH_SIZE, shuffle=True)
    # test_loader = DataLoader(test_dataset, BATCH_SIZE, shuffle=False)

    # # 모델 초기화
    # n_vocab = len(token_to_id)  # 어휘 크기 계산
    # hidden_dim = 64 
    # embedding_dim = 128
    # n_layers = 2
    # device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # classifier = reviewClassifierModel(
    #     n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim, n_classes=len(aspectCD), n_layers=n_layers
    # ).to(device)

    # # 손실 함수 및 최적화기 설정
    # cl_criterion = nn.NLLLoss().to(device)          
    # bn_criterion = nn.BCEWithLogitsLoss().to(device)    
    # optimizer = optim.RMSprop(classifier.parameters(), lr=LR)

 
    # # 결과를 저장할 DataFrame 생성
    # results_df = pd.DataFrame(columns=['Val Loss', 'bn_score Val Accuracy', 'cl_score Val Accuracy'])

    # for epoch in range(EPOCHS):
    #     model_train(classifier, train_loader, cl_criterion, bn_criterion, optimizer, device, INTERVAL)
    #     model_test(classifier, test_loader, cl_criterion, bn_criterion, device, epoch, results_df)  # DataFrame 전달

    #     # 모델 저장 (에포크 번호 추가)
    #     model_save_path = f'./saved_model/review_classifier_BATCH_8_epoch_{epoch}.pth'  # 에포크 번호 포함
    #     torch.save(classifier.state_dict(), model_save_path)
    #     print(f'Model saved at {model_save_path}')

    # # 결과 DataFrame 저장
    # results_df.to_csv('./saved_model/evaluation_results.csv', index=True)
    # print("평가 결과가 저장되었습니다.")



In [11]:
TRAIN_PATH = './data/Training/'
TEST_PATH='./data/Validation/'
# 여러 폴더 경로를 리스트로 저장
train_folder_paths = os.listdir(TRAIN_PATH)
test_folder_paths = os.listdir(TEST_PATH)

In [12]:
# 빈 데이터프레임 리스트 생성
dataframes = []

In [13]:

# 각 폴더 내의 JSON 파일을 읽어와 데이터프레임으로 변환
def json2df(folder_paths,csv_paths):
    for folder_path in folder_paths:
        FOLDER_PATH = TRAIN_PATH+folder_path
        print(f"Processing folder: {folder_path}")
        
        # 폴더 내의 모든 JSON 파일 리스트
        json_files = [file for file in os.listdir(FOLDER_PATH) if file.endswith('.json')]

        for file in json_files:
            file_path = os.path.join(FOLDER_PATH, file)
            print(f"Loading file: {file_path}")
            
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    
                    # 파일 내용 확인 및 데이터프레임으로 변환
                    if data:
                        # Aspects만 추출
                        for review in data:
                            aspects = pd.json_normalize(review.get('Aspects'))
                            dataframes.append(aspects)
                    else:
                        print(f"No data found in {file}")
                        
            except json.JSONDecodeError:
                print(f"Error loading {file}: Invalid JSON")

    # 데이터프레임 결합
    if dataframes:
        final_dataframe = pd.concat(dataframes, ignore_index=True)
        print(final_dataframe)
    else:
        print("No valid dataframes to concatenate.")
        
    final_dataframe.to_csv(csv_paths)

    return final_dataframe


In [14]:
json2df(train_folder_paths,'./data/train.csv')
json2df(test_folder_paths,'./data/test.csv')


Processing folder: 2-1
Loading file: ./data/Training/2-1\2-1.스킨케어(1).json
Loading file: ./data/Training/2-1\2-1.스킨케어(10).json
Loading file: ./data/Training/2-1\2-1.스킨케어(100).json
Loading file: ./data/Training/2-1\2-1.스킨케어(101).json
Loading file: ./data/Training/2-1\2-1.스킨케어(102).json
Loading file: ./data/Training/2-1\2-1.스킨케어(103).json
Loading file: ./data/Training/2-1\2-1.스킨케어(104).json
Loading file: ./data/Training/2-1\2-1.스킨케어(105).json
Loading file: ./data/Training/2-1\2-1.스킨케어(106).json
Loading file: ./data/Training/2-1\2-1.스킨케어(107).json
Loading file: ./data/Training/2-1\2-1.스킨케어(108).json
Loading file: ./data/Training/2-1\2-1.스킨케어(109).json
Loading file: ./data/Training/2-1\2-1.스킨케어(11).json
Loading file: ./data/Training/2-1\2-1.스킨케어(110).json
Loading file: ./data/Training/2-1\2-1.스킨케어(111).json
Loading file: ./data/Training/2-1\2-1.스킨케어(112).json
Loading file: ./data/Training/2-1\2-1.스킨케어(113).json
Loading file: ./data/Training/2-1\2-1.스킨케어(114).json
Loading file: ./data/Traini

Unnamed: 0,Aspect,SentimentText,SentimentWord,SentimentPolarity
0,유통기한,유통기한도 넉넉하고,2,1
1,제품구성,구성도 많아서 선물 하기 좋네요.,5,1
2,제품구성,구성도알차고,1,1
3,보습력/수분감,촉촉하고너무좋아용,1,1
4,용량,대용량으로 넉넉하게 사용할 수 있고,5,1
...,...,...,...,...
231667,편의성/활용성,간편하게 하나만 발라도 되어서,4,1
231668,향,향기도 끝내줍니다.,2,1
231669,향,좋아하는 향이라,2,1
231670,가격,할인이 없어서 비싸게 처음 구매했어요,5,-1


In [17]:
main()

['유통기한', '제품구성', '보습력/수분감', '용량', '자극성', '가격', '흡수력', '제형', '향', '발림성', '품질', '기능/효과', '사용감', '피부타입', '윤기/피부(톤)', '용기', '성분', '탄력', '편의성/활용성', '지속력', '디자인', '색상', '밀착력/접착력', '커버력', '탈모개선', '청량감/쿨링감', '세정력', '지속력/유지력', '두피보호', '머릿결관리', '향/냄새', '용량/사이즈', '거품력', '스타일링효과', '세팅력/고정력', '염색력', '발색력', '클렌징/제거력', '이염', '분사력', '그립감', '보습력/수분감/쿨링감', '사이즈/두께', '용량/개수']
