# 사전작업

In [None]:
# -- Install MeCab and the Python packages used below --
# NOTE(review): the first command pipes a script downloaded from GitHub straight into bash,
# and none of the pip installs pin a version — confirm both are acceptable before re-running.
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
!pip3 install mecab-python3
!pip install konlpy
!pip install transformers

In [None]:
# !pip install portalocker>=2.0.0

In [3]:
import pandas as pd

def bring_and_preprocess_df(csv_path='/content/drive/MyDrive/bungae_image_df.csv'):
    """
    Load the product table from CSV and clean it for modelling.

    Args:
        csv_path: Location of the CSV file. Defaults to the Google Drive path
            this notebook was written against, so existing no-argument calls
            behave exactly as before.

    Returns:
        A new DataFrame in which every row containing a missing value has been
        removed, the index has been reset to 0..n-1, and `cat_id` is stored as
        a string of digits.
    """
    raw = pd.read_csv(csv_path, encoding='utf-8-sig')
    # Drop incomplete rows first: the int cast below cannot handle NaN.
    cleaned = raw.dropna(axis=0).reset_index(drop=True)
    # Casting through int removes any fractional suffix (e.g. 320120100.0)
    # before the value is turned into a string.
    cleaned['cat_id'] = cleaned['cat_id'].astype(int).astype(str)
    return cleaned

# Load the cleaned product table once; later cells read it through the global `df`.
df = bring_and_preprocess_df()
df.shape

(1391022, 5)

# pytorch code

In [4]:
import torch
from torchtext import data
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader, Dataset
from torchtext.data import get_tokenizer

import transformers

from konlpy.tag import Mecab
import string


In [None]:
def generate_bigrams(x):
    """
    Append the distinct bi-grams of a token sequence to its tokens.

    Args:
        x: Sequence of string tokens.

    Returns:
        A new list holding the original tokens followed by every distinct pair
        of adjacent tokens joined with a single space, in order of first
        appearance. The input sequence is not modified.
    """
    tokens = list(x)
    # dict.fromkeys de-duplicates like a set but keeps first-seen order, so the
    # result no longer depends on hash ordering between runs.
    bigrams = dict.fromkeys(zip(tokens, tokens[1:]))
    return tokens + [' '.join(pair) for pair in bigrams]

def remove_punct(text):
    """
    Lower-case `text` and blank out ASCII punctuation.

    Every character listed in `string.punctuation` is replaced by a single
    space (it is not deleted).
    """
    punct_to_space = str.maketrans({mark: ' ' for mark in string.punctuation})
    lowered = text.lower()
    return lowered.translate(punct_to_space)


def _get_mecab():
    """Return a shared Mecab tagger, creating it on first use."""
    # Cached on the helper so the tagger is not rebuilt for every text.
    if not hasattr(_get_mecab, '_tagger'):
        _get_mecab._tagger = Mecab()
    return _get_mecab._tagger


def tokenizer(text):
    """
    Tokenizer function intended for use with data.Field.

    Punctuation is blanked out with `remove_punct`, the text is passed to
    Mecab's `nouns`, and only results longer than one character are kept.

    Args:
        text: Raw input string.

    Returns:
        List of the tokens returned by `Mecab.nouns` whose length exceeds 1.

    NOTE(review): a later statement in this cell rebinds the name `tokenizer`
    to a Hugging Face tokenizer, which makes this function unreachable under
    that name afterwards.
    """
    nouns = _get_mecab().nouns(remove_punct(text))
    return [noun for noun in nouns if len(noun) > 1]

# -- koelectra tokenizer --
# NOTE(review): this assignment rebinds the name `tokenizer`, so the Mecab-based
# `tokenizer` function defined just above is no longer reachable under that name.
model_path = 'monologg/koelectra-base-v3-discriminator'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)

In [None]:
# -- CustomDataset base code --
class CustomDataset(Dataset):
    """
    Map-style dataset over parallel text / label collections.

    Args:
        text: Indexable collection of raw samples.
        label: Indexable collection of targets aligned with `text`.
            (Original author's open question: how should labels be handled here?)
        tokenizer: Optional callable applied to each raw sample.
        preprocessing: Optional callable applied to the tokenizer output.
            When omitted, `generate_bigrams` is used, which is what this class
            applied unconditionally before.
    """
    def __init__(self, text, label, tokenizer=None, preprocessing=None):
        # -- store the inputs and the per-sample transforms --
        self.text = text
        self.label = label
        self.tokenizer = tokenizer
        # Honour the caller's choice; the argument used to be silently ignored.
        self.preprocessing = generate_bigrams if preprocessing is None else preprocessing

    def __getitem__(self, index):
        """Return `(data, label)` for the sample at `index`."""
        data = self.text[index]
        label = self.label[index]

        # Each stage is optional. Starting from the raw sample keeps `data`
        # bound even when a stage is skipped.
        if self.tokenizer is not None:
            data = self.tokenizer(data)
        if self.preprocessing is not None:
            data = self.preprocessing(data)

        return data, label

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.text)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    """
    Collate `(label, text)` pairs into flat tensors placed on `device`.

    Each label goes through `label_pipeline` and each text through
    `text_pipeline`. The per-sample id tensors are concatenated into one 1-D
    tensor, and `offsets` records where each sample starts inside it.

    Returns:
        Tuple `(labels, text, offsets)` of int64 tensors on `device`.

    NOTE(review): `label_pipeline` and `text_pipeline` are not defined in this
    cell and must exist as globals before this is called. The pairs are
    unpacked as (label, text), whereas the dataset classes in this notebook
    return (data, label) — confirm which order is intended.
    """
    labels, pieces, lengths = [], [], []
    for _label, _text in batch:
        labels.append(label_pipeline(_label))
        ids = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        pieces.append(ids)
        lengths.append(ids.size(0))
    label_tensor = torch.tensor(labels, dtype=torch.int64)
    # Start position of each sample = running total of the preceding lengths.
    offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)
    flat_text = torch.cat(pieces)
    return label_tensor.to(device), flat_text.to(device), offsets.to(device)

In [None]:

# -- split into train_df and test_df before building this dataset --
class BungaeDataset(Dataset):
    """
    Map-style dataset over the product table returned by `bring_and_preprocess_df`.

    Samples are taken from the `product_name` column and labels from `cat_id`.

    Args:
        tokenizer: Optional callable applied to each product name.
        preprocessing: Optional callable applied to the tokenizer output.
            When omitted, `generate_bigrams` is used, which is what this class
            applied unconditionally before.
    """
    def __init__(self, tokenizer=None, preprocessing=None):
        # -- load the table and keep the two columns this dataset serves --
        df = bring_and_preprocess_df()

        self.data = df['product_name']
        self.labels = df['cat_id']

        self.tokenizer = tokenizer
        # Honour the caller's choice; the argument used to be silently ignored.
        self.preprocessing = generate_bigrams if preprocessing is None else preprocessing

    def __getitem__(self, index):
        """Return `(data, label)` for the sample at `index`."""
        sample = self.data[index]
        label = self.labels[index]

        # The original passed an undefined name `text` to the tokenizer; the
        # value to tokenize is the sample fetched above.
        # (Tokenizer options left disabled by the author:
        #  padding=True, truncation=True, max_length=32)
        if self.tokenizer is not None:
            sample = self.tokenizer(sample)
        if self.preprocessing is not None:
            sample = self.preprocessing(sample)

        return sample, label

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.labels)

# Build the full-table dataset using whatever is currently bound to `tokenizer`.
# NOTE(review): BungaeDataset.__init__ calls bring_and_preprocess_df() itself, so this reads the CSV again.
dataset_bungae = BungaeDataset(tokenizer=tokenizer, preprocessing=generate_bigrams)

# with ChatGPT

In [5]:
import transformers
def generate_bigrams(x):
    """
    Append the distinct bi-grams of a token sequence to its tokens.

    Args:
        x: Sequence of string tokens.

    Returns:
        A new list holding the original tokens followed by every distinct pair
        of adjacent tokens joined with a single space, in order of first
        appearance. The input sequence is not modified.
    """
    tokens = list(x)
    # dict.fromkeys de-duplicates like a set but keeps first-seen order, so the
    # result no longer depends on hash ordering between runs.
    bigrams = dict.fromkeys(zip(tokens, tokens[1:]))
    return tokens + [' '.join(pair) for pair in bigrams]

def remove_punct(text):
    """
    Return `text` lower-cased, with each ASCII punctuation mark turned into a space.

    The marks affected are exactly the characters of `string.punctuation`;
    they are substituted one-for-one rather than removed.
    """
    marks = string.punctuation
    table = str.maketrans(marks, ' ' * len(marks))
    return text.lower().translate(table)


def _get_mecab():
    """Return a shared Mecab tagger, creating it on first use."""
    # Cached on the helper so the tagger is not rebuilt for every text.
    if not hasattr(_get_mecab, '_tagger'):
        _get_mecab._tagger = Mecab()
    return _get_mecab._tagger


def tokenizer(text):
    """
    Tokenizer function intended for use with data.Field.

    Punctuation is blanked out with `remove_punct`, the text is passed to
    Mecab's `nouns`, and only results longer than one character are kept.

    Args:
        text: Raw input string.

    Returns:
        List of the tokens returned by `Mecab.nouns` whose length exceeds 1.

    NOTE(review): a later statement in this cell rebinds the name `tokenizer`
    to a Hugging Face tokenizer, which makes this function unreachable under
    that name afterwards.
    """
    nouns = _get_mecab().nouns(remove_punct(text))
    return [noun for noun in nouns if len(noun) > 1]

# -- koelectra tokenizer --
# NOTE(review): this assignment rebinds the name `tokenizer`, so the Mecab-based
# `tokenizer` function defined just above is no longer reachable under that name.
model_path = 'monologg/koelectra-base-v3-discriminator'
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)

In [6]:
# -- Split df into train_df and test_df --
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and everything trained on it, is the same on every run.
SPLIT_SEED = 42

x_train, x_test, y_train, y_test = train_test_split(
    df['product_name'], df['cat_id'].values,
    shuffle=True, test_size=.2, random_state=SPLIT_SEED,
)
# x_train / x_test carry df's index *labels*, so rows are selected with the
# label-based .loc. Positional .iloc only gave the same rows while df had a
# freshly reset 0..n-1 index.
train_df = df.loc[x_train.index]
test_df = df.loc[x_test.index]
train_df

Unnamed: 0,product_id,product_name,image_url,image_cnt,cat_id
1319949,178523724,일꼬르소 바지 82,https://media.bunjang.co.kr/product/178523724_...,2.0,320120100
718988,210988907,JPN 빈티지 갈피 퍼 후드 자켓,https://media.bunjang.co.kr/product/210988907_...,2.0,320090999
557721,223587526,1169)챔피온 패턴 폴리반바지 남성 34인치권장,https://media.bunjang.co.kr/product/223587526_...,5.0,320130999
1208754,193520347,쏭스튜디오 LUCKY SHOULDER BAG MINI,https://media.bunjang.co.kr/product/193520347_...,3.0,430200300
1152336,219683896,남성 자켓,https://media.bunjang.co.kr/product/219683896_...,6.0,320100200
...,...,...,...,...,...
528201,221503022,[ XL ] 빈폴 남성 로고 울 싱글 패딩 코트,https://media.bunjang.co.kr/product/221503022_...,8.0,320080700
416342,224602344,나이키테크팩,https://media.bunjang.co.kr/product/224602344_...,3.0,320150200
185317,183321485,앤아더스토리즈 청바지 EUR25 가격내림,https://media.bunjang.co.kr/product/183321485_...,6.0,310140999
950528,225058767,시스템 바지 여성32인치,https://media.bunjang.co.kr/product/225058767_...,12.0,310150010


In [7]:
import torchtext

# NOTE(review): the name `data` is also imported from torchtext in an earlier
# cell; this assignment replaces that binding.
data = list(df['product_name'])


def _tokenize_for_vocab(text):
    """Return the list of string tokens for `text` using the active `tokenizer`."""
    # Calling a Hugging Face tokenizer directly returns a dict-like encoding,
    # and iterating that yields its field names instead of tokens; its
    # `.tokenize` method returns the token strings. A plain function has no
    # such method and is expected to return the token list itself.
    to_strings = getattr(tokenizer, 'tokenize', None)
    return to_strings(text) if to_strings is not None else tokenizer(text)


# -- tokenizing --
tokenized_data = [_tokenize_for_vocab(text) for text in data]
flattened_data = [word for text in tokenized_data for word in text]

# -- build the vocabulary --
vocabulary = torchtext.vocab.build_vocab_from_iterator([flattened_data])

In [12]:
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """
    Map-style dataset that tokenizes and preprocesses one sample per lookup.

    Args:
        data: Indexable collection of raw samples.
        labels: Indexable collection of targets aligned with `data`.
        tokenizer: Callable applied to each raw sample.
        preprocessing: Callable applied to the tokenizer's output.
    """
    def __init__(self, data, labels, tokenizer, preprocessing):
        self.data, self.labels = data, labels
        self.tokenizer, self.preprocessing = tokenizer, preprocessing

    def __len__(self):
        """Number of samples, taken from `data`."""
        return len(self.data)

    def __getitem__(self, index):
        """Return a dict with the processed sample under 'data' and its target under 'label'."""
        raw, target = self.data[index], self.labels[index]
        processed = self.preprocessing(self.tokenizer(raw))
        return {'data': processed, 'label': target}

# Example usage
# Build the datasets and loaders from the train/test frames created earlier.

# Create the train dataset
train_data = list(train_df['product_name'])
train_labels = list(train_df['cat_id'])
train_dataset = CustomDataset(train_data, train_labels, tokenizer, generate_bigrams)

# Create the test dataset
test_data = list(test_df['product_name'])
test_labels = list(test_df['cat_id'])
test_dataset = CustomDataset(test_data, test_labels, tokenizer, generate_bigrams)

# Data loader settings (shuffle / drop_last apply to the training loader)
batch_size = 64
shuffle = True
num_workers = 2
drop_last=True

# NOTE(review): no collate_fn is passed to either loader, so default collation
# is applied to the dicts returned by CustomDataset — confirm that batches
# variable-length samples as intended.

# train dataloader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle,
                         num_workers=num_workers, drop_last=drop_last)
# test dataloader
# Evaluation has to see every test sample exactly once, so shuffling and
# dropping the last partial batch are both turned off here.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                         num_workers=num_workers, drop_last=False)


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchtext

from sklearn.metrics import accuracy_score

# Load the pre-trained embeddings
# The vector file is read from a Google Drive path.
embedding_file = '/content/drive/MyDrive/wiki.ko.vec'
embeddings = torchtext.vocab.Vectors(embedding_file)

# Tensor form of the vector table; this is what gets passed to the model below.
embeddings_tensor = torch.Tensor(embeddings.vectors)
# NOTE(review): `embedding_dim` is reassigned to a literal 300 further down in this cell.
embedding_dim = embeddings.dim

class FastText(nn.Module):
    """
    Bag-of-embeddings classifier: average the token embeddings, then apply one linear layer.

    Args:
        vocab_size: Not used by the layers (the embedding table is sized by
            `embeddings`); kept so existing calls keep working.
        embedding_dim: Input width of the linear layer; must match the width
            of `embeddings`.
        num_classes: Number of output logits.
        embeddings: 2-D float tensor of pretrained vectors, loaded with
            `nn.Embedding.from_pretrained` (which freezes them by default).
    """
    def __init__(self, vocab_size, embedding_dim, num_classes, embeddings):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(embeddings)
        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, x):
        """
        Compute class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape (batch_size, sequence_length).

        Returns:
            Tensor of shape (batch_size, num_classes).
        """
        embedded = self.embedding(x)  # shape: (batch_size, sequence_length, embedding_dim)
        # Average over the sequence axis. The previous permute + avg_pool2d
        # pooled over the batch axis for batch-first input and returned one
        # row per token position instead of one per sample.
        pooled = embedded.mean(dim=1)  # shape: (batch_size, embedding_dim)
        return self.fc(pooled)

# Example usage
# Assuming you have your own dataset and vocabulary

# Hyperparameters
vocab_size = len(embeddings.itos)
# NOTE(review): overrides the value read from `embeddings.dim` above with a literal — confirm they agree.
embedding_dim = 300
# One output per distinct cat_id in the full table.
# NOTE(review): cat_id is cast to str when the table is loaded; confirm labels are
# converted to integer class indices before they reach the loss below.
num_classes = len(df['cat_id'].unique())
learning_rate = 0.001
# NOTE(review): the data loaders were already built in an earlier cell; reassigning
# batch_size here does not change them.
batch_size = 64
num_epochs = 10

# Create model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = FastText(vocab_size, embedding_dim, num_classes, embeddings_tensor).to(device)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
# Each epoch runs one optimisation pass over train_loader, then measures accuracy on test_loader.
# NOTE(review): the loops unpack every batch as an (inputs, labels) pair, while the CustomDataset
# defined in this notebook returns a dict with 'data' / 'label' keys — confirm the loaders
# really yield tensor pairs in this form.
for epoch in range(num_epochs):
    model.train()
    for batch_inputs, batch_labels in train_loader:

        # move the batch to the selected device
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        # standard step: clear gradients, forward, loss, backward, update
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        # calculate accuracy
        # Predicted class = index of the largest logit; this prints once per batch.
        _, predicted = torch.max(outputs.data, 1)
        acc = accuracy_score(predicted.cpu(), batch_labels.cpu())
        print(f"Accuracy: {acc}")

    # Evaluation
    model.eval()
    with torch.no_grad():
        # running counts over the whole test loader
        correct = 0
        total = 0
        for batch_inputs, batch_labels in test_loader:
            # move data to the selected device
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            outputs = model(batch_inputs)
            _, predicted = torch.max(outputs.data, dim=1)
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()
        
        # percentage of test samples whose predicted class equals the label
        accuracy = 100 * correct / total
        print(f"Epoch {epoch+1}/{num_epochs}, Test Accuracy: {accuracy:.2f}%")

NameError: ignored