## 다양한 토큰화, 임베딩 방법 사용하여 모델 성능 높이기
해당 데이터셋은 sports, entertainment, medical, politics라는 4가지의 다른 영역의 텍스트 데이터입니다.<br>
텍스트를 통해 각 영역을 맞추는 task를 수행하시오.

다양한 전처리, tokenization, embedding 방법을 사용하여 X_train, y_train을 통해 모델링을 수행하고 X_test와 y_test를 사용하여 성능을 검증, 비교하시오.

성능은 accuracy로 측정하시오.

<font color='red'>**※ 단, 검증 이외에 X_test, y_test를 사용해서는 안됩니다.**</font>

In [None]:
!pip install tensorflow

In [113]:
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn

from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split

### Reading Data

# huggingface

In [1]:
from transformers import AutoTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import pandas as pd
import numpy as np
from datasets import load_dataset, load_metric

#MODEL = "klue/roberta-base"
INPUT = "text_classification_dataset.csv"
MAX_LEN = 256
dataset = load_dataset("csv", data_files=INPUT,split='train')
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
def example_fn(examples):
    outputs = tokenizer(examples['text'],padding=True, max_length=MAX_LEN,truncation=True)
    if 'type' in examples:
        outputs["type"] = examples["type"]
    return outputs

dataset = dataset.map(example_fn, remove_columns=['text', 'type'])

FileNotFoundError: Unable to find 'C:\Users\82103\deep-세션\text_classification_dataset.csv' at C:\Users\82103\deep-세션

In [132]:

dataset = dataset.train_test_split(0.2)


In [133]:
X_train=pd.DataFrame(dataset['train'])['input_ids']

In [134]:
X_test=pd.DataFrame(dataset['test'])['input_ids']

In [135]:
y_train=pd.DataFrame(dataset['train'])['type']

In [136]:
y_test=pd.DataFrame(dataset['test'])['type']

# 기존 방식

In [209]:
corpus = pd.read_csv('text_classification_dataset.csv')
corpus.head()

Unnamed: 0,text,type
0,@ACNI2012 @TheToka920 Never knew having 1 or 2...,sports
1,"MYCA Magical Moments:\n\nSeptember, 2011: Sham...",sports
2,The current state of last year's @BBL finalist...,sports
3,@HOLLYJISOO Why did you bring a cricket...,sports
4,Babar Azam only Pakistani included in the ICC ...,sports


In [210]:
corpus.type.unique()

array(['sports', 'entertainment', 'medical', 'politics'], dtype=object)

In [211]:
# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(corpus['text'], corpus['type'], stratify=corpus['type'], random_state=220510, test_size=.2)

### Preprocessing

In [212]:
import string
def preprocessing(data):
    data = data.map(lambda x: x.lower()) # 소문자화
    data.str.replace(pat=r'[^\w]', repl=r' ', regex=True)
    data.apply(lambda x : ''.join([k for k in x if k not in string.punctuation]))

    data.apply(lambda x : ''.join([k for k in x if k not in string.digits]))
    # 이후 전처리를 추가하여 함수를 만드시오
    
    return data

### Tokenization

In [213]:
# 띄어쓰기 기준 tokenization
def sep_based_tok(sentences):
    toks = sentences.map(lambda x: x.split())
    return toks

# 다른 tokenization 방법을 사용해보시오.


### Word Embedding

In [214]:
# label encoding
def le_emb(toks, token_cb):
    le = {token:i for i, token in enumerate(token_cb)}
    embs = toks.map(lambda x: [le.get(tok, 0) for tok in x])
    return embs

## 다른 embedding 방법을 사용해보시오.


In [215]:
def sentence_vectorization(sentences, tok_method, emb_method):
    sentences = preprocessing(sentences) # preprocessing
    toks = tok_method(sentences) # tokenization
    token_cb = ['<unk>'] + list({word for sentence in toks for word in sentence}) # make vocabulary
    embs = emb_method(toks, token_cb) # word embedding
    return embs

In [216]:
X_train = sentence_vectorization(X_train, sep_based_tok, le_emb)
X_test = sentence_vectorization(X_test, sep_based_tok, le_emb)

### Modeling

In [218]:
# 장비 할당
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [219]:
# 패딩: 입력 벡터 길이 맞추기
X_train = torch.FloatTensor(pad_sequences(X_train)).unsqueeze(2)
X_test = torch.FloatTensor(pad_sequences(X_test)).unsqueeze(2)

In [220]:
# 타겟 데이터 전처리
idx2label = dict(enumerate(y_train.unique()))
label2idx = {label:idx for idx, label in idx2label.items()}
y_train = [label2idx[x] for x in y_train]
y_test = [label2idx[x] for x in y_test]
y_train = torch.LongTensor(y_train)
y_test = torch.LongTensor(y_test)

In [221]:
from torch.utils.data import TensorDataset

# Combine the training inputs into a TensorDataset.
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_dataset, val_dataset = train_test_split(train_dataset, random_state=220510, test_size=.2)

In [222]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
val_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [223]:
class SentenceClf(nn.Module):
    def __init__(self, input_size=1, hidden_size=32, num_layers=5, dropout=.3, bidirectional=True):
        super(SentenceClf, self).__init__()
        self.gru = nn.GRU(input_size=input_size,
                            hidden_size = hidden_size,
                            num_layers = num_layers,
                            dropout = dropout,
                            bidirectional=bidirectional)
        if bidirectional:
            self.fc = nn.Linear(hidden_size*2, 4)
        else:
            self.fc = nn.Linear(hidden_size, 4)
    def forward(self, x):
        output, hidden = self.gru(x)
        last_output = output[:,-1,:]
        return self.fc(last_output)

In [224]:
def train(model, train_dataloader, optim, loss_fc):
    model.train()
    train_loss = 0
    correct = 0
    num_samples = 0
    for idx, batch in enumerate(train_dataloader):
        text = batch[0].to(device)
        target = batch[1].to(device)
        
        optimizer.zero_grad()
        preds = model(text)
        loss = loss_fc(preds, target)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        
        prediction = preds.max(1, keepdim=True)[1]
        correct += prediction.eq(target.view_as(prediction)).sum().item()
        
        num_samples += target.size(0)
        
    train_loss /= num_samples
    train_acc = 100 * correct /num_samples
    
    print(f'Train Loss : {train_loss:.4} | Train Acc : {train_acc:.4}')

def evaluate(model, test_dataloader, loss_fc):
    model.eval() # 모델을 평가상태로 지정
    test_loss = 0
    correct = 0
    num_samples = 0
    with torch.no_grad(): # 평가 과정에서 gradient 업데이트를 하지 않기 위해
        for batch in test_dataloader:
            text = batch[0].to(device)
            target = batch[1].to(device)
            output = model(text)
            test_loss += criterion(output, target).item()
            
            prediction = output.max(1, keepdim=True)[1] # 벡터 값 내 최대값으로 예측
            correct += prediction.eq(target.view_as(prediction)).sum().item()
            
            num_samples += target.size(0)
            
    test_loss /= num_samples
    test_accuracy = 100 * correct /num_samples
    
    return test_loss, test_accuracy

In [225]:
def train_epochs(model, train_dataloader, val_dataloader, optim, loss_fc, n_epoch):
    for epoch in range(n_epoch):
        print(f'-----Epoch : {epoch+1}-----')
        train(model, train_dataloader, optim, loss_fc)
        valid_loss, valid_acc = evaluate(model, val_dataloader, loss_fc)
        print(f'Valid Loss : {valid_loss:.4} | Valid Acc : {valid_acc:.4}')

In [226]:
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
np.random.seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
random.seed(0)

model = SentenceClf().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

train_epochs(model, train_dataloader, val_dataloader, optimizer, criterion, 50)

-----Epoch : 1-----
Train Loss : 0.04549 | Train Acc : 22.75
Valid Loss : 0.04587 | Valid Acc : 23.12
-----Epoch : 2-----
Train Loss : 0.0453 | Train Acc : 24.36
Valid Loss : 0.04567 | Valid Acc : 23.66
-----Epoch : 3-----
Train Loss : 0.04532 | Train Acc : 27.19
Valid Loss : 0.04552 | Valid Acc : 24.19
-----Epoch : 4-----
Train Loss : 0.04515 | Train Acc : 24.23
Valid Loss : 0.04534 | Valid Acc : 23.12
-----Epoch : 5-----
Train Loss : 0.04503 | Train Acc : 24.23
Valid Loss : 0.04522 | Valid Acc : 23.12
-----Epoch : 6-----
Train Loss : 0.04509 | Train Acc : 24.09
Valid Loss : 0.0451 | Valid Acc : 25.27
-----Epoch : 7-----
Train Loss : 0.04474 | Train Acc : 27.86
Valid Loss : 0.04499 | Valid Acc : 24.19
-----Epoch : 8-----
Train Loss : 0.04491 | Train Acc : 25.03
Valid Loss : 0.04491 | Valid Acc : 23.66
-----Epoch : 9-----
Train Loss : 0.04479 | Train Acc : 25.57
Valid Loss : 0.04483 | Valid Acc : 28.49
-----Epoch : 10-----
Train Loss : 0.04476 | Train Acc : 24.63
Valid Loss : 0.04478 |

### Prediction

In [146]:
# huggingface roberta pretrained conl33 tokenizer 활용
test_loss, test_acc = evaluate(model, test_dataloader, criterion)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.04403 | Test Acc : 32.8


In [130]:
# huggingface klue/roberta_tokenizer 활용
test_loss, test_acc = evaluate(model, test_dataloader, criterion)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.04515 | Test Acc : 24.73


In [20]:
# 띄어쓰기 기반 tokenization + Label Encoding
test_loss, test_acc = evaluate(model, test_dataloader, criterion)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.04464 | Test Acc : 28.49


In [183]:
# 띄어쓰기 기반 tokenization + Label Encoding 불용어처리
test_loss, test_acc = evaluate(model, test_dataloader, criterion)
print(f'Test Loss : {test_loss:.4} | Test Acc : {test_acc:.4}')

Test Loss : 0.04462 | Test Acc : 27.96
