# import

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import TensorDataset
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import kss
from torch.optim.lr_scheduler import LambdaLR
import torch.optim as optim
import torch.nn.functional as F
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import math
from sklearn.model_selection import train_test_split
from torch.optim import Adam

In [2]:
device = torch.device('mps:0' if torch.backends.mps.is_available() else 'cpu')

# DataSet & Data Loader

## Old Ver.

In [None]:
class MyDataset(Dataset):
  def __init__(self, data, label):
    super().__init__()
    self.docs = data
    self.label = label

  def __len__(self):
      return len(self.docs)

  def __getitem__(self, index):
    news = self.docs[index]
    target = self.label[index]
    return news, target

train_data = MyDataset(data=X_train, label=y_train)
val_data = MyDataset(data=X_val, label=y_val)
test_data = MyDataset(data=X_test, label=y_test)

batch_size = 32
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=lambda x: tuple(zip(*x)))
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=lambda x: tuple(zip(*x)))
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True, drop_last=True, collate_fn=lambda x: tuple(zip(*x)))

## New Ver.

In [4]:
class MyDataset(Dataset):
  def __init__(self, file_path):
    super().__init__()
    self.tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-Medium", do_lower_case=False)
    self.bert = AutoModel.from_pretrained("snunlp/KR-Medium")
    for param in self.bert.parameters():
        param.requires_grad = False
        
    df = pd.read_json(file_path)
    for idx in range(len(df)) :
      tokenized = self.tokenizer(df.iloc[idx,0], padding='longest', return_tensors='pt')
      contextualized_sentences = self.bert(**tokenized)
      sentence_embeddings = contextualized_sentences.pooler_output
      df.iat[idx,1] = sentence_embeddings
    self.vecs = df['vecs']
    self.label = df['label']

  def __len__(self):
      return len(self.vecs)

  def __getitem__(self, index):
    docs = self.vecs[index]
    label = self.label[index]
    return docs, label

In [5]:
train_data = MyDataset('data/전세사기_라벨링.json')
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, drop_last=True, collate_fn=lambda x: tuple(zip(*x)))

In [254]:
# TEST
# sentence_embeddings = j[0]
# max_len = max([e.size(0) for e in sentence_embeddings])
# padded_embeddings = torch.zeros(len(sentence_embeddings), max_len, sentence_embeddings[0].size(1))
# for i, emb in enumerate(sentence_embeddings):
#     seq_len = emb.size(0)
#     padded_embeddings[i, :seq_len, :] = emb
    
# random_tensor = torch.randn(padded_embeddings.size(0), 1, padded_embeddings.size(2))
# batch_tensor = torch.cat((random_tensor, padded_embeddings), dim=1)
# batch_t = batch_tensor.permute(1 ,0 ,2).float()

# padding_mask = batch_t.sum(dim=-1).permute(1 ,0) == 0
# title_level = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=768, nhead=1),num_layers=1)
# output_batch = title_level(batch_t.float(), src_key_padding_mask=padding_mask)
# output_batch = output_batch.permute(1 ,0 ,2)

# Model Structure

## Old Ver.

In [3]:
# for문 Ver.
class Model(nn.Module):
  def __init__(self, num_classes, input_dim, num_heads, num_layers):
    super().__init__()
    self.fc = nn.Linear(input_dim, num_classes)
    self.tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-Medium", do_lower_case=False)
    self.bert = AutoModel.from_pretrained("snunlp/KR-Medium")
    for param in self.bert.parameters():
        param.requires_grad = False
    self.title_level = nn.TransformerEncoder(
      nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads),
      num_layers=num_layers)
    self.sentecne_level = nn.TransformerEncoder(
      nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads),
      num_layers=num_layers)

  def forward(self, data):
    doc_ems = torch.empty((len(data), 768)).to(device)
    for num, doc in enumerate(data):
      tokenized = self.tokenizer(doc, padding='longest', return_tensors='pt').to(device)
      with torch.no_grad():
        contextualized_sentences = self.bert(**tokenized)
      sentence_embeddings = contextualized_sentences.pooler_output
      title = sentence_embeddings[0, :]
      transformer_output = self.title_level(sentence_embeddings)[1:, :]
      combined_input = torch.cat((title.unsqueeze(1).view(1, 768), transformer_output), dim=0)
      doc_ems[num] = self.sentecne_level(combined_input)[0]
    out = self.fc(doc_ems)
    predicted_label = torch.argmax(out).to(device)
    return out, predicted_label

## New Batch Ver.

In [3]:
class Doc_Encoder(nn.Module):
    def __init__(self, num_heads, num_layers):
        super().__init__()
        self.sentecne_level = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=768, nhead=num_heads),num_layers=num_layers)

    def forward(self, batch):
        max_len = max([e.size(0) for e in batch])
        padded_embeddings = torch.zeros(len(batch), max_len, batch[0].size(1)).to(device)
        for i, emb in enumerate(batch):
            seq_len = emb.size(0)
            padded_embeddings[i, :seq_len, :] = emb
        random_tensor = torch.randn(padded_embeddings.size(0), 1, padded_embeddings.size(2)).to(device)
        batch_tensor = torch.cat((random_tensor, padded_embeddings), dim=1)
        batch_tensor = batch_tensor.permute(1 ,0 ,2).float()
        padding_mask = batch_tensor.sum(dim=-1).permute(1 ,0) == 0
        output_batch = self.sentecne_level(batch_tensor.float(), src_key_padding_mask=padding_mask)
        output_batch = output_batch.permute(1 ,0 ,2)
        doc_vecs = output_batch[:,0,:]
        return doc_vecs

In [52]:
# class Doc_Encoder(nn.Module):
#     def __init__(self, num_classes, num_heads, num_layers):
#         super().__init__()
#         self.fc = nn.Linear(768, num_classes)
#         self.title_level = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=768, nhead=num_heads),num_layers=num_layers)
#         self.sentecne_level = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=768, nhead=num_heads),num_layers=num_layers)

#     def forward(self, batch):
#         max_len = max([e.size(0) for e in batch])
#         padded_embeddings = torch.zeros(len(batch), max_len, batch[0].size(1)).to(device)
#         for i, emb in enumerate(batch):
#             seq_len = emb.size(0)
#             padded_embeddings[i, :seq_len, :] = emb
#         title = padded_embeddings[:,0,:].reshape(padded_embeddings.size(0), 1, padded_embeddings.size(2))
#         padded_embeddings = padded_embeddings.permute(1 ,0 ,2).float()
#         padding_mask = padded_embeddings.sum(dim=-1).permute(1 ,0) == 0
#         title_level_out = self.title_level(padded_embeddings.float(), src_key_padding_mask=padding_mask)
#         title_level_out = title_level_out.permute(1 ,0 ,2)
        
#         batch_tensor = torch.cat((title, title_level_out), dim=1)
#         random_tensor = torch.randn(batch_tensor.size(0), 1, batch_tensor.size(2)).to(device)
#         batch_tensor = torch.cat((random_tensor, batch_tensor), dim=1)
#         batch_tensor = batch_tensor.permute(1 ,0 ,2).float()
#         padding_mask = batch_tensor.sum(dim=-1).permute(1 ,0) == 0
#         output_batch = self.sentecne_level(batch_tensor.float(), src_key_padding_mask=padding_mask)
#         output_batch = output_batch.permute(1 ,0 ,2)
#         doc_vecs = output_batch[:,0,:]
#         return doc_vecs

In [4]:
class Model(nn.Module) :
    def __init__(self, num_classes, encoder) :
        super().__init__()
        self.fc1 = nn.Linear(768, 768)
        self.ac1 = nn.ReLU()
        self.fc2 = nn.Linear(768,num_classes)
        self.encoder = encoder
    
    def forward(self, batch) :
        doc_vecs = self.encoder(batch)
        out = self.fc1(doc_vecs)
        out = self.ac1(out)
        out = self.fc2(out)
        return out

# Parameter & Schedular

In [23]:
num_epochs = 1
num_classes = 2
input_dim = 768
num_heads = 1
num_layers = 1
encoder = Doc_Encoder(num_heads, num_layers)
model = Model(num_classes, encoder).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Training - 문제의식 data로 encoder 추가 학습할 수 있는 과정

In [18]:
device = torch.device('mps:0' if torch.backends.mps.is_available() else 'cpu')

In [15]:
model.train()  # 모델을 훈련 모드로 설정
model.to(device)
for epoch in range(50) :
    total_loss = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data = tuple(d.to(device) for d in data)
        target = torch.tensor(target).long().to(device)
        optimizer.zero_grad()
        output = model(data) 
        loss = criterion(output, target)
        total_loss += loss
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {total_loss}')

Epoch 1, Loss: 16.749486923217773
Epoch 2, Loss: 17.352962493896484
Epoch 3, Loss: 16.8115291595459
Epoch 4, Loss: 16.901926040649414
Epoch 5, Loss: 17.09539794921875
Epoch 6, Loss: 17.021961212158203
Epoch 7, Loss: 16.325613021850586
Epoch 8, Loss: 16.613815307617188
Epoch 9, Loss: 17.53157615661621
Epoch 10, Loss: 17.18819236755371
Epoch 11, Loss: 17.07434844970703
Epoch 12, Loss: 16.776493072509766
Epoch 13, Loss: 16.350135803222656
Epoch 14, Loss: 16.65325927734375
Epoch 15, Loss: 17.337514877319336
Epoch 16, Loss: 17.100589752197266
Epoch 17, Loss: 16.71356201171875
Epoch 18, Loss: 16.622323989868164
Epoch 19, Loss: 16.54018783569336
Epoch 20, Loss: 16.5841007232666
Epoch 21, Loss: 16.895811080932617
Epoch 22, Loss: 16.49324607849121
Epoch 23, Loss: 17.039358139038086
Epoch 24, Loss: 16.944934844970703
Epoch 25, Loss: 16.87317657470703
Epoch 26, Loss: 16.473180770874023
Epoch 27, Loss: 16.401714324951172
Epoch 28, Loss: 16.719398498535156
Epoch 29, Loss: 17.756261825561523
Epoch 3

# Test

In [16]:
device = 'cpu'
model = model.to(device)
total_correct = 0
total_samples = 0

with torch.no_grad():
    for batch_data, target in train_loader:
        target = torch.tensor(target).long()
        target = target.to(device)

        out = model(batch_data)
        predicted = torch.argmax(out, dim=1)
        total_samples += target.size(0)
        total_correct += (predicted == target).sum().item()

accuracy = total_correct / total_samples

In [17]:
print(accuracy)

0.8096590909090909


# few-shot learning code

In [57]:
class few_shot_Model(nn.Module):
    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder
        self.fc1 = nn.Linear(768,768)
        self.ac = nn.ReLU()
        self.fc2 = nn.Linear(768,256)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, doc_pair):
        doc_pair = self.encoder(doc_pair)
        doc_pair = self.fc1(doc_pair)
        doc_pair = self.ac(doc_pair)
        doc_pair = self.fc2(doc_pair)
        similarity = torch.cosine_similarity(doc_pair[0], doc_pair[1], dim=0)
        out = self.sigmoid(similarity)
        return out

In [25]:
df = pd.read_json('./data/youth_fewshot.json')
df['embedding'] = None
df = df[['제목','내용','sliced','embedding','cluster','세부분류']]
df.head(5)

Unnamed: 0,제목,내용,sliced,embedding,cluster,세부분류
0,"오거돈 ""3년간 청년정책 사업에 4,900억 투입""",민선 7기 2년차 첫 정책 '청년정책로드맵' 발표 2022년까지 106개 청년사업 ...,[민선 7기 2년차 첫 정책 '청년정책로드맵' 발표 2022년까지 106개 청년사업...,,10,11.0
1,안양청년 89명 청년정책 주도…희망발전 가동,최대호 안양시장 10일 청년정책 서포터즈 위촉장 전달. 사진제공=안양시 【파이...,"[최대호 안양시장 10일 청년정책 서포터즈 위촉장 전달., 사진제공=안양시 【...",,10,11.0
2,"서울시, '2020 청년정책 협력포럼' 개최","서울시가 '청년기본법 이후, 청년의 자리'를 주제로 '2020 청년정책 협력포럼'을...","[서울시가 '청년기본법 이후, 청년의 자리'를 주제로 '2020 청년정책 협력포럼'...",,10,11.0
3,"양평군, 2020~2024년 청년정책 윤곽 드러나",‘청년실태조사·정책기본계획’ 수립용역 보고회 【양평=뉴시스】 문영일 기자 = 경기 ...,[‘청년실태조사·정책기본계획’ 수립용역 보고회 【양평=뉴시스】 문영일 기자 = 경기...,,10,11.0
4,대통령 표창으로 꽃 피운 대구시 청년정책…“청년이 돌아오는 대구 만들 것”,"2015년 청년위원회 출범 계기 청년 목소리 담은 정책 시행 도전, 희망, 행복, ...","[2015년 청년위원회 출범 계기 청년 목소리 담은 정책 시행 도전, 희망, 행복,...",,10,11.0


In [26]:
# model에 넣지 않고 데이터 자체에서 bert를 이용한 embedding 과정 진행
tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-Medium", do_lower_case=False)
bert = AutoModel.from_pretrained("snunlp/KR-Medium")
for param in bert.parameters():
    param.requires_grad = False
for idx in range(5) :
    tokenized = tokenizer(df.iloc[idx,2], padding='longest', return_tensors='pt')
    contextualized_sentences = bert(**tokenized)
    sentence_embeddings = contextualized_sentences.pooler_output
    df.iat[idx,3] = sentence_embeddings

In [27]:
df = df.iloc[:5,3]
df

0    [[tensor(0.2954), tensor(-0.1119), tensor(-0.4...
1    [[tensor(0.1580), tensor(0.1035), tensor(-0.10...
2    [[tensor(0.1997), tensor(-0.3428), tensor(-0.2...
3    [[tensor(0.3740), tensor(-0.1149), tensor(0.00...
4    [[tensor(0.1054), tensor(-0.1254), tensor(-0.0...
Name: embedding, dtype: object

In [58]:
f_model = few_shot_Model(encoder).to(device)
criterion = nn.BCELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [74]:
# 같은 class 간 데이터 조합 by combinations
from itertools import combinations, product
for epoch in range(1) :
    target = torch.tensor([1.0]).to(device)
    for i,j in combinations(range(4),2) : # 안에다가 range(4) 대신 같은 class 내 데이터 인덱스 범위 넣어주면 됨 
        f_model.train()
        data = df[[i,j]]
        data = tuple(d.to(device) for d in data)
        optimizer.zero_grad()
        output = f_model(data).unsqueeze(0)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        print(f'Loss: {loss}')
        
# 다른 class 간 데이터 조합 by product
    target = torch.tensor([0.0]).to(device)
    for i,j in product(g1, g2) : # 안에다가 서로 다른 클래스들의 인덱스 집합을 넣어주면 됨
        f_model.train() 
        data = (l1.iloc[i,3], l1.iloc[j,3])
        data = tuple(d.to(device) for d in data)
        target = torch.tensor(target).long().to(device)
        optimizer.zero_grad()
        output = f_model(data) 
        loss = criterion(output, target)
        total_loss += loss
        loss.backward()
        optimizer.step()
        print(f'Loss: {total_loss}')

Loss: 0.3390222489833832
Loss: 0.33081522583961487
Loss: 0.3357342481613159
Loss: 0.34250032901763916
Loss: 0.3297666907310486
Loss: 0.33760297298431396


In [1]:
# encoder만 유지하고 뒤 fc layer 등은 바꿔 끼워주며 세부분류마다 진행하면 됨
# 각 세부분류 모델 훈련 이후 라벨링 되어있지 않은 애들은 모델