In [49]:
import warnings
import numpy as np
import pandas as pd
from time import time
import re
import os
import io
import distutils.dir_util
import json
import pickle 
import random
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader

from collections import Counter
from collections import defaultdict
warnings.filterwarnings('ignore')

def fix_seed(seed: int) -> None:
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    
fix_seed(1005)

In [50]:
#load tags_idx_dic and idx_tags_dic
#path1 = ''
#path2 = ''
#tags_idx_dic = pd.read_csv(path1)
#idx_tags_dic = pd.read_csv(path2)
tags_idx_dic = {'C':0,'D':1,'A':2,'F':3,'G':4,'E':5,'B':6}
idx_tags_dic = {0:'C',1:'D',2:'A',3:'F',4:'G',5:'E',6:'B'}

In [51]:
# read patent data(A~H)
data = pd.read_csv('/Users/leejeewoong/Desktop/gibo/코드정리/dataset/Big_ipc_dataset.csv')


In [52]:
data.head()

Unnamed: 0,출원번호,초록,ipc코드
0,1019820005041,"커피 제품을 제조하는 방법에 관한 것으로, 강화된 방향성분을 갖는 가용성 커피 제품...",A
1,1019840008556,"[구성]평활제 30-60중량%, 정전방지제 3-10중량% 및 유화제 20-40중량%...",D
2,1019840008607,여러가지 형태의 용접선에 광범위하게 적용할 수 있는 용접선 추적장치를 통해 용접용 ...,B
3,1019850000457,"[목적] 세로토닌 M 수용체의 길항질 작용을 가져 편두통, 혈관과 복합두통과 삼차 ...",C
4,1019850009061,"[목적]프레이트 콘테이너문의 결속장치에 관한 것으로서, 특히 국제표준기구에서 규정하...",E


In [32]:
from code_def import *
from RSTagDataset import *
import argparse

args = argparse.Namespace(
  data_path='/kaggle/input/test-dataset/',
  max_len=256,
  batch_size=20, 
  model_name = "paust/pko-t5-small"
)

In [54]:
#this is for gibo's good tech dataset (or bad tech)
data = data.sample(100)

In [55]:
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
tokenizer = BertTokenizer.from_pretrained('kykim/bert-kor-base')
train,val = train_test_split(data,test_size=0.05)
dataset = train
dataset = dataset.groupby("ipc코드").head(500)
dataset = dataset.groupby('출원번호').agg({'초록':'first', 
                             'ipc코드': lambda x : ','.join(set(x))}).reset_index()

dataset['ipc코드'] = dataset['ipc코드'].str.split(',')
train,test = train_test_split(dataset,test_size=0.02)
train_dataset = RSTagDataset(args=args, train=train, test=test, val=val, data_name='train', tokenizer=tokenizer,pt_idx_tags_dic=idx_tags_dic,pt_tags_idx_dic=tags_idx_dic)
#currently val_dataset working
val_dataset = RSTagDataset(args=args, train=train, test=test, val=val, data_name='val', tokenizer=tokenizer,pt_idx_tags_dic=idx_tags_dic,pt_tags_idx_dic=tags_idx_dic)
test_dataset = RSTagDataset(args=args, train=train, test=test, val=val,data_name='test', tokenizer=tokenizer,pt_idx_tags_dic=idx_tags_dic,pt_tags_idx_dic=tags_idx_dic)
train_params = {
    'batch_size': args.batch_size,
    'shuffle': True
    }

val_params = {
    'batch_size': args.batch_size,
    'shuffle': False
    }

train_loader = DataLoader(train_dataset, **train_params)
val_loader = DataLoader(val_dataset, **val_params)
test_loader = DataLoader(test_dataset, **val_params)

93 93
5 5
2 2


In [56]:
import torch
mps_device = torch.device("cpu")
#load trained_model
model2 = torch.load('/Users/leejeewoong/Desktop/gibo/코드정리/model/bert_Big.pt',map_location=torch.device('cpu'))
#eval() is needed (faster)
model2.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(42000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [57]:
#this is for short test
test_dataset = RSTagDataset(args=args, train=train, test=test.iloc[0:5], val=val,data_name='test', tokenizer=tokenizer,pt_idx_tags_dic=idx_tags_dic,pt_tags_idx_dic=tags_idx_dic)
test_loader = DataLoader(test_dataset, **val_params)

2 2


In [58]:
predictions, actuals = validate(5, tokenizer, model2, mps_device, test_loader,idx_tags_dic=idx_tags_dic)

100%|██████████| 1/1 [00:00<00:00,  4.79it/s]


In [59]:
test_results = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
valid_score(test_results)

100.0

In [60]:
test_results

Unnamed: 0,Generated Text,Actual Text
0,"E, F, B",e
1,"B, F, G",b


## For test

In [40]:
good_data = pd.read_excel('/Users/leejeewoong/Desktop/gibo/기보2023문서/up_tech_sheet1.xlsx')
test_input = pd.DataFrame()
test_input['출원번호'] = good_data['NO']
test_input['초록'] = good_data['과제명(국문)']
test_input['ipc코드'] = 'A'
test_input['ipc코드'] = test_input['ipc코드'].str.split(',')

In [41]:
test_dataset = RSTagDataset(args=args, train=train, test=test_input, val=val,data_name='test', tokenizer=tokenizer,pt_idx_tags_dic=idx_tags_dic,pt_tags_idx_dic=tags_idx_dic)
test_loader = DataLoader(test_dataset, **val_params)
predictions, actuals = validate(5, tokenizer, model2, mps_device, test_loader,idx_tags_dic)
test_results = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
test_results = test_results.apply(lambda x: x.astype(str).str.upper())


1077 1077


100%|██████████| 54/54 [01:16<00:00,  1.42s/it]


In [42]:
good_data['기술분야'] = test_results['Generated Text']

In [43]:
good_data.to_csv('/Users/leejeewoong/Desktop/gibo/기보2023문서/Y_tech.csv',index=False,encoding='utf-8-sig')

In [45]:
good_data.head()

Unnamed: 0,IPC,등록일자,특허명,특허요약,Unnamed: 4
0,A47L,1995.07.21,진공청소기의흡음방구조,본 발명은 전동송풍기가 진공청소기 본체에 대하여 대략의 상하 대칭형으로 된 상하부 ...,
1,A47L,1997.11.03,충격흡수구조를가진진공청소기의후방바퀴,"본 발명은 진공청소기의 후방바퀴에 관한 것으로, 보다 상세하게는, 후방바퀴내에 충격...",
2,A23L,1997.11.17,포장된 국수용 살균기,본발명은 삶아져서 포장된 국수를 살균하는 살균기에 관한 것이다.본 살균기는 한쌍의 ...,
3,A61K,1997.12.03,파라티로이드 호르몬류의 안정화 조성물,유효성분으로서 PTH를 함유하고 파라티로이드 호르몬에 대한 안정화제로서 유효량의 당...,
4,A62C,1997.12.03,산불진화용 소화 로켓트탄,"본 발명은 산불진화용 소화 로켓트탄에 관한 것으로서, 소화 로켓트탄의 소화탄두에 감...",


## For test 2

In [61]:
good_data = pd.read_excel('/Users/leejeewoong/Desktop/gibo/기보2023문서/down_tech_sheet1.xlsx')
test_input = pd.DataFrame()
test_input['출원번호'] = good_data['등록일자']
test_input['초록'] = good_data['특허명']
test_input['ipc코드'] = 'A'
test_input['ipc코드'] = test_input['ipc코드'].str.split(',')

test_dataset = RSTagDataset(args=args, train=train, test=test_input, val=val,data_name='test', tokenizer=tokenizer,pt_idx_tags_dic=idx_tags_dic,pt_tags_idx_dic=tags_idx_dic)
test_loader = DataLoader(test_dataset, **val_params)
predictions, actuals = validate(5, tokenizer, model2, mps_device, test_loader,idx_tags_dic)
test_results = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})
test_results = test_results.apply(lambda x: x.astype(str).str.upper())


104 104


100%|██████████| 6/6 [00:07<00:00,  1.25s/it]


In [62]:
test_results.head()

Unnamed: 0,Generated Text,Actual Text
0,"A, D, F",A
1,"A, D, B",A
2,"A, D, C",A
3,"A, C, E",A
4,"A, E, D",A


In [None]:
good_data['기술분야'] = good_data['IPC'].str[0]
#good_data = good_data.drop(columns=['Unnamed: 4'])
good_data.to_csv('/Users/leejeewoong/Desktop/gibo/기보2023문서/N_tech.csv',index=False,encoding='utf-8-sig')