In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/b5/d5/c6c23ad75491467a9a84e526ef2364e523d45e2b0fae28a7cbe8689e7e84/transformers-4.8.1-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 11.0MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 38.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 37.1MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a54768742

In [2]:
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import random
import time
import datetime

In [3]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [4]:
!git clone https://github.com/perfume-reconmendation/topic_modeling.git

Cloning into 'topic_modeling'...
remote: Enumerating objects: 20, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (14/14), done.[K
remote: Total 20 (delta 7), reused 16 (delta 5), pack-reused 0[K
Unpacking objects: 100% (20/20), done.


In [5]:
df = pd.read_csv('/content/topic_modeling/data/final_data.csv')
df = df.dropna().reset_index().drop(['index'], axis=1)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   gender   75000 non-null  object
 1   name     75000 non-null  object
 2   accords  75000 non-null  object
 3   review   75000 non-null  object
dtypes: object(4)
memory usage: 2.3+ MB


Unnamed: 0,gender,name,accords,review
0,female,Alien Mugler for women,"['white floral', 'amber', 'woody']","Got a sample of this today, and my 9 year old ..."
1,female,Alien Mugler for women,"['white floral', 'amber', 'woody']",First impressions. Test on blotter.\nI have he...
2,female,Alien Mugler for women,"['white floral', 'amber', 'woody']",This perfume reminds me of my best friend. Act...
3,female,Alien Mugler for women,"['white floral', 'amber', 'woody']",Imagine tripping over your own feet and fallin...
4,female,Alien Mugler for women,"['white floral', 'amber', 'woody']",Gorgeous Gorgeous Blend ..\nLove the scent...\...


In [6]:
len(df['name'].unique())

89

In [7]:
df_label = pd.read_csv('/content/topic_modeling/data/labeled_data.csv', usecols=['name', 'label'])
df_label = df_label.dropna().reset_index().drop(['index'], axis=1)
df_label.info()
df_label.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    89 non-null     object 
 1   label   89 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.5+ KB


Unnamed: 0,name,label
0,Alien Mugler for women,2.0
1,Coco Mademoiselle Chanel for women,2.0
2,Black Orchid Tom Ford for women,2.0
3,Black Opium Yves Saint Laurent for women,2.0
4,Hypnotic Poison Christian Dior for women,2.0


In [13]:
df_label.label = df_label.label.astype(int)
df_label

Unnamed: 0,name,label
0,Alien Mugler for women,2
1,Coco Mademoiselle Chanel for women,2
2,Black Orchid Tom Ford for women,2
3,Black Opium Yves Saint Laurent for women,2
4,Hypnotic Poison Christian Dior for women,2
...,...,...
84,Spicebomb Extreme Viktor&Rolf for men,1
85,Cool Water Davidoff for men,0
86,Herod Parfums de Marly for men,1
87,L’Homme Ideal Eau de Parfum Guerlain for men,1


In [14]:
len(df_label['label'].unique())

4

In [15]:
train_df = pd.merge(df, df_label, how='inner', on='name')
train_df

Unnamed: 0,gender,name,accords,review,label
0,female,Alien Mugler for women,"['white floral', 'amber', 'woody']","Got a sample of this today, and my 9 year old ...",2
1,female,Alien Mugler for women,"['white floral', 'amber', 'woody']",First impressions. Test on blotter.\nI have he...,2
2,female,Alien Mugler for women,"['white floral', 'amber', 'woody']",This perfume reminds me of my best friend. Act...,2
3,female,Alien Mugler for women,"['white floral', 'amber', 'woody']",Imagine tripping over your own feet and fallin...,2
4,female,Alien Mugler for women,"['white floral', 'amber', 'woody']",Gorgeous Gorgeous Blend ..\nLove the scent...\...,2
...,...,...,...,...,...
74995,male,Jazz Club Maison Martin Margiela for men,"['tobacco', 'rum', 'sweet', 'vanilla', 'woody'...",I've been wanting to try this on for a while a...,1
74996,male,Jazz Club Maison Martin Margiela for men,"['tobacco', 'rum', 'sweet', 'vanilla', 'woody'...",Goes on sweet. Sweet and boozy with too much d...,1
74997,male,Jazz Club Maison Martin Margiela for men,"['tobacco', 'rum', 'sweet', 'vanilla', 'woody'...",Yesterday I walked in to a store looking for a...,1
74998,male,Jazz Club Maison Martin Margiela for men,"['tobacco', 'rum', 'sweet', 'vanilla', 'woody'...",Tried this today and now looking at notes I c...,1


In [16]:
# 셔플
shuffle = train_df.sample(frac=1)

# Train, Test 분리
train = shuffle[0:15000].reset_index(drop=True)
test = shuffle[15000:20000].reset_index(drop=True)

# 토큰 사전
document_bert = ["[CLS] " + str(s) + " [SEP]" for s in train.review]
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(s) for s in document_bert]

# 패딩
MAX_LEN = 512
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype='long', truncating='post', padding='post')

# 성능을 위한 마스킹
attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961828.0, style=ProgressStyle(descript…




In [17]:
# Validation split
train_inputs, validation_inputs, train_labels, validation_labels = \
    train_test_split(input_ids, train['label'].values, random_state=1, test_size=0.1)
train_masks, validation_masks, _, _ = \
    train_test_split(attention_masks, input_ids, random_state=1, test_size=0.1)

In [18]:
# To Tensor
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

In [29]:
# To Loader
BATCH_SIZE = 8

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=BATCH_SIZE)

In [30]:
# TestSet
sentences = test['review']
sentences = ["[CLS] " + str(sentence) + " [SEP]" for sentence in sentences]
labels = test['label'].values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)

In [31]:
test_inputs.shape, test_labels.shape, test_masks.shape

(torch.Size([5000, 512]), torch.Size([5000]), torch.Size([5000, 512]))

In [32]:
train_inputs.shape, train_labels.shape, train_masks.shape

(torch.Size([13500, 512]), torch.Size([13500]), torch.Size([13500, 512]))

In [33]:
validation_inputs.shape, validation_labels.shape, validation_masks.shape

(torch.Size([1500, 512]), torch.Size([1500]), torch.Size([1500, 512]))

In [34]:
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=4)
model.cuda()

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [35]:
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
epochs = 2
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)

In [36]:
# 정확도 계산 함수
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# 시간 표시 함수
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [37]:
import gc
gc.collect()

486

In [38]:
# 재현을 위해 랜덤시드 고정
seed_val = 1
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# cuda delete cache
torch.cuda.empty_cache()

# 그래디언트 초기화
model.zero_grad()

# 에폭만큼 반복
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # 시작 시간 설정
    t0 = time.time()

    # 로스 초기화
    total_loss = 0

    # 훈련모드로 변경
    model.train()
        
    # 데이터로더에서 배치만큼 반복하여 가져옴
    for step, batch in enumerate(tqdm(train_dataloader, desc='[Train] ')):
        # 경과 정보 표시
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)

        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch

        # Forward 수행                
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        # 로스 구함
        loss = outputs[0]

        # 총 로스 계산
        total_loss += loss.item()

        # Backward 수행으로 그래디언트 계산
        loss.backward()

        # 그래디언트 클리핑
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # 그래디언트를 통해 가중치 파라미터 업데이트
        optimizer.step()

        # 스케줄러로 학습률 감소
        scheduler.step()

        # 그래디언트 초기화
        model.zero_grad()

    # 평균 로스 계산
    avg_train_loss = total_loss / len(train_dataloader)            

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    #시작 시간 설정
    t0 = time.time()

    # 평가모드로 변경
    model.eval()

    # 변수 초기화
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # 데이터로더에서 배치만큼 반복하여 가져옴
    for batch in tqdm(validation_dataloader, desc="[Validation]"):
        # 배치를 GPU에 넣음
        batch = tuple(t.to(device) for t in batch)
        
        # 배치에서 데이터 추출
        b_input_ids, b_input_mask, b_labels = batch
        
        # 그래디언트 계산 안함
        with torch.no_grad():     
            # Forward 수행
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # 로스 구함
        logits = outputs[0]

        # CPU로 데이터 이동
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # 출력 로짓과 라벨을 비교하여 정확도 계산
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...


HBox(children=(FloatProgress(value=0.0, description='[Train] ', max=1688.0, style=ProgressStyle(description_wi…

  Batch   500  of  1,688.    Elapsed: 0:12:26.
  Batch 1,000  of  1,688.    Elapsed: 0:24:55.
  Batch 1,500  of  1,688.    Elapsed: 0:37:23.


  Average training loss: 0.89
  Training epcoh took: 0:42:04

Running Validation...


HBox(children=(FloatProgress(value=0.0, description='[Validation]', max=188.0, style=ProgressStyle(description…


  Accuracy: 0.73
  Validation took: 0:01:38

Training...


HBox(children=(FloatProgress(value=0.0, description='[Train] ', max=1688.0, style=ProgressStyle(description_wi…

  Batch   500  of  1,688.    Elapsed: 0:12:28.
  Batch 1,000  of  1,688.    Elapsed: 0:24:55.
  Batch 1,500  of  1,688.    Elapsed: 0:37:23.


  Average training loss: 0.56
  Training epcoh took: 0:42:03

Running Validation...


HBox(children=(FloatProgress(value=0.0, description='[Validation]', max=188.0, style=ProgressStyle(description…


  Accuracy: 0.75
  Validation took: 0:01:38

Training complete!


In [42]:
#시작 시간 설정
t0 = time.time()

# 평가모드로 변경
model.eval()

# 변수 초기화
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# 데이터로더에서 배치만큼 반복하여 가져옴
for step, batch in enumerate(tqdm(test_dataloader, desc='[step]')):
    # 경과 정보 표시
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(test_dataloader), elapsed))

    # 배치를 GPU에 넣음
    batch = tuple(t.to(device) for t in batch)
    
    # 배치에서 데이터 추출
    b_input_ids, b_input_mask, b_labels = batch
    
    # 그래디언트 계산 안함
    with torch.no_grad():     
        # Forward 수행
        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # 출력 로짓과 라벨을 비교하여 정확도 계산
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print("")
print("Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print("Test took: {:}".format(format_time(time.time() - t0)))

HBox(children=(FloatProgress(value=0.0, description='[step]', max=625.0, style=ProgressStyle(description_width…

  Batch   100  of    625.    Elapsed: 0:00:51.
  Batch   200  of    625.    Elapsed: 0:01:43.
  Batch   300  of    625.    Elapsed: 0:02:36.
  Batch   400  of    625.    Elapsed: 0:03:28.
  Batch   500  of    625.    Elapsed: 0:04:20.
  Batch   600  of    625.    Elapsed: 0:05:12.


Accuracy: 0.77
Test took: 0:05:25


In [56]:
def evaluate(text):
    model.eval()

    m = {
        0: 'A type',
        1: 'B type',
        2: 'C type',
        3: 'D type'
    }

    sentence = text
    sentences = ["[CLS] " + sentence + " [SEP]"]

    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    test_inputs = torch.tensor(input_ids).to(device)
    test_masks = torch.tensor(attention_masks).to(device)

    outputs = model(test_inputs, token_type_ids=None, attention_mask=test_masks)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    print('this text predicted :', m[logits.argmax(1)[0]])

In [61]:
evaluate("i feel the wind while looking at the night sky in summer.")
evaluate("it's like having a strong spice in your mouth.")
evaluate("it looks like it's rolling on a wooden floor.")

this text predicted : A type
this text predicted : B type
this text predicted : C type


In [68]:
from transformers import AutoModelForSequenceClassification
model.save_pretrained("lunab_model_21_06_29")
pytorch_model = AutoModelForSequenceClassification.from_pretrained("lunab_model_21_06_29")

In [72]:
from transformers import AutoTokenizer
tokenizer.save_pretrained("lunab_tokenizer_21_06_29")
pytorch_tokenizer = AutoTokenizer.from_pretrained("lunab_tokenizer_21_06_29")

In [73]:
def evaluate_load(text):
    pytorch_model.cuda()
    pytorch_model.eval()

    m = {
        0: 'A type',
        1: 'B type',
        2: 'C type',
        3: 'D type'
    }

    sentence = text
    sentences = ["[CLS] " + sentence + " [SEP]"]

    tokenized_texts = [pytorch_tokenizer.tokenize(sent) for sent in sentences]

    input_ids = [pytorch_tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    test_inputs = torch.tensor(input_ids).to(device)
    test_masks = torch.tensor(attention_masks).to(device)

    outputs = pytorch_model(test_inputs, token_type_ids=None, attention_mask=test_masks)
    
    # 로스 구함
    logits = outputs[0]

    # CPU로 데이터 이동
    logits = logits.detach().cpu().numpy()
    print('this text predicted :', m[logits.argmax(1)[0]])

In [74]:
evaluate_load("i feel the wind while looking at the night sky in summer.")
evaluate_load("it's like having a strong spice in your mouth.")
evaluate_load("it looks like it's rolling on a wooden floor.")

this text predicted : A type
this text predicted : B type
this text predicted : C type


In [75]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [77]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

model.save_pretrained("/content/drive/MyDrive/SAI/p/lunab_model_21_06_29")
tokenizer.save_pretrained("/content/drive/MyDrive/SAI/p/lunab_tokenizer_21_06_29")

print('Done...')

Done...
