In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch import Generator
from peft import LoraConfig, TaskType
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, ConcatDataset, Subset, random_split, RandomSampler, TensorDataset
# transformer
from transformers.optimization import AdamW, get_scheduler, SchedulerType
# native
from NlpAnalytics import *

# Root folder that the data-loading cell joins with `data/<DATASET_NAME>/...`.
# Defaults to the original author's Google Drive mount; set the NLP_DATA_ROOT
# environment variable to run the notebook on another machine without editing
# this cell.
PATH = os.environ.get(
    'NLP_DATA_ROOT',
    '/Users/lunli/Library/CloudStorage/GoogleDrive-yaojn19880525@gmail.com/My Drive/Colab Notebooks/',
)
# Name of the dataset sub-folder under `data/`.
DATASET_NAME = 'amazon'


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to /Users/lunli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the labelled training set, the validation set and the two unlabelled
# (original text, augmented text) files.
df_train = pd.read_csv(os.path.join(PATH, f'data/{DATASET_NAME}/sup_data_intent.csv'))
df_valid = pd.read_csv(os.path.join(PATH, f'data/{DATASET_NAME}/test_intent.csv'))

# Encode the validation labels with the category order derived from the raw
# training labels. `astype('category')` on the training column yields the same
# category order that is used when df_train['intent'] is later converted to
# codes, so a given intent maps to the same integer in both frames. Encoding
# each frame independently only agrees when both contain exactly the same set
# of labels; otherwise the codes silently shift.
intent_categories = df_train['intent'].astype('category').cat.categories
df_valid['intent'] = pd.Categorical(df_valid['intent'], categories=intent_categories).codes
# pd.Categorical assigns -1 to values outside `categories` (and to missing
# values); fail loudly instead of training/evaluating against a bogus class id.
if (df_valid['intent'] < 0).any():
    raise ValueError(
        "df_valid contains intent labels that are missing or do not occur in df_train; "
        "cannot encode them consistently with the training labels."
    )
# df_test = pd.read_csv(os.path.join(PATH, f'data/{DATASET_NAME}/amazon_test.csv'))
df_unsup_1 = pd.read_csv(os.path.join(PATH, f'data/{DATASET_NAME}/unsup_data_intent_1.csv'))
df_unsup_2 = pd.read_csv(os.path.join(PATH, f'data/{DATASET_NAME}/unsup_data_intent_2.csv'))

In [3]:
# Merge the two unlabelled files into a single (ori_text, aug_text) frame.
# The first file's headers are overwritten so that both frames share column
# names before concatenation (assumes df_unsup_2 already uses these names —
# the two-column rename below would fail otherwise).
df_unsup_1.columns = ['ori_text', 'aug_text_2']
df_unsup = (
    pd.concat([df_unsup_1, df_unsup_2])
    .drop_duplicates()            # remove repeated (original, augmented) pairs
    .reset_index(drop=True)       # fresh 0..n-1 index, old index discarded
)
# Final column names used by the rest of the notebook.
df_unsup.columns = ['ori_text', 'aug_text']
df_unsup


Unnamed: 0,ori_text,aug_text
0,add the artist choclair to la mejor música de bso,'add artist to la mejor de bso'
1,rate this book four stars out of 6,'rate this book stars of'
2,add tune to punk español,'add music to'
3,play deezer form 2010 tune by dave grohl,'Deezer shape air dave grohl'
4,i d like to go to a halal restaurant in twenty...,'I like to go to a halal restaurant in twenty ...
...,...,...
27270,play some music from the thirties,'a part of the music of the'
27271,can you get me seating for a party of 4,'can you me seating a party of'
27272,rate the previous textbook a 4 out of 6,'evaluate the previous manual has 4 out of 6'
27273,add this artist to the this is dirty projector...,'add this artist to the list of dirty project...


In [4]:
# Remove the extra 'level_1' column from df_train. `errors='ignore'` keeps the
# cell idempotent: on a second run the column is already gone and the original
# positional drop would raise KeyError.
df_train = df_train.drop(columns='level_1', errors='ignore')
# Replace the intent labels with their integer category codes.
df_train['intent'] = df_train['intent'].astype('category').cat.codes
df_train

Unnamed: 0,intent,text
0,0,add the lady bunny album to décadas
1,0,add blag dahlia to pura vida
2,0,add animal stories to maryanne s by per yngve ...
3,0,can you add something by gregori chad petree t...
4,0,add this song onto hip hop gaming playlist
...,...,...
72,6,in the neighborhood find movie times for movies
73,6,show me movie times at my local theater
74,6,what films are scheduled around here
75,6,what is the movie schedules at consolidated th...


In [5]:
# Tokenize the supervised and unsupervised texts together in one pass.
# The two sizes are recorded so the combined dataset can be cut back into its
# segments by position afterwards.
sup_size = len(df_train)
unsup_size = len(df_unsup)

### Load tokenizer
tokenizer = BertLoader(load_tokenizer=True).tokenizer

# Row order is: supervised texts, then unsupervised originals, then
# unsupervised augmentations. split_sup_unsup relies on exactly this order.
sup_unsup = pd.DataFrame(
    df_train['text'].tolist()
    + df_unsup['ori_text'].tolist()
    + df_unsup['aug_text'].tolist(),
    columns=['text'],
)
# No label columns: this dataset only carries the tokenized 'text' column.
sup_unsup_ = DatasetNLP(
    input_df=sup_unsup,
    tokenizer=tokenizer,
    cols_to_tokenize=['text'],
    cols_label=[],
)



In [6]:
# split the combined dataset back into sup and unsup parts


def split_sup_unsup(sup_unsup_, sup_size, unsup_size):
    """Cut a combined, position-ordered dataset back into three segments.

    The iterable is expected to hold the supervised items first, then the
    original unsupervised items, then the augmented unsupervised items.
    For every item, element 0 is collected into the segment's input-id list
    and element 1 into the segment's mask list.

    Args:
        sup_unsup_: iterable of indexable items (at least two elements each).
        sup_size: number of leading items belonging to the supervised segment.
        unsup_size: number of items following them that belong to the
            original unsupervised segment.

    Returns:
        A 6-tuple of lists:
        (input_ids_sup, mask_sup,
         input_ids_unsup_ori, mask_unsup_ori,
         input_ids_unsup_aug, mask_unsup_aug).
        Every item past position ``sup_size + unsup_size`` lands in the
        augmented lists, however many there are.
    """
    ori_end = sup_size + unsup_size

    # One (input_ids, mask) pair of lists per segment.
    sup_lists = ([], [])
    ori_lists = ([], [])
    aug_lists = ([], [])

    for position, item in enumerate(sup_unsup_):
        # Pick the destination segment from the item's position alone.
        if position < sup_size:
            ids_out, mask_out = sup_lists
        elif position < ori_end:
            ids_out, mask_out = ori_lists
        else:
            ids_out, mask_out = aug_lists
        ids_out.append(item[0])
        mask_out.append(item[1])

    return (
        sup_lists[0], sup_lists[1],
        ori_lists[0], ori_lists[1],
        aug_lists[0], aug_lists[1],
    )

# Recover the per-segment token ids and masks from the jointly tokenized data.
(
    input_ids_sup, mask_sup,
    input_ids_unsup_ori, mask_unsup_ori,
    input_ids_unsup_aug, mask_unsup_aug,
) = split_sup_unsup(sup_unsup_, sup_size, unsup_size)

In [7]:
### Load tokenizer
tokenizer = BertLoader(load_tokenizer=True).tokenizer

# Validation set: tokenize the 'text' column and use 'intent' as the label.
valid_data = DatasetNLP(
    input_df=df_valid,
    tokenizer=tokenizer,
    cols_to_tokenize=['text'],
    cols_label=['intent'],
)
# A separate test set is not built at the moment:
# test_data = DatasetNLP(input_df=df_test,
#                     tokenizer=tokenizer,
#                     cols_to_tokenize=['text'],
#                     cols_label=['label'] )

# Supervised training set: (input ids, mask, integer intent label) per example.
train_sup_data_ = TensorDataset(
    torch.stack(input_ids_sup),
    torch.stack(mask_sup),
    torch.LongTensor(df_train['intent'].tolist()),
)
# Unsupervised training set: (ids, mask) of the original text followed by
# (ids, mask) of its augmented counterpart — four tensors per example.
train_unsup_data = TensorDataset(
    torch.stack(input_ids_unsup_ori),
    torch.stack(mask_unsup_ori),
    torch.stack(input_ids_unsup_aug),
    torch.stack(mask_unsup_aug),
)


In [15]:
# Sanity check: train_unsup_data is built from four tensors (ids and mask for
# the original text, ids and mask for the augmented text), so every example
# must expose four fields. Enforce it instead of only eyeballing the output.
assert len(train_unsup_data[0]) == 4, "expected 4 tensors per unsupervised example"
len(train_unsup_data[0])

4

In [76]:
# Wrap the datasets in dataloaders. All samplers draw from one seeded
# generator.
generator = Generator().manual_seed(42)


def _shuffled_loader(dataset, batch_size):
    """Return a DataLoader over `dataset` that samples randomly via the shared generator."""
    return DataLoader(
        dataset,
        sampler=RandomSampler(dataset, generator=generator),
        batch_size=batch_size,
    )


train_sup_dataloader = _shuffled_loader(train_sup_data_, 8)
train_unsup_dataloader = _shuffled_loader(train_unsup_data, 32)
valid_dataloader = _shuffled_loader(valid_data, 32)
# NOTE: no separate test set is loaded, so the test loader reuses valid_data.
test_dataloader = _shuffled_loader(valid_data, 32)


In [77]:
# Trainer setup and run.
### load HF BERT Classifier
# One output class per distinct intent code in the training set.
num_labels = len(df_train['intent'].unique())
loader = BertClassifierLoader(
    ClassifierType.BERT_CLASSIFIER_HF,
    "bert-base-uncased",
    num_labels,
    0.1,  # presumably a dropout rate — TODO confirm against BertClassifierLoader
    load_tokenizer=True,
)

# Dataloaders keyed by the role the trainer looks them up under.
datamodeler = {
    DataLoaderType.TRAINING: train_sup_dataloader,
    DataLoaderType.VALIDATION: valid_dataloader,
    DataLoaderType.TESTING: test_dataloader,
    DataLoaderType.TRAINING_UNLABELED: train_unsup_dataloader,
}

# Cross-entropy for the supervised branch, KL divergence for the unsupervised one.
loss_sup = get_loss_functions(LossFuncType.CROSS_ENTROPY)
loss_unsup = get_loss_functions(LossFuncType.KL_DIV)
loss_dict = {'sup': loss_sup, 'unsup': loss_unsup}

##### no lora ####
# NOTE(review): the dict presumably selects which parts of the model are
# frozen/trainable — confirm against AdamNLP.newNLPAdam.
optimizer = AdamNLP.newNLPAdam(loader.model, {'embeddings': True, 'encoder': 9}, lr=2e-4)
model = optimizer.get_model_transformed()
##### lora #####
# lora_config = LoraConfig(task_type=TaskType.SEQ_CLS,target_modules=["query", "key", "value"], r=1, lora_alpha=1, lora_dropout=0.1)
# optimizer = AdamNLP.newNLPAdam_LORA(loader.model, lora_config)
# model = optimizer.get_model_transformed()

trainer = TrainerUDA(model, datamodeler, loss_dict, optimizer)
# Train for 3 epochs (the progress bar counts up to 3).
trainer.train(3, schedule_type=SchedulerType.INVERSE_SQRT, save_model_freq=-1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


At step 100, the training loss is 0.24158695784397424.
At step 200, the training loss is 0.1620826621935703.
At step 300, the training loss is 0.12302737587752442.
At step 400, the training loss is 0.09842588788131251.
At step 500, the training loss is 0.08394889936130494.
At step 600, the training loss is 0.07986806775831307.
At step 700, the training loss is 0.08585280282489423.
At step 800, the training loss is 0.08192687783681322.
For epoch 1, the mean sup loss is: 0.07900058039041521, and accuracy is: 0.7657177448272705.


Epoch:  33%|███▎      | 1/3 [10:05<20:10, 605.37s/it]

Validation accuracy is: 0.8942857384681702.

At step 100, the training loss is 0.02577314996859059.
At step 200, the training loss is 0.02551072683534585.
At step 300, the training loss is 0.03127870161474372.
At step 400, the training loss is 0.02820278826111462.
At step 500, the training loss is 0.03176299004023895.
At step 600, the training loss is 0.03705374830053188.
At step 700, the training loss is 0.03809330270326297.
At step 800, the training loss is 0.037112645038578196.
For epoch 2, the mean sup loss is: 0.03657620057713673, and accuracy is: 0.982036828994751.


Epoch:  67%|██████▋   | 2/3 [20:31<10:17, 617.36s/it]

Validation accuracy is: 0.9285714030265808.

At step 100, the training loss is 0.02405446336604655.
At step 200, the training loss is 0.023642566931666806.


Epoch:  67%|██████▋   | 2/3 [23:31<11:45, 705.94s/it]


KeyboardInterrupt: 