In [8]:
# !pip install datasets gpytorch transformers

In [4]:
# !wget https://www.dropbox.com/s/d7y0otn695k5lu4/amazon-review.zip

In [5]:
!unzip amazon-review.zip

Archive:  amazon-review.zip
   creating: amazon-review/
  inflating: amazon-review/books.csv  
  inflating: amazon-review/books-UL.csv  
  inflating: amazon-review/dvd-UL.csv  
  inflating: amazon-review/dvd.csv   
  inflating: amazon-review/electronics-UL.csv  
  inflating: amazon-review/electronics.csv  
  inflating: amazon-review/kitchen_housewares-UL.csv  
  inflating: amazon-review/kitchen_housewares.csv  


In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertTokenizer, BertTokenizerFast, BertModel
from datasets import load_dataset
import os
from gpytorch.kernels.rq_kernel import RQKernel
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [2]:
import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from functools import partial
import matplotlib.pyplot as plt
import numpy as np
import pylab


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Custom Dataset

In [4]:
class ReviewDataset(Dataset):
    def __init__(self, data, is_unlabelled=False, model_name = "bert-base-uncased", max_len=512):
        self.data = data
        self.max_len = max_len
        self.is_unlabelled = is_unlabelled
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

    def __getitem__(self, index):
        
        review = self.data['review_text'][index]
        
        encoded_input = self.tokenizer.encode_plus(
                review,
                add_special_tokens=True,
                max_length= self.max_len,
                pad_to_max_length=True,
                return_overflowing_tokens=True,
                truncation=True,
                padding='max_length'
            )

        input_ids = encoded_input["input_ids"]
        attention_mask = encoded_input["attention_mask"] if "attention_mask" in encoded_input else None
        token_type_ids = encoded_input["token_type_ids"] if "token_type_ids" in encoded_input else None


        if self.is_unlabelled:
            data_input = {
                "input_ids": torch.tensor(input_ids).to(device),
                "attention_mask": torch.tensor(attention_mask).to(device),
                "token_type_ids": torch.tensor(token_type_ids).to(device),
            }
        else:
            sentiment = self.data["sentiment"][index]
            data_input = {
                "input_ids": torch.tensor(input_ids).to(device),
                "attention_mask": torch.tensor(attention_mask).to(device),
                "token_type_ids": torch.tensor(token_type_ids).to(device),
                "label": torch.tensor(sentiment, dtype=torch.long).to(device),
            }
        return data_input


    def __len__(self):
        return len(self.data['review_text'])


#### Training Config

In [5]:
training_config = {
    "model_name":"bert-base-uncased",
    "train_size":80,
    "max_len":256,
    "hidden_size":768,
    "batch_size":2,
    "lr":1e-5,
    "epoch":10,
    "steps_to_print":100,
}

In [6]:
root = "./amazon-review/"
# domains = ["books", "dvd", "electronics", "kitchen_housewares"]
source_file = "books"
target_val_file = "dvd"
target_train_file = "dvd-UL"

In [7]:
src = load_dataset("csv", data_files=root+source_file+".csv")['train']
trg_train_data = load_dataset("csv", data_files=root+target_train_file+".csv")['train']
trg_val_data = load_dataset("csv", data_files=root+target_val_file+".csv")['train']

source_data = ReviewDataset(data=src)
trg_train = ReviewDataset(data=trg_train_data, is_unlabelled=True)
trg_val = ReviewDataset(data=trg_val_data) 

src_train, src_val = torch.utils.data.random_split(dataset=source_data, lengths=[int(len(source_data)*0.80), len(source_data)-int(len(source_data)*0.80)], generator=torch.Generator().manual_seed(42))
# trg_train, trg_val = torch.utils.data.random_split(dataset=target_data, lengths=[int(len(target_data)*0.80), len(target_data)-int(len(target_data)*0.80)], generator=torch.Generator().manual_seed(42))


Using custom data configuration default
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-25926b895a54667a/0.0.0/49187751790fa4d820300fd4d0707896e5b941f1a9c644652645b866716a4ac4)
Using custom data configuration default
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-90bc47a32619b67a/0.0.0/49187751790fa4d820300fd4d0707896e5b941f1a9c644652645b866716a4ac4)
Using custom data configuration default
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-f6e4771184a1405b/0.0.0/49187751790fa4d820300fd4d0707896e5b941f1a9c644652645b866716a4ac4)


In [8]:
src_trainlaoder = DataLoader(
    dataset=src_train,
    batch_size=training_config['batch_size'],
    shuffle=True,
)
src_vallaoder = DataLoader(
    dataset=src_val,
    batch_size=training_config['batch_size'],
    shuffle=False,
)

trg_trainlaoder = DataLoader(
    dataset=trg_train,
    batch_size=training_config['batch_size'],
    shuffle=True,
)
trg_vallaoder = DataLoader(
    dataset=trg_val,
    batch_size=training_config['batch_size'],
    shuffle=False,
)

## Model

In [9]:
class FeatureExtractor(nn.Module):
    
    def __init__(self, model_name):
        super(FeatureExtractor, self).__init__()
        self.bert = BertModel.from_pretrained(
            pretrained_model_name_or_path=model_name,
            output_hidden_states=True
        )
    
    def forward(self, input_ids, attention_mask=None):
        
        top_layer, pooled_output, layers = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        return top_layer, pooled_output, layers
        
        

In [10]:
class Classifier(nn.Module):
    
    def __init__(self, hidden_size=768, num_classes=2):
        super(Classifier, self).__init__()
        
        self.fc1 = nn.Linear(in_features=hidden_size, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.out = nn.Linear(in_features=128, out_features=num_classes)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        fc1_out = self.relu(self.fc1(x))
        fc2_out = self.relu(self.fc2(fc1_out))
        logits = self.out(fc2_out)
        return fc1_out, fc2_out, logits
        

In [11]:
common_net = FeatureExtractor(model_name=training_config['model_name']).to(device)
src_net = Classifier().to(device)
trg_net = Classifier().to(device)

In [12]:
optimizer = optim.SGD([{'params': common_net.parameters()},
                       {'params': src_net.parameters()},
                       {'params': trg_net.parameters()}], lr= training_config['lr'])

criterion = nn.CrossEntropyLoss()

In [13]:
def pairwise_distance(x, y):

    if not len(x.shape) == len(y.shape) == 2:
        raise ValueError('Both inputs should be matrices.')

    if x.shape[1] != y.shape[1]:
        raise ValueError('The number of features should be the same.')

    x = x.view(x.shape[0], x.shape[1], 1)
    y = torch.transpose(y, 0, 1)
    output = torch.sum((x - y) ** 2, 1)
    output = torch.transpose(output, 0, 1)

    return output

def gaussian_kernel_matrix(x, y, sigmas):

    sigmas = sigmas.view(sigmas.shape[0], 1)
    beta = 1. / (2. * sigmas)
    dist = pairwise_distance(x, y).contiguous()
    dist_ = dist.view(1, -1)
    s = torch.matmul(beta, dist_)

    return torch.sum(torch.exp(-s), 0).view_as(dist)

def maximum_mean_discrepancy(x, y, kernel= gaussian_kernel_matrix):

    cost = torch.mean(kernel(x, x))
    cost += torch.mean(kernel(y, y))
    cost -= 2 * torch.mean(kernel(x, y))

    return cost

def mmd_loss(source_features, target_features):

    sigmas = [
        1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 15, 20, 25, 30, 35, 100,
        1e3, 1e4, 1e5, 1e6
    ]
#     if params.use_gpu:
#         gaussian_kernel = partial(
#             gaussian_kernel_matrix, sigmas = Variable(torch.cuda.FloatTensor(sigmas))
#         )
#     else:
    gaussian_kernel = partial(
        gaussian_kernel_matrix, sigmas = Variable(torch.cuda.FloatTensor(sigmas))
    )
    loss_value = maximum_mean_discrepancy(source_features, target_features, kernel= gaussian_kernel)
    loss_value = loss_value

    return loss_value

In [14]:
def accuracy(y, y_):
    correct = 0
    for i in range(y.shape[0]):
        if y[i]==y_[i]:
            correct += 1
            
    return correct/y.shape[0]

In [15]:
def eval(common_net, src_net, dataloader):
    acc = []
    
    common_net.eval()
    src_net.eval()
    
    for batch in tqdm(dataloader):
        _, features, _ = common_net(batch['input_ids'], batch['attention_mask'])
        _, _, logits = src_net(features)
        acc.append(accuracy(batch['label'], logits.argmax(dim=1)))
    return sum(acc)/len(dataloader)

In [16]:
# print(eval(common_net, src_net, trg_vallaoder))

In [17]:
def train(common_net, src_net, trg_net, optimizer, criterion, epoch,
          source_dataloader, target_dataloader, train_hist):

    common_net.train()
    src_net.train()
    trg_net.train()

    start_steps = epoch * len(source_dataloader)
    total_steps = training_config['epoch'] * len(source_dataloader)

    source_iter = iter(source_dataloader)
    target_iter = iter(target_dataloader)

    start = 0
    for batch_idx in range(min(len(source_dataloader), len(target_dataloader))):
        # get data
        sdata = next(source_iter)
        tdata = next(target_iter)


        optimizer.zero_grad()

        _, src_features, _ = common_net(sdata['input_ids'], sdata['attention_mask'])
        _, trg_features, _ = common_net(tdata['input_ids'], tdata['attention_mask'])
        
        src_fc1_out, src_fc2_out, src_logits = src_net(src_features)
        trg_fc1_out, trg_fc2_out, trg_logits = trg_net(trg_features)
        
        class_loss = criterion(src_logits, sdata['label'])
        mmd = 0.5*mmd_loss(src_features, trg_features) + 0.5*mmd_loss(src_fc1_out, trg_fc1_out) + 0.5*mmd_loss(src_fc2_out, trg_fc2_out) +  0.5*mmd_loss(src_logits, trg_logits)
        
        

        loss = class_loss + mmd
        loss.backward()
        optimizer.step()
        step = epoch * len(target_dataloader) + batch_idx
        start += 1
        
        if (start) % training_config['steps_to_print'] == 0:
            src_acc = eval(common_net, src_net, src_vallaoder)
            trg_acc = eval(common_net, src_net, trg_vallaoder)
            print(f'{start}/{total_steps} | Total_loss = {loss.item():.4f} | Class_loss = {class_loss.item():.4f} | MMD_loss = {mmd.item():.2f} | SRC_acc = {src_acc:.4f} | TRG_acc = {trg_acc:.4f}')
        train_hist['Total_loss'].append(loss.cpu().item())
        train_hist['Class_loss'].append(class_loss.cpu().item())
        train_hist['MMD_loss'].append(mmd.cpu().item())

In [18]:
train_hist = {}
train_hist['Total_loss'] = []
train_hist['Class_loss'] = []
train_hist['MMD_loss'] = []

In [19]:
train(
    common_net=common_net,
    src_net=src_net,
    trg_net=trg_net,
    optimizer=optimizer,
    criterion=criterion,
    epoch=training_config['epoch'],
    source_dataloader=src_trainlaoder,
    target_dataloader=trg_trainlaoder,
    train_hist=train_hist
)

100%|██████████| 199/199 [00:19<00:00, 10.37it/s]
100%|██████████| 989/989 [01:35<00:00, 10.35it/s]


100/7950 | Total_loss = 18.6403 | Class_loss = 0.7006 | MMD_loss = 17.94 | SRC_acc = 0.5101 | TRG_acc = 0.4990


100%|██████████| 199/199 [00:19<00:00, 10.33it/s]
100%|██████████| 989/989 [01:36<00:00, 10.30it/s]


200/7950 | Total_loss = 17.6674 | Class_loss = 0.6853 | MMD_loss = 16.98 | SRC_acc = 0.5101 | TRG_acc = 0.4990


100%|██████████| 199/199 [00:19<00:00, 10.27it/s]
100%|██████████| 989/989 [01:35<00:00, 10.32it/s]


300/7950 | Total_loss = 19.1500 | Class_loss = 0.7580 | MMD_loss = 18.39 | SRC_acc = 0.5101 | TRG_acc = 0.4990


100%|██████████| 199/199 [00:19<00:00, 10.32it/s]
100%|██████████| 989/989 [01:35<00:00, 10.36it/s]


400/7950 | Total_loss = 15.9169 | Class_loss = 0.7301 | MMD_loss = 15.19 | SRC_acc = 0.5101 | TRG_acc = 0.4990


100%|██████████| 199/199 [00:19<00:00, 10.34it/s]
100%|██████████| 989/989 [01:35<00:00, 10.32it/s]


500/7950 | Total_loss = 15.4561 | Class_loss = 0.6551 | MMD_loss = 14.80 | SRC_acc = 0.5101 | TRG_acc = 0.4990


100%|██████████| 199/199 [00:19<00:00, 10.26it/s]
100%|██████████| 989/989 [01:35<00:00, 10.31it/s]


600/7950 | Total_loss = 14.5536 | Class_loss = 0.6987 | MMD_loss = 13.85 | SRC_acc = 0.5101 | TRG_acc = 0.4990


100%|██████████| 199/199 [00:19<00:00, 10.32it/s]
100%|██████████| 989/989 [01:35<00:00, 10.32it/s]


700/7950 | Total_loss = 14.1620 | Class_loss = 0.6972 | MMD_loss = 13.46 | SRC_acc = 0.5101 | TRG_acc = 0.4990
