# Import statements

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertTokenizer, BertTokenizerFast, BertModel
import numpy as np
import math
import os
import pandas as pd
import torch.nn as nn



# Select device

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Parameters 

In [18]:
config = {
    "num_labels": 2,
    "hidden_dropout_prob": 0.15,
    "hidden_size": 768,
    "max_length": 512,
}

training_parameters = {
    "batch_size": 2,
    "epochs": 10,
    "output_folder": "../working/",
    "output_file": "model.bin",
    "learning_rate": 1e-5,
    "print_after_steps": 100,
    "save_steps": 5000,

}

# Define Dataset and dataloaders

In [19]:
class ReviewDataset(Dataset):
    def __init__(self, df, model_name = "bert-base-uncased"):
        self.df = df
        self.config = BertConfig.from_pretrained("bert-base-uncased", output_hidden_states=True)
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

    def __getitem__(self, index):
        review = self.df.iloc[index]["review_text"]
        sentiment = self.df.iloc[index]["sentiment"]

        encoded_input = self.tokenizer.encode_plus(
                review,
                add_special_tokens=True,
                max_length= config["max_length"],
                pad_to_max_length=True,
                return_overflowing_tokens=True,
                truncation=True,
                padding='max_length'
            )

        input_ids = encoded_input["input_ids"]
        attention_mask = encoded_input["attention_mask"] if "attention_mask" in encoded_input else None
        token_type_ids = encoded_input["token_type_ids"] if "token_type_ids" in encoded_input else None



        data_input = {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor(attention_mask),
            "token_type_ids": torch.tensor(token_type_ids),
            "label": torch.tensor(sentiment, dtype=torch.long),
        }

        return data_input["input_ids"], data_input["attention_mask"], data_input["token_type_ids"], data_input["label"]



    def __len__(self):
        return self.df.shape[0]



In [20]:
root = "../input/amazonproductsreview/amazon-review/"
# domains = ["books", "dvd", "electronics", "kitchen_housewares"]
source = "books"
target = "dvd"

In [21]:
source_df = pd.read_csv(root+source+".csv", nrows=1500, usecols=['review_text', 'sentiment'])
source_dataset = ReviewDataset(source_df)
source_dataloader = DataLoader(dataset = source_dataset, batch_size = training_parameters["batch_size"], num_workers=4, shuffle = True)

target_df = pd.read_csv(root+target+".csv", nrows=1500, usecols=['review_text', 'sentiment'])
target_dataset = ReviewDataset(target_df)
target_dataloader = DataLoader(dataset = target_dataset, batch_size = training_parameters["batch_size"], num_workers=4, shuffle = False)

In [22]:
print(source_df.shape, target_df.shape)

(1500, 2) (1500, 2)


In [23]:
# source_df.head(20)

# Gradient Reversal Function 

In [24]:
from torch.autograd import Function


class GradientReversalFn(Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha

        return output, None


# Defining Model

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim

class DomainAdaptationModel(nn.Module):
    def __init__(self):
        super(DomainAdaptationModel, self).__init__()
        
        num_labels = config["num_labels"]
        self.bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
        self.dropout = nn.Dropout(config["hidden_dropout_prob"])
        self.sentiment_classifier = nn.Sequential(
            nn.Linear(config["hidden_size"], num_labels),
            nn.LogSoftmax(dim=1),
        )
        self.domain_classifier = nn.Sequential(
            nn.Linear(config["hidden_size"], 2),
            nn.LogSoftmax(dim=1),
        )


    def forward(
          self,
          input_ids=None,
          attention_mask=None,
          token_type_ids=None,
          labels=None,
          grl_lambda = 1.0, 
          ):

        top_layer, pooled_output, layers = self.bert(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

#         top_layer, pooled, layers = self.base(input_ids, attention_mask)
#         pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)

        reversed_pooled_output = GradientReversalFn.apply(pooled_output, grl_lambda)

        sentiment_pred = self.sentiment_classifier(pooled_output)
        domain_pred = self.domain_classifier(reversed_pooled_output)
        
        return layers, pooled_output, sentiment_pred.to(device), domain_pred.to(device)

# Computer Accuracy

In [26]:
def compute_accuracy(logits, labels):
    
    predicted_labels_dict = {
      0: 0,
      1: 0,
    }
    
    predicted_label = logits.max(dim = 1)[1]
    
    for pred in predicted_label:
        predicted_labels_dict[pred.item()] += 1
    acc = (predicted_label == labels).float().mean()
    
    return acc, predicted_labels_dict

# Evaluate the model

In [27]:
def evaluate(model, dataset = "electronics", percentage = 100):
    with torch.no_grad():
        predicted_labels_dict = {                                                   
          0: 0,                                                                     
          1: 0,                                                                     
        }
        
        dev_df = pd.read_csv(root+dataset+".csv")
        dev_df = dev_df[1500:]
        dev_df = dev_df.reset_index(drop=True)
#         data_size = dev_df.shape[0]
#         selected_for_evaluation = int(data_size*percentage/100)
#         dev_df = dev_df.head(selected_for_evaluation)
                             
#         dev_df = dev_df.iloc[1500:, :]
        dataset = ReviewDataset(dev_df)

        dataloader = DataLoader(dataset = dataset, batch_size = training_parameters["batch_size"], shuffle = False, num_workers=4)

        mean_accuracy = 0.0
        total_batches = len(dataloader)
        
        for input_ids, attention_mask, token_type_ids, labels in dataloader:
            inputs = {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "token_type_ids" : token_type_ids,
                "labels": labels,
            }
            for k, v in inputs.items():
                inputs[k] = v.to(device)


            _, _, sentiment_pred, _ = model(**inputs)
            accuracy, predicted_labels = compute_accuracy(sentiment_pred, inputs["labels"])
            mean_accuracy += accuracy
            predicted_labels_dict[0] += predicted_labels[0]
            predicted_labels_dict[1] += predicted_labels[1]  
#         print(predicted_labels_dict)
    return mean_accuracy/total_batches

# Training 

In [28]:
target_acc = []
model = DomainAdaptationModel().to(device)

In [None]:
lr = training_parameters["learning_rate"]
n_epochs = training_parameters["epochs"]


optimizer = optim.Adam(model.parameters(), lr)

loss_fn_sentiment_classifier = torch.nn.NLLLoss()
loss_fn_domain_classifier = torch.nn.NLLLoss()
'''
In one training step we will update the model using both the source labeled data and target unlabeled data
We will run it till the batches last for any of these datasets

In our case target dataset has more data. Hence, we will leverage the entire source dataset for training

If we use the same approach in a case where the source dataset has more data then the target dataset then we will
under-utilize the labeled source dataset. In such a scenario it is better to reload the target dataset when it finishes
This will ensure that we are utilizing the entire source dataset to train our model.
'''

max_batches = min(len(source_dataloader), len(target_dataloader))

for epoch_idx in range(n_epochs):
    
    source_iterator = iter(source_dataloader)
    target_iterator = iter(target_dataloader)

    for batch_idx in range(max_batches):
        
        p = float(batch_idx + epoch_idx * max_batches) / (training_parameters["epochs"] * max_batches)
        grl_lambda = 2. / (1. + np.exp(-10 * p)) - 1
        grl_lambda = torch.tensor(grl_lambda)
        
        model.train()
        
        if(batch_idx%training_parameters["print_after_steps"] == 0 ):
            print("Training Step:", batch_idx)
        
        optimizer.zero_grad()
        
        # Souce dataset training update
        input_ids, attention_mask, token_type_ids, labels = next(source_iterator)
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids" : token_type_ids,
            "labels" : labels,
            "grl_lambda" : grl_lambda,
        }

        for k, v in inputs.items():
            inputs[k] = v.to(device)
            
        _, _, sentiment_pred, domain_pred = model(**inputs)
        loss_s_sentiment = loss_fn_sentiment_classifier(sentiment_pred, inputs["labels"])
        y_s_domain = torch.zeros(training_parameters["batch_size"], dtype=torch.long).to(device)
        loss_s_domain = loss_fn_domain_classifier(domain_pred, y_s_domain)


        # Target dataset training update 
        input_ids, attention_mask, token_type_ids, labels = next(target_iterator)
        inputs = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids" : token_type_ids,
            "labels" : labels,
            "grl_lambda" : grl_lambda,
        }

        for k, v in inputs.items():
            inputs[k] = v.to(device)
    
        _, _, _, domain_pred = model(**inputs)
        
        # Note that we are not using the sentiment predictions here for updating the weights
        y_t_domain = torch.ones(training_parameters["batch_size"], dtype=torch.long).to(device)
        loss_t_domain = loss_fn_domain_classifier(domain_pred, y_t_domain)

        # Combining the loss 

        loss = loss_s_sentiment + loss_s_domain + loss_t_domain
        loss.backward()
        optimizer.step()

    # Evaluate the model after every epoch
    
    
    accuracy = evaluate(model, dataset = source, percentage = 1).item()
    print(f'Accuracy on {source} domain after epoch {epoch_idx} = {accuracy}')
#     print("Accuracy on amazon after epoch " + str(epoch_idx) + " is " + str(accuracy))

    accuracy = evaluate(model, dataset = target, percentage = 100).item()
    if len(target_acc)>0 and accuracy + 0.001 > target_acc[-1]:
        torch.save(model.state_dict(), os.path.join(training_parameters["output_folder"], "_"+str(accuracy)[:4]+"_"+training_parameters["output_file"]))
        target_acc.append(accuracy)
    elif len(target_acc)<=0:
        torch.save(model.state_dict(), os.path.join(training_parameters["output_folder"], "_"+str(accuracy)[:4]+"_"+training_parameters["output_file"] ))
        target_acc.append(accuracy)
    print(f'Accuracy on {target} domain after epoch {epoch_idx} = {accuracy}')
  

Training Step: 0
Training Step: 100
Training Step: 200
Training Step: 300
Training Step: 400
Training Step: 500
Training Step: 600
Training Step: 700
Accuracy on books domain after epoch 0 = 0.8360655307769775
Accuracy on dvd domain after epoch 0 = 0.7573221325874329
Training Step: 0
Training Step: 100
Training Step: 200
Training Step: 300
Training Step: 400
Training Step: 500
Training Step: 200
Training Step: 300
Training Step: 400
Training Step: 500
Training Step: 600
Training Step: 700
Accuracy on books domain after epoch 2 = 0.7827868461608887
Accuracy on dvd domain after epoch 2 = 0.6652719378471375
Training Step: 0
Training Step: 100
Training Step: 200
Training Step: 300
Training Step: 400
Training Step: 500
Training Step: 600
Training Step: 700
Accuracy on books domain after epoch 3 = 0.8442622423171997
Accuracy on dvd domain after epoch 3 = 0.6297070980072021
Training Step: 0
Training Step: 100
Training Step: 200
Training Step: 300
Training Step: 400
Training Step: 500
Training

# Evaluate the model on the entire dev set at the end

In [None]:
domain = "books"
accuracy = evaluate(model, dataset = domain, percentage = 100).item()  
print(f'Accuracy on {domain} domain = {accuracy}')

domain = "dvd"
accuracy = evaluate(model, dataset = domain, percentage = 100).item()  
print(f'Accuracy on {domain} domain = {accuracy}')

# domain = "electronics"
# accuracy = evaluate(model, dataset = domain, percentage = 50).item()  
# print(f'Accuracy on {domain} domain = {accuracy}')

# domain = "kitchen_housewares"
# accuracy = evaluate(model, dataset = domain, percentage = 50).item()  
# print(f'Accuracy on {domain} domain = {accuracy}')


In [None]:
# target_acc

In [None]:
model = DomainAdaptationModel()
model.to(device)
model.load_state_dict(torch.load("../working/_1.0_model.bin"))

In [None]:
domain = "books"
accuracy = evaluate(model, dataset = domain, percentage = 100).item()  
print(f'Accuracy on {domain} domain = {accuracy}')

domain = "dvd"
accuracy = evaluate(model, dataset = domain, percentage = 100).item()  
print(f'Accuracy on {domain} domain = {accuracy}')

# domain = "electronics"
# accuracy = evaluate(model, dataset = domain, percentage = 50).item()  
# print(f'Accuracy on {domain} domain = {accuracy}')

# domain = "kitchen_housewares"
# accuracy = evaluate(model, dataset = domain, percentage = 50).item()  
# print(f'Accuracy on {domain} domain = {accuracy}')
