In [None]:
# Download Dataset
!wget —no-check-certificate 'https://docs.google.com/uc?export=download&id=1FOyi0kAyW4utTfznN04xMoSergnEqPBV' -O data.zip
!unzip -q data.zip -d . && rm data.zip && rm -rf __MACOSX

# Download Fine-Tuned Model
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1uxq5Gn_sXsN7y1nWnMMFXyLQ2OI6ScQL' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1AvWN2REyz8vdywRmxG2--LTw7aPDIrtF" -O bert_gru_sentiment_classifier.pth && rm -rf /tmp/cookies.txt

## Import Libraries

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score

## Hyperparameters and Variables

In [None]:
# Optimisation settings
LEARNING_RATE = 2e-5   # learning rate handed to the optimizer
EPOCHS = 3             # number of passes of the training loop
WARMUP_STEPS = 100     # warm-up steps given to the learning-rate scheduler

# Data / model settings
N_CLASSES = 5          # number of output classes of the classifier
MAX_LEN = 128          # token length each text is padded or truncated to
BATCH_SIZE = 100       # samples per DataLoader batch

PRETRAINED_MODEL_NAME = 'bert-base-uncased'

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')

In [None]:
# Read the three dataset splits extracted into the data/ folder by the download cell.
train_df = pd.read_csv(filepath_or_buffer='data/train.csv')
val_df = pd.read_csv(filepath_or_buffer='data/valid.csv')
test_df = pd.read_csv(filepath_or_buffer='data/test.csv')

## Dataset Generator

In [None]:
class SentimentDataset(Dataset):
    """
    Map-style dataset that tokenizes one review at a time for a transformer model.

    Attributes:
        data (pandas.DataFrame): Frame providing a 'text' column (input text) and a
            'stars' column (label source).
        tokenizer: Tokenizer object exposing ``encode_plus`` (e.g. a Hugging Face
            ``BertTokenizer``).
        max_len (int): Length every tokenized text is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """
        Returns the total number of samples in the dataset.

        Returns:
            int: Number of rows in ``data``.
        """
        return len(self.data)

    def __getitem__(self, idx):
        """
        Tokenizes and returns the sample stored at positional index ``idx``.

        Args:
            idx (int): Positional (``iloc``) index of the row to retrieve.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` as 1-D tensors of length
            ``max_len``, and ``'label'`` as a scalar long tensor equal to ``stars - 1``.
        """
        # Look the row up once instead of once per column.
        row = self.data.iloc[idx]
        text = row['text']
        label = row['stars'] - 1  # shift star ratings so class ids start at 0
        tokens = self.tokenizer.encode_plus(text, add_special_tokens=True,
                                            max_length=self.max_len,
                                            padding='max_length',
                                            truncation=True,
                                            return_attention_mask=True,
                                            return_tensors='pt',
                                            )
        # return_tensors='pt' yields a leading batch axis of size 1. squeeze(0) removes
        # only that axis; a bare squeeze() would also drop the sequence axis when
        # max_len == 1 and break batching.
        return {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long),
        }

## BERT + Bidirectional GRU Model

In [None]:
class BertGruSentimentClassifier(nn.Module):
    """
    Sentiment classifier that feeds BERT token embeddings through a 2-layer
    bidirectional GRU and a linear output layer.

    Args:
        n_classes (int): Number of output classes for sentiment classification.
        pretrained_model_name (str, optional): Name or path handed to
            ``BertModel.from_pretrained``. Defaults to 'bert-base-uncased'.
        gru_hidden_size (int, optional): Hidden size of each GRU direction.
            Defaults to 256.
    """

    def __init__(self, n_classes, pretrained_model_name='bert-base-uncased', gru_hidden_size=256):
        """
        Builds the BERT encoder, the GRU stack and the output layer.

        The defaults reproduce the originally hard-coded architecture, so state dicts
        saved from that architecture still load.

        Args:
            n_classes (int): Number of output classes for sentiment classification.
            pretrained_model_name (str, optional): Checkpoint name or path for BERT.
            gru_hidden_size (int, optional): Hidden size of each GRU direction.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.gru = nn.GRU(input_size=self.bert.config.hidden_size,
                          hidden_size=gru_hidden_size,
                          num_layers=2,
                          batch_first=True,
                          bidirectional=True,
                          dropout=0.2,
                          )
        # A bidirectional GRU concatenates both directions, so its output feature
        # size is twice the per-direction hidden size.
        self.out = nn.Linear(2 * gru_hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """
        Defines the forward pass of the BertGruSentimentClassifier model.

        Args:
            input_ids (torch.Tensor): Tokenized input text data, as a tensor.
            attention_mask (torch.Tensor): Attention mask for the input text data, as a tensor.

        Returns:
            out (torch.Tensor): Output logits for the sentiment classification.
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = bert_output.last_hidden_state
        gru_output, _ = self.gru(last_hidden_state)
        # The GRU output at the final sequence position is used as the sentence summary.
        # NOTE(review): with inputs padded to a fixed length that position is usually a
        # padding token; the fine-tuned checkpoint was presumably trained with this
        # readout, so it is intentionally left unchanged.
        out = self.out(gru_output[:, -1])
        return out

In [None]:
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Training split: only the text and label columns are handed to the dataset,
# and batches are reshuffled on every pass.
train_dataset = SentimentDataset(
    data=train_df.loc[:, ['text', 'stars']],
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Validation split: same preparation, kept in file order.
val_dataset = SentimentDataset(
    data=val_df.loc[:, ['text', 'stars']],
    tokenizer=tokenizer,
    max_len=MAX_LEN,
)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Build the classifier and move it to the selected device (Module.to returns the module).
model = BertGruSentimentClassifier(N_CLASSES).to(DEVICE)

# NOTE(review): AdamW is imported from transformers here; that class has been
# deprecated in favour of torch.optim.AdamW — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# train_epoch steps the scheduler once per batch, so the schedule spans
# (batches per epoch) x (epochs) steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=len(train_dataloader) * EPOCHS,
)

## Train

In [None]:
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """
    Runs one optimisation pass over ``dataloader`` and reports the mean batch loss.

    Each batch triggers one optimizer step followed by one scheduler step.

    Args:
        model (nn.Module): Network to optimise; called as ``model(input_ids, attention_mask)``.
        dataloader (DataLoader): Yields dict batches with the keys 'input_ids',
            'attention_mask' and 'label'.
        optimizer (Optimizer): Optimizer that updates the model parameters.
        scheduler (Scheduler): Learning-rate scheduler, stepped after every batch.
        device (torch.device): Device the batch tensors are moved to.

    Returns:
        float: Sum of the per-batch cross-entropy losses divided by the number of batches.
    """
    criterion = nn.CrossEntropyLoss()
    batch_losses = []

    model.train()
    for batch in tqdm(dataloader, desc="Training", unit="batch"):
        optimizer.zero_grad()

        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )
        loss = criterion(model(input_ids, attention_mask), labels)
        batch_losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

    return sum(batch_losses) / len(dataloader)


def evaluate(model, dataloader, device):
    """
    Scores the model on ``dataloader`` without tracking gradients.

    Args:
        model (nn.Module): Network to score; switched to eval mode by this call.
        dataloader (DataLoader): Yields dict batches with the keys 'input_ids',
            'attention_mask' and 'label'; must expose ``.dataset`` for the sample count.
        device (torch.device): Device the batch tensors are moved to.

    Returns:
        float: Mean of the per-batch cross-entropy losses (each batch weighs the same).
        float: Correct predictions divided by ``len(dataloader.dataset)``.
    """
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0
    n_correct = 0

    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluation", unit="batch"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids, attention_mask)
            loss_sum += criterion(logits, labels).item()

            # The predicted class is the index of the largest logit in each row.
            hits = torch.argmax(logits, dim=1) == labels
            n_correct += hits.sum().item()

    mean_loss = loss_sum / len(dataloader)
    accuracy = n_correct / len(dataloader.dataset)
    return mean_loss, accuracy

In [10]:
# One training pass followed by one validation pass per epoch.
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_loss = train_epoch(
        model=model,
        dataloader=train_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=DEVICE,
    )
    print(f'Train loss: {train_loss:.4f}')

    val_loss, val_acc = evaluate(model=model, dataloader=val_dataloader, device=DEVICE)
    print(f'Validation loss: {val_loss:.4f}, accuracy: {val_acc:.4f}')

In [None]:
# Save trained Model
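# Left commented out because a fine-tuned checkpoint with this file name is already
# downloaded in the first cell and training takes a long time; uncomment to save freshly trained weights.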
# torch.save(model.state_dict(), 'bert_gru_sentiment_classifier.pth')

In [None]:
# Load saved model
# Rebuild the architecture, then fill it with the weights stored in the checkpoint file.
# NOTE(review): torch.load unpickles the file and this checkpoint is downloaded from a
# URL — consider weights_only=True if the installed torch version supports it.
model = BertGruSentimentClassifier(N_CLASSES)
model.load_state_dict(
    torch.load(
        "bert_gru_sentiment_classifier.pth",
        map_location=DEVICE,  # DEVICE is already a torch.device
    )
)
model = model.to(DEVICE)
# eval() returns the module, so the cell still ends by displaying the model summary.
model.eval()

## Predict

In [None]:
def prepare_input(text, tokenizer, max_len):
    """
    Tokenizes a single text with padding and truncation and moves the resulting
    tensors to the notebook-level ``DEVICE``.

    Args:
        text (str): Text to tokenize.
        tokenizer: Tokenizer object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len (int): Length the token sequence is padded or truncated to.

    Returns:
        input_ids (torch.Tensor): 1-D tensor of token ids of length ``max_len``.
        attention_mask (torch.Tensor): 1-D attention mask of length ``max_len``.
    """
    tokens = tokenizer.encode_plus(text, add_special_tokens=True,
                                   max_length=max_len,
                                   padding="max_length",
                                   truncation=True,
                                   return_attention_mask=True,
                                   return_tensors="pt",
                                   )
    # squeeze(0) removes only the leading batch axis added by return_tensors="pt";
    # a bare squeeze() would also collapse the sequence axis when max_len == 1.
    input_ids = tokens["input_ids"].squeeze(0).to(DEVICE)
    attention_mask = tokens["attention_mask"].squeeze(0).to(DEVICE)
    return input_ids, attention_mask


def predict(text_list, model, tokenizer, max_len=128):
    """
    Predicts a 1-based class (star rating) for every text, one text at a time.

    The model is evaluated in eval mode so dropout cannot perturb the predictions;
    the mode the model had on entry is restored before returning.

    Args:
        text_list (iterable of str): Texts to classify, e.g. a list or a pandas Series.
        model (nn.Module): Model to use for prediction; called as
            ``model(input_ids, attention_mask)`` with a batch of one.
        tokenizer: Tokenizer object forwarded to ``prepare_input``.
        max_len (int, optional): Maximum length of the text. Defaults to 128.

    Returns:
        predictions (list): One int per input text, equal to the argmax class index + 1.
    """
    predictions = []
    was_training = model.training
    model.eval()
    try:
        for text in tqdm(text_list):
            input_ids, attention_mask = prepare_input(text, tokenizer, max_len)
            with torch.no_grad():
                # prepare_input returns un-batched tensors; add the batch axis here.
                logits = model(input_ids.unsqueeze(0), attention_mask.unsqueeze(0))
            probs = torch.softmax(logits, dim=1)
            # Class indices are 0-based; +1 maps them back to star ratings.
            prediction = torch.argmax(probs, dim=1).item() + 1
            predictions.append(prediction)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    return predictions


def score_report(y_true, y_pred):
    """
    Prints the scikit-learn classification report followed by the overall accuracy.

    Args:
        y_true (list): True labels.
        y_pred (list): Predicted labels.
    """
    report = classification_report(y_true, y_pred)
    print(report)

    accuracy = accuracy_score(y_true, y_pred)
    print('accuracy', accuracy)


def create_predict(df, save_path='data/pred.csv'):
    """
    Predicts star ratings for every row of ``df`` with the notebook-level ``model`` and
    ``tokenizer``, writes them to a CSV file, and prints a score report when true
    labels are available.

    Args:
        df (pandas.DataFrame): Frame with 'review_id' and 'text' columns, and optionally
            a 'stars' column holding the true labels.
        save_path (str, optional): Path of the CSV file to write; it gets the columns
            'review_id' and 'stars' (predicted). Default: 'data/pred.csv'.
    """
    prediction = predict(df['text'], model, tokenizer)

    # Pair ids and predictions by position. Concatenating along axis=1 would align on
    # the index and emit NaN-padded rows whenever df does not carry a default 0..n-1 index.
    submission = pd.DataFrame({
        'review_id': df['review_id'].to_numpy(),
        'stars': prediction,
    })
    submission.to_csv(save_path, index=False)

    if 'stars' in df.columns:
        score_report(df['stars'], prediction)

In [None]:
# Validation split: written to its own file (a report is printed when labels are present).
create_predict(df=val_df, save_path='data/val_pred.csv')
# Test split: written to the default path, data/pred.csv.
create_predict(df=test_df)