# Import Data from Kaggle

In [None]:
!pip install -q kaggle

In [None]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"nihalmenon","key":"ae474103901b1b6687fbaf66094ad5be"}'}

In [None]:
!mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle competitions download -c quora-question-pairs

Downloading quora-question-pairs.zip to /content
 99% 306M/309M [00:16<00:00, 22.7MB/s]
100% 309M/309M [00:16<00:00, 19.3MB/s]


In [None]:
!unzip -d data quora-question-pairs.zip

Archive:  quora-question-pairs.zip
  inflating: data/sample_submission.csv.zip  
  inflating: data/test.csv           
  inflating: data/test.csv.zip       
  inflating: data/train.csv.zip      


# Look through Data

In [None]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [None]:
from pathlib import Path
import zipfile

data_path = Path("data/")
train_path = data_path / "train"
test_path = data_path / "test"

if train_path.is_dir():
  print(f"{train_path} directory already exists...")
else:
  print(f"{train_path} does not exist. creating directory.")
  train_path.mkdir(parents=True, exist_ok=True)

if test_path.is_dir():
  print(f"{test_path} directory already exists...")
else:
  print(f"{test_path} does not exist. creating directory.")
  test_path.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(data_path / "train.csv.zip", "r") as zip_ref:
  print("Unzipping train data...")
  zip_ref.extractall(train_path)

with zipfile.ZipFile(data_path / "test.csv.zip", "r") as zip_ref:
  print("Unzipping test data...")
  zip_ref.extractall(test_path)

data/train does not exist. creating directory.
data/test does not exist. creating directory.
Unzipping train data...
Unzipping test data...


In [None]:
import os

def walk_through_dir(dir_path):
  for dirpath, dirnames, filenames in os.walk(dir_path):
    print(f"There are {len(dirnames)} directories and {len(filenames)} files in {dirpath}.")

walk_through_dir(data_path)

There are 2 directories and 4 files in data.
There are 0 directories and 1 files in data/train.
There are 0 directories and 1 files in data/test.


In [None]:
import pandas as pd

train_data = pd.read_csv(train_path / "train.csv", nrows=1000)

In [None]:
train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


# Transform Data *(Attempt 2)*

Using https://mccormickml.com/2019/07/22/BERT-fine-tuning/ for tokenizing.



In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torchtext import datasets, transforms

## Create BERT Tokenizer

In [None]:
from transformers import BertTokenizerFast

print('Loading BERT Tokenizer Fast')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True, device=device)

Loading BERT Tokenizer Fast


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
sample_text = "This is a sentence."

tokenized = tokenizer.tokenize(sample_text)
encoded = tokenizer.convert_tokens_to_ids(tokenized)

print(f"Sample text: {sample_text}")
print(f"\nTokenized: {tokenized}")
print(f"\nEncoded: {encoded}") # same as tokenizer.encode(sample_text)

Sample text: This is a sentence.

Tokenized: ['this', 'is', 'a', 'sentence', '.']

Encoded: [2023, 2003, 1037, 6251, 1012]


In [None]:
encoded_pair = tokenizer.encode(train_data['question1'][0], train_data['question2'][0])
tokenizer.decode(encoded_pair)

'[CLS] what is the step by step guide to invest in share market in india? [SEP] what is the step by step guide to invest in share market? [SEP]'

## Normalize Data (Sentence Length)

In [None]:
train_data["question1_length"] = train_data["question1"].apply(lambda x : len(tokenizer.encode(x)))
train_data["question2_length"] = train_data["question2"].apply(lambda x : len(tokenizer.encode(x)))

train_data["total_length"] = train_data["question1_length"] + train_data["question2_length"]

train_data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate,question1_length,question2_length,total_length
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,17,15,32
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,21,26,47
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,18,14,32
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,15,25,40
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,20,10,30


In [None]:
max_length = max(train_data["total_length"]) + 20 # in case of longer test sentences
max_length

113

## Split Data

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(train_data[["question1", "question2"]], train_data["is_duplicate"], test_size=0.2)

X_train.head()

Unnamed: 0,question1,question2
889,Have you ever been fired?,Have you ever been fired? How did you deal wit...
902,What are the best riddles by the Riddler (Batm...,Who was the best villain in the Nolan Batman f...
767,How has the vertebral column anatomy changed t...,When calculating bullet spin; MV X 12 (twist r...
479,Did Mahabharata really happen?,Did Mahabharata really happen or only an alleg...
977,How did Lucifer and Gabrielle relate to each o...,"How do the roles of the CIA, the NSC, and othe..."


## Tokenize Dataset

In [None]:
import torch
from torch.utils.data import TensorDataset
from tqdm import tqdm

In [None]:
def convert_to_tensor_dataset(dataset, labels):
  input_ids = []
  attention_masks = []
  token_type_ids = []

  for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
    encoded_dict = tokenizer.encode_plus(
        row['question1'],
        row['question2'],
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    token_type_ids.append(encoded_dict['token_type_ids'])

  print('loop finished')
  # Convert the lists into tensors
  input_ids = torch.cat(input_ids, dim=0)
  print('input_ids finished')
  attention_masks = torch.cat(attention_masks, dim=0)
  print('attention_masks finished')
  token_type_ids = torch.cat(token_type_ids, dim=0)
  print('token_type_ids finished')
  labels = torch.tensor(labels.values)
  print('labels finished')


  return TensorDataset(input_ids, attention_masks, token_type_ids, labels)

In [None]:
X_train.shape, y_train.shape

((800, 2), (800,))

In [None]:
train_data_transformed = convert_to_tensor_dataset(X_train, y_train)
val_data_transformed = convert_to_tensor_dataset(X_validation, y_validation)

100%|██████████| 800/800 [00:00<00:00, 2019.07it/s]


loop finished
input_ids finished
attention_masks finished
token_type_ids finished
labels finished


100%|██████████| 200/200 [00:00<00:00, 2395.38it/s]

loop finished
input_ids finished
attention_masks finished
token_type_ids finished
labels finished





## Prep DataLoaders

In [None]:
from torch.utils.data import dataloader
import os

In [None]:
BATCH_SIZE=16

NUM_WORKERS = os.cpu_count()

train_dataloader = DataLoader(
    dataset=train_data_transformed,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    shuffle=True
)

val_dataloader = DataLoader(
    dataset=val_data_transformed,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    shuffle=False
)

train_dataloader, val_dataloader

(<torch.utils.data.dataloader.DataLoader at 0x7ae7f89f88b0>,
 <torch.utils.data.dataloader.DataLoader at 0x7ae7f89f84f0>)

In [None]:
len(train_dataloader), len(val_dataloader)

(50, 13)

In [None]:
input, mask, token_type, label = next(iter(train_dataloader))
input, mask, token_type, label

(tensor([[  101,  2065, 18520,  ...,     0,     0,     0],
         [  101,  2129,  2079,  ...,     0,     0,     0],
         [  101,  2054,  2003,  ...,     0,     0,     0],
         ...,
         [  101,  2054,  2024,  ...,     0,     0,     0],
         [  101,  2003,  2009,  ...,     0,     0,     0],
         [  101,  2339,  2106,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 tensor([1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0]))

# Build and Train Model

## Model 1: BertForSequenceClassification

* From the huggingface library https://huggingface.co/docs/transformers/model_doc/bert



In [None]:
from transformers import BertForSequenceClassification

In [None]:
# import our BERT pretrained model

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# print out parameters
params = list(model.named_parameters())

print(f"Bert model has {len(params)} parameters.\n")
print(f"-------Embedding Layer----------\n")
for p in params[:5]:
  print(f"{p[0]} {str(tuple(p[1].size()))}")

print("\n-------First Transformer--------\n")

for p in params[5:21]:
  print(f"{p[0]} {str(tuple(p[1].size()))}")

print("\n-------Output Layer----------\n")
for p in params[-4:]:
  print(f"{p[0]} {str(tuple(p[1].size()))}")

Bert model has 201 parameters.

-------Embedding Layer----------

bert.embeddings.word_embeddings.weight (30522, 768)
bert.embeddings.position_embeddings.weight (512, 768)
bert.embeddings.token_type_embeddings.weight (2, 768)
bert.embeddings.LayerNorm.weight (768,)
bert.embeddings.LayerNorm.bias (768,)

-------First Transformer--------

bert.encoder.layer.0.attention.self.query.weight (768, 768)
bert.encoder.layer.0.attention.self.query.bias (768,)
bert.encoder.layer.0.attention.self.key.weight (768, 768)
bert.encoder.layer.0.attention.self.key.bias (768,)
bert.encoder.layer.0.attention.self.value.weight (768, 768)
bert.encoder.layer.0.attention.self.value.bias (768,)
bert.encoder.layer.0.attention.output.dense.weight (768, 768)
bert.encoder.layer.0.attention.output.dense.bias (768,)
bert.encoder.layer.0.attention.output.LayerNorm.weight (768,)
bert.encoder.layer.0.attention.output.LayerNorm.bias (768,)
bert.encoder.layer.0.intermediate.dense.weight (3072, 768)
bert.encoder.layer.0.int

### Prepare Optimizer and Loss function

In [None]:
from torch import nn

In [None]:
# optimizer
loss_fn = None
optimizer = torch.optim.SGD(params=model.parameters(), lr=.001)

#### Build train step function

In [None]:
from tqdm import tqdm
from typing import Tuple, Dict, List
import torch

In [None]:
import numpy
from sklearn.metrics import accuracy_score

def train_step(model: torch.nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module,
               optimizer: torch.optim.Optimizer,
               accuracy_fn=accuracy_score,
               device: torch.device=device) -> Tuple[float, float]:
  """Trains a PyTorch model for a single epoch.

  Returns:
    A tuple of training loss and training accuracy metrics.
    In the form (train_loss, train_accuracy).
  """
  # Put model in train mode
  model.train()

  # Setup train loss and train accuracy values
  train_loss, train_acc = 0, 0
  # Loop through data loader data batches
  for i, batch in enumerate(dataloader):

      input_ids, attention_masks, token_type_ids, labels = batch
      # Send data to target device
      input_ids, attention_masks, token_type_ids, labels = input_ids.to(device), attention_masks.to(device), token_type_ids.to(device), labels.to(device)
      y=labels

      # 1. Forward pass
      output = model(input_ids,
                     attention_mask=attention_masks,
                     token_type_ids=token_type_ids,
                     labels=labels)

      loss = output[0]
      logits = output[1]

      # 2. Calculate  and accumulate loss
      train_loss += loss

      # 3. Optimizer zero grad
      optimizer.zero_grad()

      # 4. Loss backward
      loss.backward()

      # 5. Optimizer step
      optimizer.step()

      # Calculate and accumulate accuracy metric across all batches
      y_pred = numpy.argmax(logits.detach().cpu().numpy(), axis=1).flatten()
      train_acc += accuracy_fn(labels.cpu(), y_pred)

  # Adjust metrics to get average loss and accuracy per batch
  train_loss = train_loss / len(dataloader)
  # train_acc = train_acc / len(dataloader)
  return train_loss, train_acc

### Build Test Step funtion

In [None]:
import numpy
from sklearn.metrics import accuracy_score

def test_step(model: torch.nn.Module,
              dataloader: torch.utils.data.DataLoader,
              loss_fn: torch.nn.Module,
              accuracy_fn=accuracy_score,
              device: torch.device=device) -> Tuple[float, float]:
  """Tests a PyTorch model for a single epoch.

  Returns:
    A tuple of testing loss and testing accuracy metrics.
    In the form (test_loss, test_accuracy).
  """
  # Put model in eval mode
  model.eval()

  # Setup test loss and test accuracy values
  test_loss, test_acc = 0, 0

  # Turn on inference context manager
  with torch.inference_mode():
      # Loop through DataLoader batches
      for i, batch in enumerate(dataloader):

          input_ids, attention_masks, token_type_ids, labels = batch

          # Send data to target device
          input_ids, attention_masks, token_type_ids, labels = input_ids.to(device), attention_masks.to(device), token_type_ids.to(device), labels.to(device)
          y=labels

          # 1. Forward pass
          output = model(input_ids,
                     attention_mask=attention_masks,
                     token_type_ids=token_type_ids,
                     labels=labels)

          loss = output[0]
          logits = output[1]

          # 2. Calculate and accumulate loss
          test_loss += loss

          # Calculate and accumulate accuracy
          y_pred = numpy.argmax(logits.detach().cpu().numpy(), axis=1).flatten()
          test_acc += accuracy_fn(labels.cpu(), y_pred)

  # Adjust metrics to get average loss and accuracy per batch
  test_loss = test_loss / len(dataloader)
  test_acc = test_acc / len(dataloader)
  return test_loss, test_acc

### Build our training loop

In [None]:
def train(model: torch.nn.Module,
          train_dataloader: torch.utils.data.DataLoader,
          test_dataloader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device=device) -> Dict[str, List]:
  """Trains and tests a PyTorch model.

  Returns:
    A dictionary of training and testing loss as well as training and
    testing accuracy metrics. Each metric has a value in a list for
    each epoch.
  """
  print(f"Starting training process...")
  # Create empty results dictionary
  results = {"train_loss": [],
      "train_acc": [],
      "test_loss": [],
      "test_acc": []
  }

  # Loop through training and testing steps for a number of epochs
  for epoch in tqdm(range(epochs)):
      train_loss, train_acc = train_step(model=model,
                                          dataloader=train_dataloader,
                                          loss_fn=loss_fn,
                                          optimizer=optimizer,
                                          device=device)
      test_loss, test_acc = test_step(model=model,
          dataloader=test_dataloader,
          loss_fn=loss_fn,
          device=device)

      # Print out what's happening
      print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"train_acc: {train_acc:.4f} | "
          f"test_loss: {test_loss:.4f} | "
          f"test_acc: {test_acc:.4f}"
      )

      # Update results dictionary
      results["train_loss"].append(train_loss)
      results["train_acc"].append(train_acc)
      results["test_loss"].append(test_loss)
      results["test_acc"].append(test_acc)

  # Return the filled results at the end of the epochs
  return results

### Train

In [None]:
# Function for formatting time
from timeit import default_timer as timer
def print_train_time(start: float, end: float, device: torch.device = None):
    """Prints difference between start and end time.

    Args:
        start (float): Start time of computation (preferred in timeit format).
        end (float): End time of computation.
        device ([type], optional): Device that compute is running on. Defaults to None.

    Returns:
        float: time between start and end in seconds (higher is longer).
    """
    total_time = end - start
    print(f"Train time on {device}: {total_time:.3f} seconds")
    return total_time

In [None]:
from timeit import default_timer as timer

EPOCHS = 6
train_time_start = timer()
train(model=model, train_dataloader=train_dataloader, test_dataloader=val_dataloader, optimizer=optimizer, loss_fn=loss_fn, epochs=EPOCHS)
train_time_end = timer()
total_train_time_model = print_train_time(start=train_time_start,
                                          end=train_time_end,
                                          device=str(next(model.parameters()).device))

Starting training process...


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
 17%|█▋        | 1/6 [00:15<01:18, 15.63s/it]

Epoch: 1 | train_loss: 0.6724 | train_acc: 30.0625 | test_loss: 0.6601 | test_acc: 0.6106


  self.pid = os.fork()
  self.pid = os.fork()
 33%|███▎      | 2/6 [00:31<01:02, 15.71s/it]

Epoch: 2 | train_loss: 0.6660 | train_acc: 30.0000 | test_loss: 0.6506 | test_acc: 0.6106


  self.pid = os.fork()
  self.pid = os.fork()
 50%|█████     | 3/6 [00:47<00:47, 15.68s/it]

Epoch: 3 | train_loss: 0.6553 | train_acc: 30.4375 | test_loss: 0.6555 | test_acc: 0.6106


  self.pid = os.fork()
  self.pid = os.fork()
 67%|██████▋   | 4/6 [01:02<00:31, 15.58s/it]

Epoch: 4 | train_loss: 0.6436 | train_acc: 30.8750 | test_loss: 0.6285 | test_acc: 0.6106


  self.pid = os.fork()
  self.pid = os.fork()
 83%|████████▎ | 5/6 [01:17<00:15, 15.49s/it]

Epoch: 5 | train_loss: 0.6338 | train_acc: 29.7500 | test_loss: 0.6156 | test_acc: 0.6106


  self.pid = os.fork()
  self.pid = os.fork()
100%|██████████| 6/6 [01:33<00:00, 15.53s/it]

Epoch: 6 | train_loss: 0.6148 | train_acc: 31.5000 | test_loss: 0.6022 | test_acc: 0.6106
Train time on cuda:0: 93.359 seconds





## Shape-Based Debugging

In [None]:
batch = next(iter(train_dataloader))
print(batch[0].shape, batch[1].shape, batch[2].shape)

torch.Size([16, 113]) torch.Size([16, 113]) torch.Size([16])


In [None]:
batch

[tensor([[ 101, 2054, 2024,  ...,    0,    0,    0],
         [ 101, 2054, 1005,  ...,    0,    0,    0],
         [ 101, 2129, 2079,  ...,    0,    0,    0],
         ...,
         [ 101, 2129, 2079,  ...,    0,    0,    0],
         [ 101, 2054, 2024,  ...,    0,    0,    0],
         [ 101, 2339, 6343,  ...,    0,    0,    0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1])]

In [None]:
input_ids, attention_masks, labels = batch
input_ids = input_ids.to(device)
attention_masks = attention_masks.to(device)
labels=labels.to(device)

output = model(input_ids,
                     attention_mask=attention_masks,
                     labels=labels)

In [None]:
output[0], output[1]

(tensor(0.6919, device='cuda:0', grad_fn=<NllLossBackward0>),
 tensor([[ 0.1649,  0.2726],
         [ 0.3273,  0.6789],
         [ 0.1883,  0.5527],
         [-0.0738,  0.4794],
         [ 0.3704,  0.6081],
         [ 0.2819,  0.7817],
         [ 0.6704,  0.7681],
         [ 0.3168,  0.9711],
         [ 0.3202,  0.5780],
         [ 0.1797,  0.9498],
         [ 0.1509,  0.2349],
         [ 0.1146,  0.3128],
         [ 0.6453,  0.9218],
         [ 0.0973,  0.4067],
         [ 0.2586,  0.2677],
         [ 0.2756,  0.4788]], device='cuda:0', grad_fn=<AddmmBackward0>))

In [None]:
loss, logits = output[0], output[1]

In [None]:
for batch in enumerate(train_dataloader):
  print(batch)
  break

(0, [tensor([[ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2003,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        ...,
        [ 101, 2054, 2064,  ...,    0,    0,    0],
        [ 101, 2054, 1005,  ...,    0,    0,    0],
        [ 101, 2054, 2024,  ...,    0,    0,    0]]), tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0])])


## Memory-based debugging

In [None]:
!nvidia-smi

Sat Mar 30 20:00:40 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P0              26W /  70W |   1991MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Evaluate

In [None]:
test_data = pd.read_csv(test_path / "test.csv", nrows=5000)
test_data.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [None]:
duplicates = 0
for index, row in train_data.iterrows():
  if row["is_duplicate"] == 1: duplicates+=1

print(duplicates)
print(duplicates/len(train_data))

380
0.38


In [None]:
test_data.head()

Unnamed: 0,test_id,question1,question2
0,0,How does the Surface Pro himself 4 compare wit...,Why did Microsoft choose core m3 and not core ...
1,1,Should I have a hair transplant at age 24? How...,How much cost does hair transplant require?
2,2,What but is the best way to send money from Ch...,What you send money to China?
3,3,Which food not emulsifiers?,What foods fibre?
4,4,"How ""aberystwyth"" start reading?",How their can I start reading?


In [None]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   test_id    5000 non-null   int64 
 1   question1  5000 non-null   object
 2   question2  5000 non-null   object
dtypes: int64(1), object(2)
memory usage: 117.3+ KB


In [None]:
test_data_transformed = convert_to_tensor_dataset()

TypeError: convert_to_tensor_dataset() missing 2 required positional arguments: 'dataset' and 'labels'