This notebook walks through the steps for building a model training and testing pipeline.

<!-- %pip install -q transformers datasets evaluate -->

In [1]:
# Fetch the project repository; the data/ CSVs read later in this notebook live inside it.
!git clone https://github.com/OopsWrongCode/nlp-project.git

Cloning into 'nlp-project'...
remote: Enumerating objects: 85, done.[K
remote: Counting objects: 100% (1/1), done.[K
remote: Total 85 (delta 0), reused 0 (delta 0), pack-reused 84 (from 1)[K
Receiving objects: 100% (85/85), 44.49 MiB | 20.41 MiB/s, done.
Resolving deltas: 100% (16/16), done.


In [2]:
# Make the cloned repository the working directory for the cells that follow.
%cd nlp-project/

/kaggle/working/nlp-project


In [18]:
# Install Hugging Face transformers, which provides the AutoTokenizer used later in this notebook.
# NOTE(review): the version is not pinned — consider pinning it for reproducible runs.
%pip install -q transformers

Note: you may need to restart the kernel to use updated packages.


In [8]:
import random
import torch.nn as nn
import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np

# One seed shared by every random number generator the notebook touches.
seed = 42


def seed_everything(value):
    """Seed the Python, NumPy and PyTorch random number generators with ``value``."""
    random.seed(value)
    np.random.seed(value)
    torch.manual_seed(value)
    torch.cuda.manual_seed(value)


seed_everything(seed)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Run on the GPU whenever PyTorch can see one; otherwise stay on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [26]:
# Paths are relative to the repository root, which the earlier `%cd nlp-project/`
# cell made the working directory, so the notebook is not tied to Kaggle's
# /kaggle/working layout.
DATA_DIR = 'data'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
validation = pd.read_csv(f'{DATA_DIR}/valid.csv')

In [10]:
# Peek at the first five training rows to check the column layout.
train.head(n=5)

Unnamed: 0,text,label,token_count,text_length
0,"['checked', 'margies', 'candies', 'yelp', 'che...",intj,2186,17479
1,"['sosuga', 'next', 'are', 'still', 'freaking',...",enfj,690,6170
2,"['you', 'people', 'dont', 'understand', 'henry...",infp,1748,12667
3,"['include', 'pls', 'and', 'thank', 'you', 'yes...",enfp,959,9354
4,"['ive', 'ever', 'seen', 'more', 'dad', 'lookin...",infj,1775,12807


In [14]:
# Peek at the first five test rows to check the column layout.
test.head(n=5)

Unnamed: 0,text
0,"['they', 'obsessed', 'with', 'pinks', 'more', ..."
1,"['cant', 'wait', 'getting', 'puppy', 'intervie..."
2,"['what', 'wonderful', 'day', 'alive', 'lmaoooo..."
3,"['exactly', 'most', 'midzys', 'were', 'actuall..."
4,"['didnt', 'even', 'know', 'there', 'were', 'th..."


In [None]:
from torch.utils.data import Dataset
import torch

class MBTIDataset(Dataset):
    """Dataset that tokenizes one text per item, optionally paired with a label.

    Args:
        texts: Indexable collection of texts. Each entry may be a plain string
            or the string form of a Python token list such as
            ``"['you', 'people', 'dont']"``; the latter is decoded and its
            tokens are joined with single spaces before tokenization.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            max_length=max_len, padding='max_length', return_tensors='pt')``.
            Its result must expose ``'input_ids'`` and ``'attention_mask'``
            tensors with a leading dimension that can be squeezed away.
        max_len: Value forwarded to the tokenizer as ``max_length``.
        labels: Optional indexable collection of integer class ids aligned
            with ``texts``. When omitted, items carry no ``'labels'`` entry.
    """

    def __init__(self, texts, tokenizer, max_len, labels=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _normalize_text(raw):
        """Return ``raw`` as plain text, flattening a stringified token list if present."""
        import ast  # local import: only this helper needs it

        text = str(raw)
        candidate = text.strip()
        if candidate.startswith('[') and candidate.endswith(']'):
            try:
                # literal_eval only accepts Python literals, so it is safe on data.
                tokens = ast.literal_eval(candidate)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a valid list literal (e.g. free text in brackets): keep as-is.
                return text
            if isinstance(tokens, (list, tuple)):
                # Without this, the brackets, quotes and commas of the list
                # repr are tokenized too and eat into the max_len budget.
                return ' '.join(str(token) for token in tokens)
        return text

    def __getitem__(self, index):
        """Return a dict with ``input_ids``, ``attention_mask`` and, if available, ``labels``."""
        text = self._normalize_text(self.texts[index])

        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading dimension for the single text;
        # drop it so the DataLoader can stack items into a batch.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }

        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[index], dtype=torch.long)

        return item

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

In [None]:
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

BATCH_SIZE = 32
MAX_LEN = 512  # per-example token budget handed to the tokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# The `label` column holds MBTI type strings (e.g. 'intj'), but MBTIDataset
# converts labels with torch.tensor(..., dtype=torch.long), which requires
# integers. Build one mapping from the training labels and reuse it for the
# validation split so both splits share the same class ids.
LABEL2ID = {label: idx for idx, label in enumerate(sorted(train['label'].unique()))}
ID2LABEL = {idx: label for label, idx in LABEL2ID.items()}

# Indexing LABEL2ID directly (rather than Series.map) fails loudly with a
# KeyError if validation contains a label that never appears in training.
train_label_ids = [LABEL2ID[label] for label in train['label']]
validation_label_ids = [LABEL2ID[label] for label in validation['label']]

train_dataset = MBTIDataset(texts=train['text'].tolist(), labels=train_label_ids, tokenizer=tokenizer, max_len=MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# The test split has no labels, so its items carry only the tokenizer outputs.
test_dataset = MBTIDataset(texts=test['text'].tolist(), tokenizer=tokenizer, max_len=MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

validation_dataset = MBTIDataset(texts=validation['text'].tolist(), labels=validation_label_ids, tokenizer=tokenizer, max_len=MAX_LEN)
validation_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class MyGRU(nn.Module):
    """Placeholder for a GRU-based model; the architecture is not implemented yet."""

    def __init__(self):
        # nn.Module keeps its parameter/submodule registries on the instance;
        # they only exist once the base initializer has run, so layers cannot
        # be attached (and the module cannot be called) without this.
        super().__init__()
        # TODO: define the layers here.

    def forward(self, x):
        """Compute the model output for ``x`` (not implemented yet)."""
        # Fail loudly instead of silently handing None to the caller.
        raise NotImplementedError("MyGRU.forward is not implemented yet")

<torch.utils.data.dataloader.DataLoader at 0x79f6e91004f0>

In [None]:
class MyLSTM(nn.Module):
    """Placeholder for an LSTM-based model; the architecture is not implemented yet."""

    def __init__(self):
        # nn.Module keeps its parameter/submodule registries on the instance;
        # they only exist once the base initializer has run, so layers cannot
        # be attached (and the module cannot be called) without this.
        super().__init__()
        # TODO: define the layers here.

    def forward(self, x):
        """Compute the model output for ``x`` (not implemented yet)."""
        # Fail loudly instead of silently handing None to the caller.
        raise NotImplementedError("MyLSTM.forward is not implemented yet")