In [1]:
import pandas as pd
import numpy as np

### Read in Data

In [2]:
df = pd.read_csv('../data/text.csv')

In [3]:
df.drop('Unnamed: 0', inplace=True, axis=1)

In [4]:
df.head()

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [57]:
max_length = 0
def find_max_length(text):
    global max_length
    if len(text) > max_length:
        max_length = len(text)
df['text'].apply(find_max_length)
print(max_length)

830


### Prep and Split data into test and train 

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [36]:
encoder = LabelEncoder()

In [41]:
df['label'] = encoder.fit_transform(df['label'])

In [42]:
df.head()

Unnamed: 0,text,label
0,i just feel really helpless and heavy hearted,4
1,ive enjoyed being able to slouch about relax a...,0
2,i gave up my internship with the dmrg and am f...,4
3,i dont know i feel so lost,0
4,i am a kindergarten teacher and i am thoroughl...,4


In [43]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

### Import BertTokenizer from transformers

In [46]:
from transformers import BertTokenizer

In [47]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [58]:
##### Function to tokenize texts
def tokenize_function(texts):
    return tokenizer(texts.tolist(), padding='max_length', truncation=True, max_length=128)


In [59]:
train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

In [64]:
## Imports for custom Dataset
import torch
from torch.utils.data import Dataset, DataLoader

In [85]:
# Custom Dataset object for orginization

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        if idx >= len(self.labels):
            raise IndexError("Index out of range in Dataset __getitem__ method.")
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(labels)

In [86]:
train_dataset = TextDataset(train_encodings, train_labels)
test_dataset = TextDataset(test_encodings, test_labels)

In [87]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [70]:
from transformers import BertForSequenceClassification

In [72]:
# Load the BERT Model for Classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(encoder.classes_))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
# If cuda available then use it, otherwise CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Send to device
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [77]:
# Optimizer
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)



### Training

In [78]:
num_epochs = 5

In [79]:
num_training_steps = num_epochs * len(train_loader)

In [80]:
from transformers import get_scheduler

In [81]:
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [88]:
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))

  0%|          | 0/130255 [00:00<?, ?it/s]

In [89]:
model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

KeyError: 270222

In [84]:
assert len(train_encodings['input_ids']) == len(train_labels), "Training encodings and labels length mismatch"
assert len(test_encodings['input_ids']) == len(test_labels), "Test encodings and labels length mismatch"

print(f"Number of training samples: {len(train_encodings['input_ids'])}")
print(f"Number of training labels: {len(train_labels)}")
print(f"Number of test samples: {len(test_encodings['input_ids'])}")
print(f"Number of test labels: {len(test_labels)}")

Number of training samples: 333447
Number of training labels: 333447
Number of test samples: 83362
Number of test labels: 83362
