In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [None]:
DATA_ROOT = "/content/drive/My Drive/dataset/"
df = pd.read_csv(DATA_ROOT+'toxic_classification_preprocessed.csv',names = ['target','comment_text','preprocessed_text'], skiprows = 1)
#df = df.dropna(axis = 0, how = 'any')
train = df.sample(frac=0.1, random_state=42).reset_index(drop=True)
test = df.head(10000)
class ToxicDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe
        self.texts = self.data.comment_text
        self.labels = self.data.target
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        review = self.texts[idx]
        sentiment = self.labels[idx]
        return {'text': review, 'sentiment': sentiment}


train_dataset = ToxicDataset(train)
test_dataset = ToxicDataset(test)

# class ToxicDataset(Dataset):
#     def __init__(self, csv_file):
#         DATA_ROOT = "/content/drive/My Drive/dataset/"
#         self.data = pd.read_csv(DATA_ROOT+csv_file,names = ['target','comment_text','preprocessed_text'], skiprows = 1)
#         self.data = self.data.dropna(axis = 0, how = 'any')
#         self.data = self.data.sample(frac=0.001, random_state=42).reset_index(drop=True)
#         self.texts = self.data.preprocessed_text
#         self.labels = self.data.target
#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         review = self.texts[idx]
#         sentiment = self.labels[idx]
#         return {'text': review, 'sentiment': sentiment}


In [None]:
test.iloc[139]

target                                                               0
comment_text         I'm wondering why the "parent" went directly t...
preprocessed_text    wonder parent went direct ww instead email tea...
Name: 139, dtype: object

In [None]:
test.iloc[140]

target                     0
comment_text         Me too!
preprocessed_text        NaN
Name: 140, dtype: object

In [None]:
optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.train()



BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
len(test_dataset)

10000

In [None]:
print(f"Train Dataset Length: {len(train_dataset)}")
print(f"Test Dataset Length: {len(test_dataset)}")

Train Dataset Length: 180487
Test Dataset Length: 10000


In [None]:
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
num_epoch = 0
num_batch = 0
for epoch in range(1):  # Adjust the number of training epochs as needed
    model.train()
    num_batch = 0
    for batch in train_loader:
        
        try:
          inputs = tokenizer(batch['text'], padding=True, truncation=True, return_tensors='pt')
        except:
          print(batch['text'])
          print("error length:   ", len(batch['text']))

        #inputs = tokenizer(batch['text'], padding=True, truncation=True, return_tensors='pt')
    
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = batch['sentiment'].to(device)

        model.zero_grad()

        optimizer.zero_grad()
        outputs = model(**inputs)
        #print(labels)
        loss = loss_fn(outputs.logits, labels)
        loss.backward()

        
        optimizer.step()
        print("end: ", num_batch)
        num_batch += 1
        
# Step 6: Evaluation
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model.eval()

true_labels = []
predicted_labels = []

with torch.no_grad():
    for batch in test_loader:
        inputs = tokenizer(batch['text'], padding=True, truncation=True, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        labels = batch['sentiment'].to(device)

        outputs = model(**inputs)
        _, predicted = torch.max(outputs.logits, 1)

        true_labels.extend(labels.tolist())
        predicted_labels.extend(predicted.tolist())

classification_rep = classification_report(true_labels, predicted_labels)
print(f'Classification Report:\n{classification_rep}')


end:  0
end:  1
end:  2
end:  3
end:  4
end:  5
end:  6
end:  7
end:  8
end:  9
end:  10
end:  11
end:  12
end:  13
end:  14
end:  15
end:  16
end:  17
end:  18
end:  19
end:  20
end:  21
end:  22
end:  23
end:  24
end:  25
end:  26
end:  27
end:  28
end:  29
end:  30
end:  31
end:  32
end:  33
end:  34
end:  35
end:  36
end:  37
end:  38
end:  39
end:  40
end:  41
end:  42
end:  43
end:  44
end:  45
end:  46
end:  47
end:  48
end:  49
end:  50
end:  51
end:  52
end:  53
end:  54
end:  55
end:  56
end:  57
end:  58
end:  59
end:  60
end:  61
end:  62
end:  63
end:  64
end:  65
end:  66
end:  67
end:  68
end:  69
end:  70
end:  71
end:  72
end:  73
end:  74
end:  75
end:  76
end:  77
end:  78
end:  79
end:  80
end:  81
end:  82
end:  83
end:  84
end:  85
end:  86
end:  87
end:  88
end:  89
end:  90
end:  91
end:  92
end:  93
end:  94
end:  95
end:  96
end:  97
end:  98
end:  99
end:  100
end:  101
end:  102
end:  103
end:  104
end:  105
end:  106
end:  107
end:  108
end:  109
end:  110


OutOfMemoryError: ignored

In [None]:
test_loader[140]