In [2]:
# !pip install transformers
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments

In [3]:
combined_data = pd.read_csv('combined_data.csv')
# combined_data = pd.read_csv('combined_data_sentence_broken.csv')
combined_data.head()

Unnamed: 0,link,subject,name,count,class,text
0,https://forge.medium.com/you-dont-need-more-mo...,motivation,0.txt,358,0,'one greatest talents has always been coming w...
1,https://medium.com/swlh/theres-no-such-thing-a...,motivation,1.txt,1243,0,"highly motivated.', don’t have amazing willpow..."
2,https://medium.com/the-mission/the-most-motiva...,motivation,2.txt,639,0,motivational statement comes down three words:...
3,https://medium.com/swlh/how-to-make-yourself-w...,motivation,3.txt,884,0,"break the chain.”', 'these four simple words h..."
4,https://betterhumans.pub/how-to-do-a-life-chan...,motivation,4.txt,980,0,'when most people think accountability partner...


In [4]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", do_lower_case=True)


In [5]:
train_text, valid_text, train_labels, val_labels = train_test_split(combined_data['text'].tolist(), combined_data['class'].tolist(), 
                                                                    test_size=0.2)



In [6]:
# tokenize the dataset, truncate when passed `max_length`, 
# and pad with 0's when less than `max_length`
max_length = 512
train_encodings = tokenizer(train_text, truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(valid_text, truncation=True, padding=True, max_length=max_length)

In [7]:
class myDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = myDataset(train_encodings, train_labels)
val_dataset = myDataset(val_encodings, val_labels)

In [8]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
training_args = TrainingArguments(
    output_dir='./results',          
    evaluation_strategy="epoch",     
    num_train_epochs=20,             
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,   
    warmup_steps=500,               
    weight_decay=0.03,              
    save_total_limit=1,             
)


trainer = Trainer(
    model=model,                    
    args=training_args,             
    train_dataset=train_dataset,    
    eval_dataset=val_dataset,       
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.716219
2,No log,0.687508
3,No log,0.670945
4,No log,0.621141
5,No log,0.459523
6,No log,0.357964
7,No log,0.296726
8,No log,0.198959
9,No log,0.158584
10,No log,0.123078


TrainOutput(global_step=280, training_loss=0.21137819290161133, metrics={'train_runtime': 272.4645, 'train_samples_per_second': 7.928, 'train_steps_per_second': 1.028, 'total_flos': 726481798594560.0, 'train_loss': 0.21137819290161133, 'epoch': 20.0})

In [36]:
trainer.evaluate()

{'epoch': 20.0,
 'eval_loss': 0.0958588570356369,
 'eval_runtime': 1.0078,
 'eval_samples_per_second': 27.782,
 'eval_steps_per_second': 3.969}

In [42]:
model.save_pretrained("/content/gdrive/MyDrive/models/bert_classification_lm")

In [40]:
# model.from_pretrained("/content/gdrive/MyDrive/models/bert_classification_lm")

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [49]:
# !zip "content/bert_classification_lm.zip" "content/gdrive/My Drive/models/bert_classification_lm"
!cp "/content/gdrive/MyDrive/models/bert_classification_lm" "content/gdrive/My Drive/models"

cp: -r not specified; omitting directory '/content/gdrive/MyDrive/models/bert_classification_lm'
