<a href="https://colab.research.google.com/github/SiriRRR/590-project/blob/main/Copy_of_classification_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from datasets import load_dataset
from collections import Counter
import pandas as pd
import torch
from transformers.tokenization_utils_base import BatchEncoding
from typing import List, Tuple
from tqdm import tqdm
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification,AdamW, get_linear_schedule_with_warmup

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

In [11]:
labels = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech",
}
 
K = len(labels)

dataset = load_dataset("ag_news")
dataset



  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})

In [12]:
batch_size = 8
epochs = 4
lr_init = 5e-5
max_len = 256
warmup_steps = 3

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

In [13]:
def batch_data(data: pd.DataFrame, bsize: int) -> List[Tuple[BatchEncoding, List[int], List[str]]]:
    lst = []
    l = len(data)//bsize
    for i in range(l):
        batch_text = data['text'][bsize*i:bsize*(i+1)].tolist()
        X = tokenizer.batch_encode_plus(batch_text, truncation =True, padding= 'max_length',max_length = max_len, add_special_tokens=True, return_tensors='pt')
        Y = torch.LongTensor(data['label'][bsize*i:bsize*(i+1)].tolist())
        s = batch_text
        lst.append((X,Y,s))
    return lst

In [14]:
# subset
df_train = dataset['train'].to_pandas().sample(frac=1).reset_index(drop=True)
df_test = dataset['test'].to_pandas().sample(frac=1).reset_index(drop=True)

In [15]:
train_batches = batch_data(df_train, bsize=batch_size)
test_batches = batch_data(df_test, bsize=batch_size)

In [16]:
# Transformer model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels=K, output_hidden_states=True)

# The torch `device` on which to execute the model computation
if torch.cuda.is_available():
    device = torch.device('cuda:0') # GPU
else:
    device = torch.device('cpu') # CPU
model.to(device)

# The gradient descent optimizer used for fine tuning
optimizer = AdamW(model.parameters(), lr=lr_init)

# The gradient descent learning rate
lr = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, 
    num_training_steps=len(train_batches))

Downloading:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier

In [18]:
from datasets import load_metric
from tqdm.auto import tqdm


def runner(batches, train=True):
    if train == True:
        # train loop
        model.train()
        
        for epoch in range(epochs):

            progress_bar = tqdm(range(len(batches))) # add tqdm bar
            a = load_metric("accuracy") # load accuracy metric
            prediction = []
            acc = []
            loss_lst = []
            

            for item in batches:
                # get each batch from batches
                batch = {'input_ids':torch.as_tensor(item[0]['input_ids'], device=device),
                         'attention_mask':torch.as_tensor(item[0]['attention_mask'], device=device),
                         'labels': torch.as_tensor(item[1], device=device)}
                
                with torch.enable_grad():
                    
                    # if train is true, so this gradient descent process
                    outputs = model(**batch)
                    loss = outputs.loss
                    loss.backward()
                    optimizer.step()
                    lr.step()
                    optimizer.zero_grad()
                    
                    progress_bar.update(1) # update progree bar
                
                # add the model outputs to corresponding results list
                predict_label = torch.argmax(outputs.logits, dim=1) # get the label with the maximum probability
                prediction.append(predict_label)
                loss_lst.append(loss)
                acc.append(a.compute(predictions = predict_label, references = item[1])['accuracy'])
           
            # compute mean metrics computed over data in batches
            mean_loss = float(sum(loss_lst)/len(loss_lst))
            mean_accuracy = sum(acc)/len(acc)

        
        
    else:
        
        # testing loop
        model.eval()
        
        for epoch in range(epochs):

            progress_bar = tqdm(range(len(batches))) # add tqdm bar
            a = load_metric("accuracy") # load accuracy metric
            prediction = []
            acc = []
            loss_lst = []
            

            for item in batches:
                # get each batch from batches
                batch = {'input_ids':torch.as_tensor(item[0]['input_ids'], device=device),
                         'attention_mask':torch.as_tensor(item[0]['attention_mask'], device=device),
                         'labels': torch.as_tensor(item[1], device=device)}
                
                # if train is not true, just get the results from the current model
                with torch.no_grad():
                    outputs = model(**batch)
                    
                    # update progree bar
                    progress_bar.update(1)
                
                # add the model outputs to corresponding results list
                predict_label = torch.argmax(outputs.logits, dim=1) # get the label with the maximum probability
                prediction.append(predict_label)
                loss_lst.append(outputs.loss)
                acc.append(a.compute(predictions = predict_label, references = item[1])['accuracy'])
            
            # compute mean metrics computed over data in batches
            mean_loss = float(sum(loss_lst)/len(loss_lst))
            mean_accuracy = sum(acc)/len(acc)
            
        
    return (prediction,mean_loss, mean_accuracy)

In [19]:
train_results = runner(train_batches, train = True)

  0%|          | 0/15000 [00:00<?, ?it/s]

  del sys.path[0]


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

  0%|          | 0/15000 [00:00<?, ?it/s]

  0%|          | 0/15000 [00:00<?, ?it/s]

  0%|          | 0/15000 [00:00<?, ?it/s]

In [26]:
print('Traning Set:')

print('loss:',train_results[1])
print('accuracy:','{:.2f}'.format(train_results[2]*100) + '%')

Traning Set:
loss: 0.12471339851617813
accuracy: 95.81%


In [21]:
test_results = runner(test_batches, train = False)

  0%|          | 0/950 [00:00<?, ?it/s]

  0%|          | 0/950 [00:00<?, ?it/s]

  0%|          | 0/950 [00:00<?, ?it/s]

  0%|          | 0/950 [00:00<?, ?it/s]

In [27]:
print('Testing Set:')

print('loss:',test_results[1])
print('accuracy:','{:.2f}'.format(test_results[2]*100) + '%')

Testing Set:
loss: 0.17026114463806152
accuracy: 94.29%


In [None]:
# connect to hugging face hub
!huggingface-cli login

In [None]:
!pip install huggingface_hub

In [30]:
from huggingface_hub import notebook_login

notebook_login()

Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.huggingface/token
Login successful


In [31]:
# push the trained model to hub
model.push_to_hub("SiriRRR/news-classification")

CommitInfo(commit_url='https://huggingface.co/SiriRRR/news-classification/commit/4996cdfc6d97a8883dce5e60deda90d2c2d656d8', commit_message='Upload DistilBertForSequenceClassification', commit_description='', oid='4996cdfc6d97a8883dce5e60deda90d2c2d656d8', pr_url=None, pr_revision=None, pr_num=None)

In [32]:
#push the tokenizer to hub
tokenizer.push_to_hub("SiriRRR/news-classification")

CommitInfo(commit_url='https://huggingface.co/SiriRRR/news-classification/commit/f2b18ed358e1ae459cfcf3c104ce5b81c6989a27', commit_message='Upload tokenizer', commit_description='', oid='f2b18ed358e1ae459cfcf3c104ce5b81c6989a27', pr_url=None, pr_revision=None, pr_num=None)

In [54]:
# load the pushed model to check
mymodel = DistilBertForSequenceClassification.from_pretrained('SiriRRR/news-classification', num_labels=4, output_hidden_states=True)