#1. Library imports

In [1]:
!pip install -q transformers
!pip install -q datasets

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

import tqdm
import nltk
import torch
import numpy as np
import pandas as pd
from torch import nn
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import confusion_matrix, classification_report
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
import transformers

In [5]:
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import tqdm
from sklearn.metrics import confusion_matrix, classification_report

# 2. Data Imports

In [6]:
def load_data(split_name='train', columns=['text', 'stars'], folder='gdrive/MyDrive/COMP_4332/Project1/data'):
        df = pd.read_csv(f'{folder}/{split_name}.csv')
        return df

In [7]:
train_df = load_data('train', columns=['text', 'stars'])
valid_df = load_data('valid', columns=['text', 'stars'])
# the test set labels (the 'stars' column) are not available! So the following code will instead return all columns
test_df = load_data('test', columns=['text', 'stars'])

In [8]:
train_df = train_df[["text","stars"]]
valid_df = valid_df[["text","stars"]]

# MODEL BERT

In [None]:
#train_df.head()
#valid_df.head()
#test_df.head()

##Tokenizer

In [None]:
#default checkpoint for sentiment-analysis = distilbert-base-uncased-finetuned-sst-2-english
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
#Testing the tokenizer
tokenizer("Here is the example of tokenizer using BERT")
#Save the pretrained tokenizer (memory save)
tokenizer.save_pretrained("/gdrive/MyDrive/COMP_4332/Project1/cache")

('/gdrive/MyDrive/COMP_4332/Project1/cache/tokenizer_config.json',
 '/gdrive/MyDrive/COMP_4332/Project1/cache/special_tokens_map.json',
 '/gdrive/MyDrive/COMP_4332/Project1/cache/vocab.txt',
 '/gdrive/MyDrive/COMP_4332/Project1/cache/added_tokens.json',
 '/gdrive/MyDrive/COMP_4332/Project1/cache/tokenizer.json')

In [None]:
train_text_list = list(train_df["text"])
tokens = tokenizer(train_text_list, padding=True, truncation=True, return_tensors="tf")

In [None]:
#Sampling 2 for experiment only
small_train_dataset = train_text_list[:2]
small_tokens = tokenizer(small_train_dataset, padding=True, truncation=True, return_tensors="tf")

Checking for the input IDs and the attention masks

In [None]:
#small_tokens.input_ids
#small_tokens.attention_mask

## Tokenizer - REAL

In [26]:
#default checkpoint for sentiment-analysis = distilbert-base-uncased-finetuned-sst-2-english
from transformers import AutoTokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

### Custom Dataset (Dict of Dict)

In [None]:
train_dict = []
for i, j in zip(list(train_df["stars"]),list(train_df["text"])):
    train_dict.append({"label": i, "text": j})
valid_dict = []
for i, j in zip(list(valid_df["stars"]),list(valid_df["text"])):
    valid_dict.append({"label": i, "text": j})

In [None]:
MyDataset = {}
MyDataset["train"] = train_dict
MyDataset["validation"] = valid_dict
#MyDataset["test"] = 

In [None]:
MyDataset["train"][0]

{'label': 5,
 'text': "I've been here a handful of times now and I've never been disappointed.  The food is always good and the servers are quick.   So far my two favorite items are the Peppersauce Burger with pastrami and the Peppersauce Patty.  Even as I type this my mouth is watering and I just had the Peppersauce Burger.  \n\nThe burgers are well done and still juicy!  I always leave stuffed and happy.  The burgers can be a little on the greasy side, need two or three napkins.  I've also had them when you only needed on napkin to clean up.  Either way it was still tasty!\n\nI've seen a couple of people get salads and they are huge and look good.\n\nThe servers have always been friendly even when it was really busy."}

### MyDataset Class

In [None]:
def tokenize_function(x):
    return tokenizer(x, truncation=True)

def tokenize_df(df):
  tokens = df['text'].map(tokenize_function)
  df['input_ids'] = [x['input_ids'] for x in tokens]
  df['attention_mask'] = [x['attention_mask'] for x in tokens]
  df.rename(columns={"stars":"labels"},inplace=True)
  return df

train_df = tokenize_df(train_df)
valid_df = tokenize_df(valid_df)
test_df = tokenize_df(test_df)

In [None]:
class MyDataset(Dataset):
    def __init__(self, df):
        assert len(df['input_ids']) == len(df['labels'])
        self.input_ids = df['input_ids']
        self.attention_mask = df['attention_mask']
        self.label = df['labels']-1
    
    def __getitem__(self, idx):
        return np.asarray(self.input_ids[idx]), np.asarray(self.attention_mask[idx]), self.label[idx]

    def __len__(self):
        return len(self.label)

In [None]:
batch_size = 2
train_dataloader = DataLoader(MyDataset(train_df), batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(MyDataset(valid_df), batch_size=batch_size)

### DataCollator

In [27]:
import datasets
import pandas as pd
import tensorflow as tf
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [28]:
def tokenize_function(x):
    return tokenizer(x, truncation=True)

def tokenize_df(df):
  tokens = df['text'].map(tokenize_function)
  df['input_ids'] = [x['input_ids'] for x in tokens]
  df['attention_mask'] = [x['attention_mask'] for x in tokens]
  df['token_type_ids'] = [x['token_type_ids'] for x in tokens]
  df.rename(columns={"stars":"labels"},inplace=True)
  return df

train_df = tokenize_df(train_df)
valid_df = tokenize_df(valid_df)
test_df = tokenize_df(test_df)

In [29]:
print(train_df.columns)
print(valid_df.columns)
print(test_df.columns)

Index(['text', 'labels', 'input_ids', 'attention_mask', 'token_type_ids'], dtype='object')
Index(['text', 'labels', 'input_ids', 'attention_mask', 'token_type_ids'], dtype='object')
Index(['business_id', 'cool', 'date', 'funny', 'review_id', 'text', 'useful',
       'user_id', 'input_ids', 'attention_mask', 'token_type_ids'],
      dtype='object')


In [17]:
"""
train_dict = []
for i,j,k,l in zip(list(train_df["labels"]),list(train_df["text"]),list(train_df["input_ids"]),list(train_df["attention_mask"])):
    train_dict.append({"label": i, "text": j, "input_ids":k, "attention_mask":l})
valid_dict = []
for i,j,k,l in zip(list(valid_df["labels"]),list(valid_df["text"]),list(valid_df["input_ids"]),list(valid_df["attention_mask"])):
    valid_dict.append({"label": i, "text": j, "input_ids":k, "attention_mask":l})
"""

CUDA 터져서 일단 기본적으로 이렇게

In [30]:
small_train = train_df.head(100)
small_validate = valid_df.head(100)

In [14]:
import tempfile
import pathlib
import pyarrow as pa
import pyarrow.parquet as pq

In [31]:
#table_train = pa.table({'label': list(small_train["labels"]), 'text': list(small_train["text"]), 'input_ids':list(small_train["input_ids"]), 'attention_mask':list(small_train["attention_mask"]),'token_type_ids':list(small_train["token_type_ids"]) })
#table_validate = pa.table({'label': list(small_validate["labels"]), 'text': list(small_validate["text"]), 'input_ids':list(small_validate["input_ids"]), 'attention_mask':list(small_validate["attention_mask"]),'token_type_ids':list(small_validate["token_type_ids"])})

In [40]:
table_train = pa.table({'label': list(small_train["labels"]), 'input_ids':list(small_train["input_ids"]), 'attention_mask':list(small_train["attention_mask"]),'token_type_ids':list(small_train["token_type_ids"]) })
table_validate = pa.table({'label': list(small_validate["labels"]),'input_ids':list(small_validate["input_ids"]), 'attention_mask':list(small_validate["attention_mask"]),'token_type_ids':list(small_validate["token_type_ids"])})

In [41]:
#training = datasets.DatasetDict({"labels":list(train_df["labels"]), "text": list(train_df["text"])})
training = datasets.Dataset(table_train)
valid = datasets.Dataset(table_validate)

In [42]:
training.features
valid.features

{'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [43]:
MyDataset = datasets.DatasetDict({"train":training,"validation":valid})

In [44]:
MyDataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'token_type_ids'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['label', 'input_ids', 'attention_mask', 'token_type_ids'],
        num_rows: 100
    })
})

In [74]:
#x = MyDataset.set_format("torch")

In [45]:
train_dataloader = DataLoader(
    MyDataset["train"], shuffle=True, batch_size=4, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    MyDataset["validation"], batch_size=4, collate_fn=data_collator
)

In [46]:
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

{'attention_mask': TensorShape([4, 167]),
 'input_ids': TensorShape([4, 167]),
 'labels': TensorShape([4]),
 'token_type_ids': TensorShape([4, 167])}

## Model - sequence

In [92]:
from transformers import TFAutoModelForSequenceClassification
checkpoint = "bert-base-cased"
bert_model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-cased/resolve/main/tf_model.h5 in cache at /root/.cache/huggingface/transformers/01800f4158e284e2447020e0124bc3f6aea3ac49848e744594f7cce8ee5ac0a4.a7137b2090d9302d722735af604b4c142ec9d1bfc31be7cbbe230aea9d5cfb76.h5
creating metadata file for /root/.cache/huggingface/transformers/01800f4158e284e2447020e0124bc3f6aea3ac49848e744594f7cce8ee5ac0a4.a7137b2090d9302d722735af604b4c142ec9d1bfc31be7cbbe230aea9d5cfb76.h5
loading weights file https://huggingface.co/bert-base-cased/resolve/main/tf_model.h5 from cache at /root/.cache/huggingface/transformers/01800f4158e284e2447020e0124bc3f6aea3ac49848e744594f7cce8ee5ac0a4.a7137b2090d9302d722735af604b4c142ec9d1bfc31be7cbbe230aea9d5cfb76.h5
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this m

In [93]:
outputs = bert_model(**batch)
print(outputs.loss)
print(outputs.logits.shape)


tf.Tensor([1.9614848       nan 1.2173408 1.5471948], shape=(4,), dtype=float32)
(4, 5)


## Optimize

In [94]:
tf_train_dataset = MyDataset["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=4,
)

tf_validation_dataset = MyDataset["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=4,
)

In [95]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay

batch_size = 4
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

opt = Adam(learning_rate=lr_scheduler)
print(num_train_steps)


"""
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
"""

75


'\nlr_scheduler = get_scheduler(\n    "linear",\n    optimizer=optimizer,\n    num_warmup_steps=0,\n    num_training_steps=num_training_steps,\n)\n'

## Training

In [96]:
small_train_dataset = MyDataset["train"].shuffle(seed=1002).select(range(10))
small_eval_dataset = MyDataset["validation"].shuffle(seed=1002).select(range(10))
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [97]:
from torch.optim import AdamW
optimizer = AdamW(bert_model.parameters(), lr=5e-5)
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

AttributeError: ignored

In [84]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [85]:
num_epochs

3

In [87]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/6 [00:00<?, ?it/s]

RuntimeError: ignored

In [88]:
from datasets import load_metric
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

train_args = TrainingArguments(output_dir="test_trainer",
                               evaluation_strategy = 'epoch')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

metric = load_metric("accuracy")

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

AttributeError: ignored

In [70]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_train_steps))

bert_model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v for k, v in batch.items()}
        outputs = bert_model(**batch)
        loss = outputs.loss
        loss.backward()

        opt.step()
        lr_scheduler.step()
        opt.zero_grad()
        progress_bar.update(1)

  0%|          | 0/75 [00:00<?, ?it/s]

AttributeError: ignored

In [34]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 3
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fc8bc9b0590>

In [35]:
preds = model.predict(tf_validation_dataset)["logits"]
class_preds = np.argmax(preds, axis=1)
print(preds.shape, class_preds.shape)

(5, 2) (5,)


## Model

In [71]:
from transformers import TFAutoModel
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = TFAutoModel.from_pretrained(checkpoint)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertModel: ['dropout_19', 'pre_classifier', 'classifier']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [72]:
model

<transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertModel at 0x7fbf60488e50>

In [None]:
#train_text = model(**small_tokens)

Configuration

In [None]:
from transformers import AutoConfig
bert_config = AutoConfig.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
# SAME AS Building the config directly
#bert_config = BertConfig()
type(bert_config)

transformers.models.distilbert.configuration_distilbert.DistilBertConfig

# Optimize & Tune

## Training by batch

In [None]:

"""
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
labels = tf.convert_to_tensor([1, 1])
model.train_on_batch(batch, labels)
"""

0.604586124420166

## Pre-process texts

In [None]:
stopword_list = set(stopwords.words('english'))
ps = PorterStemmer()

def lower(s):
    """
    :param s: a string.
    return a string with lower characters
    Note that we allow the input to be nested string of a list.
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: 'text mining is to identify useful information.'
    """
    if isinstance(s, list):
        return [lower(t) for t in s]
    if isinstance(s, str):
        return s.lower()
    else:
        raise NotImplementedError("unknown datatype")


def tokenize(text):
    """
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    return nltk.word_tokenize(text)


def stem(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of stemmed words, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     results.append(ps.stem(token))
    # return results

    return [ps.stem(token) for token in tokens]

def n_gram(tokens, n=1):
    """
    :param tokens: a list of tokens, type: list
    :param n: the corresponding n-gram, type: int
    return a list of n-gram tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.'], 2
    Output: ['text mine', 'mine is', 'is to', 'to identifi', 'identifi use', 'use inform', 'inform .']
    """
    if n == 1:
        return tokens
    else:
        results = list()
        for i in range(len(tokens)-n+1):
            # tokens[i:i+n] will return a sublist from i th to i+n th (i+n th is not included)
            results.append(" ".join(tokens[i:i+n]))
        return results

def filter_stopwords(tokens):
    """
    :param tokens: a list of tokens, type: list
    return a list of filtered tokens, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    ### equivalent code
    # results = list()
    # for token in tokens:
    #     if token not in stopwords and not token.isnumeric():
    #         results.append(token)
    # return results

    return [token for token in tokens if token not in stopword_list and not token.isnumeric()]

import numpy as np

def get_onehot_vector(feats, feats_dict):
    """
    :param data: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    return a feature vector,
    """
    # initialize the vector as all zeros
    vector = np.zeros(len(feats_dict), dtype=np.float)
    for f in feats:
        # get the feature index, return -1 if the feature is not existed
        f_idx = feats_dict.get(f, -1)
        if f_idx != -1:
            # set the corresponding element as 1
            vector[f_idx] = 1
    return vector

In [None]:
train_df['tokens'] = train_df['text'].map(tokenize).map(filter_stopwords).map(lower)
valid_df['tokens'] = valid_df['text'].map(tokenize).map(filter_stopwords).map(lower)
test_df['tokens'] = test_df['text'].map(tokenize).map(filter_stopwords).map(lower)

In [None]:
train_Dataloader = DataLoader(MyDataset(x_train, y_train), batch_size=4,shuffle=True)
valid_Dataloader = DataLoader(MyDataset(valid_x, valid_y), batch_size=4)

# 4. Advanced Model (with finetuning)

In [None]:
train_df.head()
valid_df.head()
#test_df.head()

Unnamed: 0,text,stars
0,We came in today during closing & they still a...,5
1,"Tiny, but casual location for breakfast/brunch...",4
2,We keep going to the same plaza to eat pizza b...,4
3,Tim Hortons is the epitome of Canadian mediocr...,1
4,Workers here are very friendly and they know a...,5


In [None]:
from datasets import load_metric
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

finetune_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
train_args = TrainingArguments(output_dir="test_trainer",
                               evaluation_strategy = 'epoch')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

metric = load_metric("accuracy")

trainer = Trainer(
    model=finetune_bert,
    args=train_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)
trainer.train()

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

storing https://huggingface.co/bert-base-cased/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/092cc582560fc3833e556b3f833695c26343cb54b7e88cd02d40821462a74999.1f48cab6c959fc6c360d22bea39d06959e90f5b002e77e836d2da45464875cda
creating metadata file for /root/.cache/huggingface/transformers/092cc582560fc3833e556b3f833695c26343cb54b7e88cd02d40821462a74999.1f48cab6c959fc6c360d22bea39d06959e90f5b002e77e836d2da45464875cda
loading weights file https://huggingface.co/bert-base-cased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/092cc582560fc3833e556b3f833695c26343cb54b7e88cd02d40821462a74999.1f48cab6c959fc6c360d22bea39d06959e90f5b002e77e836d2da45464875cda
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.trans

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

RuntimeError: ignored

### Custom Dataset

In [None]:
train_dict = []
for i, j in zip(list(train_df["stars"]),list(train_df["text"])):
    train_dict.append({"label": i, "text": j})
valid_dict = []
for i, j in zip(list(valid_df["stars"]),list(valid_df["text"])):
    valid_dict.append({"label": i, "text": j})

In [None]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_texts = list(train_df["text"])
valid_texts = list(valid_df["text"])
test_texts = list(test_df["text"])

In [None]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(valid_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [None]:
import torch

class TestDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_label = train_df["stars"]
valid_label = valid_df["stars"]

train_dataset = TestDataset(train_encodings, train_label)
val_dataset = TestDataset(val_encodings, valid_label)
#test_dataset = TestDataset(test_encodings, test_labels)

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

finetune_distilbert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased') 

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=finetune_distilbert,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

RuntimeError: ignored

In [None]:
import torch

class MyDataset:
    def __init__(self, df):
        self.label = list(df['stars'])
        self.text = list(df['text'])
    def __getitem__(self,idx):
      return np.asarray(self.label[idx]), np.asarray(self.text[idx])
      return item
    def __len__(self):
      return len(self.label)

"""
import torch

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
"""

new_train = MyDataset(train_df)

In [None]:
#new_train.text
#new_train.label

### Failed Attempts (Running out of GPU)

In [None]:
from datasets import load_metric
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

finetune_bert = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
train_args = TrainingArguments(output_dir="test_trainer",
                               evaluation_strategy = 'epoch')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

metric = load_metric("accuracy")

trainer = Trainer(
    model=finetune_bert,
    args=train_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)


Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

In [None]:
finetune_bert