In [None]:
!pip install transformers



# correct cells

In [12]:
import pandas as pd
import torch
import math
from transformers import AutoTokenizer, AutoModel
from transformers import BertConfig, BertTokenizer
from transformers import BertModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import gc
from transformers import set_seed

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
def make_trait(row):
    row["trait_0"] = 0.0 if row["mbti_result"][0] == "I" else 1.0
    row["trait_1"] = 0.0 if row["mbti_result"][1] == "N" else 1.0
    row["trait_2"] = 0.0 if row["mbti_result"][2] == "T" else 1.0
    row["trait_3"] = 0.0 if row["mbti_result"][3] == "J" else 1.0
    row["tweets_text"] = " ".join(row["tweets"])
    return row

In [15]:
df = pd.read_json("/content/drive/MyDrive/NLP/Project/datasets.json")

In [16]:
df = df.apply(make_trait, axis=1)
df = df.iloc[:400]

In [17]:
from sklearn.model_selection import train_test_split

SEED_NUM = 1234
labels = df["trait_3"]

X_train, X_test, y_train, y_test = train_test_split(df["tweets_text"], labels, test_size=0.2,
                                                                    random_state=SEED_NUM)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25,
                                                                  random_state=SEED_NUM)
X_train = X_train.tolist()
X_test = X_test.tolist()
X_val = X_val.tolist()
y_train = y_train.tolist()
y_test = y_test.tolist()
y_val = y_val.tolist()

In [18]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
SEED_NUM = 1234
MAX_TOKENS = 512
DROPOUT_PROB = 0.1
HIDDEN_SIZE = 768
NUM_LABELS = 2
LAST_NUM_NEURON = 1
NUM_EPOCHS = 8
BATCH_SIZE = 14
torch.manual_seed(SEED_NUM)
set_seed(SEED_NUM)

class TweetsPersonality(nn.Module):


    def __init__(self, model_name):

        super(TweetsPersonality, self).__init__()

        self.bert = BertModel.from_pretrained(model_name)

        self.dropout = nn.Dropout(DROPOUT_PROB)  # dropout layer

        self.classifier = nn.Linear(HIDDEN_SIZE, LAST_NUM_NEURON)  # linear binary classifier layer


    def forward(self, input_ids, attention_mask):

        bert_embeddings = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        sentence_embeddings = bert_embeddings.last_hidden_state[:, 0, :]
        # extract CLS embeddings so from size (batch_size, sequence_length, hidden_size) --> (batch_size, hidden_size)

        sentence_embeddings = self.dropout(sentence_embeddings)  # apply dropout layer

        logits = self.classifier(sentence_embeddings)  # (batch_size, 1)

        logits = torch.sigmoid(logits)

        return logits



In [19]:
torch.cuda.empty_cache()
tokenizer = BertTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased', model_max_length=512)

model = TweetsPersonality('HooshvareLab/bert-fa-base-uncased').to(device)

loss_fn = nn.BCELoss()

# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
print(f"on batch :", end=" ")
for j in range(10):
  print(f"{j} -->", end=" ")



on batch : 0 --> 1 --> 2 --> 3 --> 4 --> 5 --> 6 --> 7 --> 8 --> 9 --> 

In [21]:
from tqdm import tqdm
import random


epochs_loss = {"train": [], "val": [], "test": []}
epochs_acc = {"train": [], "val": [], "test": []}

model.train()

# Train the model
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    indices = list(range(len(X_train)))
    random.shuffle(indices)
    X_train_shuffled = [X_train[i] for i in indices]
    y_train_shuffled = [y_train[i] for i in indices]
    counter = 0
    print("on batch ", end=" ")
    for i in range(0, len(X_train_shuffled), BATCH_SIZE):
        tweets = X_train_shuffled[i:i + BATCH_SIZE]
        labels = torch.tensor(y_train_shuffled[i:i + BATCH_SIZE]).unsqueeze(1).to(device)

        model_inputs = tokenizer(tweets, return_tensors="pt", padding=True, truncation=True).to(device)
        logits = model(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask)

        loss = loss_fn(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        counter += 1

        running_loss += loss.item()
        print(f"{counter} ->", end=" ")
        # break
    print(f"\nEpoch {epoch + 1}/{NUM_EPOCHS} - Loss: {running_loss}")

    with torch.no_grad():
        tmp_X_train = X_train[:30]
        tmp_y_train = y_train[:30]

        labels = torch.tensor(tmp_y_train).unsqueeze(1).to(device)

        model_inputs = tokenizer(tmp_X_train, return_tensors="pt", padding=True, truncation=True).to(device)
        logits = model(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask)
        train_loss = loss_fn(logits, labels)
        train_predicted_labels = logits.round()
        train_accuracy = (labels == train_predicted_labels).float().mean()
        epochs_loss["train"].append(train_loss.to("cpu").item())
        epochs_acc["train"].append(train_accuracy.to("cpu").item())




        tmp_X_val = X_val[:30]
        tmp_y_val = y_val[:30]

        labels = torch.tensor(tmp_y_val).unsqueeze(1).to(device)

        model_inputs = tokenizer(tmp_X_val, return_tensors="pt", padding=True, truncation=True).to(device)
        logits = model(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask)
        val_loss = loss_fn(logits, labels)
        val_predicted_labels = logits.round()
        val_accuracy = (labels == val_predicted_labels).float().mean()
        epochs_loss["val"].append(val_loss.to("cpu").item())
        epochs_acc["val"].append(val_accuracy.to("cpu").item())

        tmp_X_test = X_test[:30]
        tmp_y_test = y_test[:30]

        labels = torch.tensor(tmp_y_test).unsqueeze(1).to(device)

        model_inputs = tokenizer(tmp_X_test, return_tensors="pt", padding=True, truncation=True).to(device)
        logits = model(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask)
        test_loss = loss_fn(logits, labels)
        test_predicted_labels = logits.round()
        test_accuracy = (labels == test_predicted_labels).float().mean()
        epochs_loss["test"].append(test_loss.to("cpu").item())
        epochs_acc["test"].append(test_accuracy.to("cpu").item())

        print(f"Train Loss: {train_loss}, Train Accuracy: {train_accuracy}")
        print(f"Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}")
        print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}\n")

        del model_inputs
        del logits
        del test_loss, train_loss, val_loss
        del test_predicted_labels, val_predicted_labels, train_predicted_labels
        del test_accuracy, train_accuracy, val_accuracy
        gc.collect()

        torch.cuda.empty_cache()
    torch.cuda.empty_cache()


on batch  1 -> 2 -> 3 -> 4 -> 5 -> 6 -> 7 -> 8 -> 9 -> 10 -> 11 -> 12 -> 13 -> 14 -> 15 -> 16 -> 17 -> 18 -> 
Epoch 1/8 - Loss: 15.333462327718735
Train Loss: 0.7940398454666138, Train Accuracy: 0.4333333671092987
Validation Loss: 0.7409650087356567, Validation Accuracy: 0.5666667222976685
Test Loss: 0.7961530685424805, Test Accuracy: 0.5333333611488342

on batch  1 -> 2 -> 3 -> 4 -> 5 -> 6 -> 7 -> 8 -> 9 -> 10 -> 11 -> 12 -> 13 -> 14 -> 15 -> 16 -> 17 -> 18 -> 
Epoch 2/8 - Loss: 13.838119596242905
Train Loss: 0.7207702398300171, Train Accuracy: 0.4333333671092987
Validation Loss: 0.7048749923706055, Validation Accuracy: 0.5666667222976685
Test Loss: 0.7410329580307007, Test Accuracy: 0.5

on batch  1 -> 2 -> 3 -> 4 -> 5 -> 6 -> 7 -> 8 -> 9 -> 10 -> 11 -> 12 -> 13 -> 14 -> 15 -> 16 -> 17 -> 18 -> 
Epoch 3/8 - Loss: 14.05545711517334
Train Loss: 0.6979138255119324, Train Accuracy: 0.4333333671092987
Validation Loss: 0.6796666979789734, Validation Accuracy: 0.5666667222976685
Test Loss: 

In [None]:
torch.save(model.state_dict(), "/content/drive/MyDrive/NLP/BERT_TRUNCATED_MODEL/model.pth")

In [22]:
import json

with open("/content/drive/MyDrive/NLP/outputs_bert_truncated_3.json", "a+", encoding="utf-8") as f:
  results = {"acc" : epochs_acc, "loss": epochs_loss}
  json.dump(results, f)

In [None]:
torch.cuda.empty_cache()

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {
            'text': self.texts.iloc[idx],
            'labels': self.labels.iloc[idx]
        }

In [None]:
!pip install accelerate -U

Collecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3


In [None]:
tokenizer = BertTokenizer.from_pretrained('HooshvareLab/bert-fa-base-uncased', model_max_length=512)
model = TweetsPersonality('HooshvareLab/bert-fa-base-uncased')

Some weights of the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# below cells are alternative way which is not working

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2.13.

In [None]:
from datasets import load_dataset, DatasetDict

In [None]:
small_tokenized_dataset["train"]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 128
})

In [None]:
from datasets import Dataset
df = df.iloc[:100]
df["labels"] = df["trait_0"]
dataset = Dataset.from_pandas(df[['tweets_text', "labels"]])

train_dataset = DatasetDict(
    train=dataset,
)

tokenized_dataset = train_dataset.map(
    lambda example: tokenizer(example['tweets_text'], truncation=True, padding="max_length"),
    batched=True,
    batch_size=16
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["labels"] = df["trait_0"]


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score
import numpy as np

loss_fn = nn.BCELoss()


# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)


# train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# val_dataset = CustomDataset(df, tokenizer, 512)

# val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=True)

def compute_metrics(eval_pred):
    """Called at the end of validation. Gives accuracy"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # TODO
    # calculate the accuracy
    accuracy = accuracy_score(y_true=labels, y_pred=predictions)
    return {"accuracy": accuracy}

arguments = TrainingArguments(
    output_dir="sample_hf_trainer",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    learning_rate=2e-5,
    load_best_model_at_end=True,
    seed=SEED_NUM
)

# tokenized_train = tokenizer(X_train, return_tensors="pt", padding=True, truncation=True)
# tokenized_val = tokenizer(tweets, return_tensors="pt", padding=True, truncation=True)
trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=tokenized_dataset["train"],
    # tokenizer=tokenizer, you can turn this on when padding is not set in tokenizer
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

