In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.1 MB/s[0m eta [36m0:00:0

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import random
top_k = 2500
def make_trait(row):
    row["trait_0"] = 0.0 if row["mbti_result"][0] == "I" else 1.0
    row["trait_1"] = 0.0 if row["mbti_result"][1] == "N" else 1.0
    row["trait_2"] = 0.0 if row["mbti_result"][2] == "T" else 1.0
    row["trait_3"] = 0.0 if row["mbti_result"][3] == "J" else 1.0
    tweets = row["tweets"]
    tweets_length = len(tweets)
    if tweets_length > top_k:
      row["tweets"] = random.sample(tweets, top_k)
    return row

In [4]:
import pandas as pd

In [5]:
df = pd.read_json("/content/drive/MyDrive/NLP/Project/datasets.json")

In [6]:
df.groupby("mbti_result").count()

Unnamed: 0_level_0,twitter_id,tweets
mbti_result,Unnamed: 1_level_1,Unnamed: 2_level_1
ENFJ,164,164
ENFP,284,284
ENTJ,276,276
ENTP,264,264
ESFJ,60,60
ESFP,75,75
ESTJ,186,186
ESTP,99,99
INFJ,304,304
INFP,524,524


In [7]:
from sklearn.utils import resample

upsampling_labels = {"ESFJ" : 100, "ESFP" : 120, "ISFJ" : 120, "ISFP" : 120, "ESTP" : 130}
unique_labels = df["mbti_result"].unique()
manipulated_labels = []
for label in unique_labels:
    labels_df = df[df["mbti_result"] == label]
    if label in upsampling_labels:
        # do upsampling
        labels_df = resample(labels_df,
             replace=True,
             n_samples=upsampling_labels[label],
             random_state=1234)
    manipulated_labels.append(labels_df)

upsampled_df = pd.concat(manipulated_labels)

In [8]:
upsampled_df.groupby("mbti_result").count()

Unnamed: 0_level_0,twitter_id,tweets
mbti_result,Unnamed: 1_level_1,Unnamed: 2_level_1
ENFJ,164,164
ENFP,284,284
ENTJ,276,276
ENTP,264,264
ESFJ,100,100
ESFP,120,120
ESTJ,186,186
ESTP,130,130
INFJ,304,304
INFP,524,524


In [9]:
df = upsampled_df
df = df.apply(make_trait, axis=1)
# df = df.iloc[:400]

In [10]:
from sklearn.model_selection import train_test_split

SEED_NUM = 1234
labels = df["trait_0"]

X_train, X_test, y_train, y_test = train_test_split(df["tweets"], labels, test_size=0.25,
                                                                    random_state=SEED_NUM)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25,
                                                                  random_state=SEED_NUM)
X_train = X_train.tolist()
X_test = X_test.tolist()
X_val = X_val.tolist()
y_train = y_train.tolist()
y_test = y_test.tolist()
y_val = y_val.tolist()

In [11]:
import torch
import math
from transformers import AutoTokenizer, AutoModel
from transformers import BertConfig, BertTokenizer
from transformers import BertModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import gc
from transformers import set_seed

SEED_NUM = 1234
MAX_TOKENS = 512
DROPOUT_PROB = 0.1
HIDDEN_SIZE = 768
NUM_LABELS = 2
LAST_NUM_NEURON = 1
NUM_EPOCHS = 1
BATCH_SIZE = 32
torch.manual_seed(SEED_NUM)
set_seed(SEED_NUM)
# torch.backends.cudnn.deterministic = True

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

class BatchTokenizerDataset(Dataset):
    def __init__(self, input_ids, attention_mask):
        self.input_ids = input_ids
        self.attention_mask = attention_mask

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

class TweetsPersonality(nn.Module):


    def __init__(self, model_name):
        super(TweetsPersonality, self).__init__()


        self.tweets_bert = BertModel.from_pretrained(model_name)  # first BERT model which is feature extractor of a single user tweets

        for param in self.tweets_bert.parameters():
            param.requires_grad = False

        self.embedding_bert = BertModel.from_pretrained(model_name)  # second BERT model which takes as input embedding of batch of tweets


        self.dropout = nn.Dropout(DROPOUT_PROB)  # dropout layer

        self.classifier = nn.Linear(HIDDEN_SIZE, LAST_NUM_NEURON)  # linear binary classifier layer

    def handle_limit_tokens_size(self, embeds):
        tweets_len = embeds.shape[0]
        if tweets_len > MAX_TOKENS:
            merge_num = math.ceil(tweets_len / MAX_TOKENS)  # each merge_num tweets are averaged together
            # for example if it's 4, every 4 tweets are average embeds[:4], embeds[4:8], ...


            reshaped_main_tweets = embeds[:merge_num * (tweets_len // merge_num)].view((tweets_len // merge_num), merge_num, -1).mean(axis=1)
            # for example if MAX_TOKEN = 500 and we have 1602 tweets --> merge_num = 4, (tweets_len // merge_num) = 400 , merge_num * (tweets_len // merge_num) = 1600
            # reshaped to (tweets_len // merge_num, merge_num, 768) and the            # get mean which results in (tweets_len // merge_num, 768)

            remainder_tweets = embeds[merge_num * (tweets_len // merge_num) : tweets_len]
            # we are getting the two remainder from 1600 to 1602

            if len(remainder_tweets) > 0:
                reshaped_remainder_tweets = remainder_tweets.mean(axis=0).unsqueeze(0)  # get mean which results in (1, 768)
                return torch.cat((reshaped_main_tweets, reshaped_remainder_tweets))

            return reshaped_main_tweets


        return embeds


    def forward(self, input_ids, attention_mask):
        # add for here for batch_size of 32

        batch_embeddings = []
        total_tweets_length = input_ids.shape[0]
        # print(f"batch_size : {BATCH_SIZE}, number of batch : {math.ceil(total_tweets_length / BATCH_SIZE)}")
        tokenized_dataset = BatchTokenizerDataset(input_ids, attention_mask)
        data_loader = DataLoader(tokenized_dataset, batch_size=BATCH_SIZE, shuffle=True)

        counter = 0
        # print("in batch", end=" ")
        for input_ids_batch, attention_mask_batch in data_loader:
            extracted_embeddings = self.tweets_bert(input_ids=input_ids_batch, attention_mask=attention_mask_batch)  # applying first BERT

            sentences_embedding = extracted_embeddings.last_hidden_state  # last layer of BERT(extract all tweets for a user embeddings)

            CLS_embeddings = sentences_embedding[:, 0, :]  # extract CLS layer
            # (batch_size, tokens, hidden_size) reshaped to (batch_size, hidden_size)


            CLS_embeddings = CLS_embeddings.unsqueeze(0)  # add one outer dim for creating batch_size of length 1. (1, MAX_TOKENS, HIDDEN_SIZE)

            last_embeddings = self.embedding_bert(inputs_embeds=CLS_embeddings)  # apply second BERT to extract final embeddings

            CLS_embeddings = last_embeddings.last_hidden_state[:, 0, :]  # extract the CLS token of second BERT

            batch_embeddings.append(CLS_embeddings)
            # batch_embeddings = torch.cat((batch_embeddings, CLS_embeddings.to("cpu"))).to("cpu")
            counter += 1

            # print(f"{counter} ->", end=" ")
            # print(f"batch_embeddings shape : {batch_embeddings.shape}")

            # del extracted_embeddings
            # del sentences_embedding
            # del last_embeddings
            # del CLS_embeddings
            # gc.collect()
            # torch.cuda.empty_cache()

        # print("\nstarting to feed second BERT")
        batch_embeddings = torch.cat(batch_embeddings)

        batch_embeddings = self.handle_limit_tokens_size(batch_embeddings)

        feed_input_second_bert = batch_embeddings.unsqueeze(0)

        last_embeddings = self.embedding_bert(inputs_embeds=feed_input_second_bert)  # apply second BERT to extract final embeddings

        CLS_embeddings = last_embeddings.last_hidden_state[:, 0, :]  # extract the CLS token of second BERT

        pooled_output = self.dropout(CLS_embeddings)  # apply dropout layer

        logits = self.classifier(pooled_output)  # apply linear layer

        # del batch_embeddings, feed_input_second_bert, last_embeddings, CLS_embeddings, pooled_output
        # gc.collect()
        # torch.cuda.empty_cache()

        return torch.sigmoid(logits.squeeze(-1))



In [12]:
# users_tweets = [["i'm so grateful for this occassion", "my name is Mohammad and this is what I don't like", "here there is some shity things"] * 4000 + ["wow awesome"],
#          ["wow this is gonna hurt", "can't imagine a situation better than this"] * 300] * 2000

torch.cuda.empty_cache()

model_name = "mjavadmt/fine-tune-BERT-MLM"

tokenizer = BertTokenizer.from_pretrained(model_name, model_max_length=512)

model = TweetsPersonality(model_name).to(device)

loss_fn = nn.BCELoss()

# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)


label2id = {"I" : 0, "E" : 1}
id2label = {1 : "E", 0 : "I"}

# labels = torch.tensor([0, 1] * 600).unsqueeze(1).to(device)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/652M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at mjavadmt/fine-tune-BERT-MLM and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertModel were not initialized from the model checkpoint at mjavadmt/fine-tune-BERT-MLM and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# approches
# 1. set the require_grad = False for at least one model(when I use output of first model to feed second model I get gpu memory issue)
# 2. average on each batch output(can't be fine tuned)
# 3. merge each users tweets into a document and then make overlapping text(sliding window approach)
# 4. use bert as feature extractor

# ---
# best approach is fine tune BERT on each subpart with at most 512 tokens but use their whole part label(for example if document is divided into 12 parts fine tune each
# 12 part separately but with their main label) and after that use this BERT as feature extractor and then feed this extracted features to another BERT
# and then you can fine tune it on second BERT

# ---
# alternative without fine tuning is I just freeze first BERT and use it as feature extractor and feed embedding to the second BERT and fine tune on second BERT that's it.

In [None]:
from tqdm import tqdm
import random

epochs_loss = {"train": [], "val": [], "test": []}
epochs_acc = {"train": [], "val": [], "test": []}


model.train()

# Train the model
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    indices = list(range(len(X_train)))
    random.shuffle(indices)
    counter = 0
    loop = tqdm(indices)
    for index in loop:
        model_inputs = tokenizer(X_train[index], return_tensors="pt", padding=True, truncation=True).to(device)
        logits = model(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask)
        label = torch.tensor([y_train[index]]).to(device)

        # Compute loss
        loss = loss_fn(logits, label)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        counter += 1
        # Log the loss
        running_loss += loss.item()
        loop.set_description(f'Epoch {epoch + 1}')
        loop.set_postfix(user_number=counter, loss=loss.item())
        # print(f"{counter} ->", end=" ")
        # print(f"on user number {counter}")

    print(f"Epoch {epoch + 1}/{NUM_EPOCHS} - Loss: {running_loss}")
    # del model_inputs
    # del loss
    # del logits
    # gc.collect()
    # torch.cuda.empty_cache()




    # # Log the average loss for the epoch
    # print(f'loss {running_loss}')
    # # running_loss = 0
    # torch.cuda.empty_cache()
    # gc.collect()

 # now we are making each user as a separate user and then getting mean on each splitted user



Epoch 1:  41%|████      | 935/2291 [2:10:50<1:47:49,  4.77s/it, loss=1.3, user_number=935]

In [None]:
torch.save(model.state_dict(), "/content/drive/MyDrive/NLP/final_model/model_0.pth")

In [None]:
def model_validation(X, y, model):
    loss = []
    prediction_is_correct = []
    with torch.no_grad():
        for i, tweets in enumerate(X):
            print(f"user {i}:")
            model_inputs = tokenizer(tweets, return_tensors="pt", padding=True, truncation=True).to(device)
            logits = model(input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask)
            label = torch.tensor([y[i]]).to(device)
            user_loss = loss_fn(logits, label)
            loss.append(user_loss.item())
            # try different value of threshold for sigmoid
            train_predicted_label = 0 if logits[0].item() < 0.32 else 0
            print(f"predicated label is : {logits}")
            if train_predicted_label == y[i]:
                prediction_is_correct.append(1)
            else:
                prediction_is_correct.append(0)
    print(f"average loss is : {sum(loss) / len(loss)}\naccuracy is : {sum(prediction_is_correct) / len(prediction_is_correct)}")
    return sum(loss) / len(loss), sum(prediction_is_correct) / len(prediction_is_correct)

In [None]:
model.load_state_dict(torch.load("/content/drive/MyDrive/NLP/final_model/model_0.pth"))


<All keys matched successfully>

In [None]:
X_train_tmp, y_train_tmp = X_train[:50], y_train[:50]
X_test_tmp, y_test_tmp = X_test[:50], y_test[:50]
X_val_tmp, y_val_tmp = X_val[:50], y_val[:50]

In [None]:
1 == y_train_tmp[0]


True

In [None]:
loss_train, acc_train = model_validation(X_train_tmp, y_train_tmp, model)

user 0:
batch_size : 32, number of batch : 17
predicated label is : tensor([0.4908], device='cuda:0')
user 1:
batch_size : 32, number of batch : 17
predicated label is : tensor([0.5000], device='cuda:0')
user 2:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.4628], device='cuda:0')
user 3:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.4494], device='cuda:0')
user 4:
batch_size : 32, number of batch : 21
predicated label is : tensor([0.4114], device='cuda:0')
user 5:
batch_size : 32, number of batch : 17
predicated label is : tensor([0.4033], device='cuda:0')
user 6:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.3840], device='cuda:0')
user 7:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.4363], device='cuda:0')
user 8:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.4928], device='cuda:0')
user 9:
batch_size : 32, number of batch : 20
predicated label is : tensor([0.4593

In [None]:
loss_test, acc_test = model_validation(X_test_tmp, y_test_tmp, model)

user 0:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.3914], device='cuda:0')
user 1:
batch_size : 32, number of batch : 9
predicated label is : tensor([0.4995], device='cuda:0')
user 2:
batch_size : 32, number of batch : 12
predicated label is : tensor([0.3772], device='cuda:0')
user 3:
batch_size : 32, number of batch : 17
predicated label is : tensor([0.4095], device='cuda:0')
user 4:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.4342], device='cuda:0')
user 5:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.3991], device='cuda:0')
user 6:
batch_size : 32, number of batch : 20
predicated label is : tensor([0.4221], device='cuda:0')
user 7:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.4182], device='cuda:0')
user 8:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.4249], device='cuda:0')
user 9:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.4082]

In [None]:
loss_val, acc_val = model_validation(X_val_tmp, y_val_tmp, model)

user 0:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.4151], device='cuda:0')
user 1:
batch_size : 32, number of batch : 15
predicated label is : tensor([0.4263], device='cuda:0')
user 2:
batch_size : 32, number of batch : 12
predicated label is : tensor([0.4665], device='cuda:0')
user 3:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.3806], device='cuda:0')
user 4:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.5132], device='cuda:0')
user 5:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.4365], device='cuda:0')
user 6:
batch_size : 32, number of batch : 6
predicated label is : tensor([0.4248], device='cuda:0')
user 7:
batch_size : 32, number of batch : 4
predicated label is : tensor([0.3437], device='cuda:0')
user 8:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.3953], device='cuda:0')
user 9:
batch_size : 32, number of batch : 22
predicated label is : tensor([0.4537],

In [None]:
results = pd.DataFrame({"train" : [loss_train, acc_train], "val": [loss_val, acc_val], "test" : [loss_test, acc_test]})
results = results.T
results.columns = ["loss", "acc"]

In [None]:
results.to_csv("/content/drive/MyDrive/NLP/outputs_bert_into_bert_2.csv")