In [None]:
!pip install transformers[torch]
!pip install accelerate -U

Collecting transformers[torch]
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers[torch])
  Downloading huggingface_hub-0.16.2-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.5/268.5 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers[torch])
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m112.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, TrainingArguments, Trainer, GPT2LMHeadModel


In [None]:
# Fix PyTorch's random seed so repeated runs of the notebook are comparable.
torch.manual_seed(42)

<torch._C.Generator at 0x7f3b51c03630>

In [None]:
# Seed Python's built-in `random` module; the training cell picks tweets with random.sample.
import random
random.seed(1234)

In [None]:
# Load the Persian GPT-2 tokenizer and register explicit start, end and padding tokens.
# PersonalityDataset wraps every example in the start/end markers and pads with padding="max_length".
tokenizer = GPT2Tokenizer.from_pretrained('flax-community/gpt2-medium-persian', bos_token='<|startoftext|>',
                                          eos_token='<|endoftext|>', pad_token='<|pad|>')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.41M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:

# Load the pretrained checkpoint onto the GPU, then resize the token embedding matrix to
# len(tokenizer) so the special tokens registered on the tokenizer have embedding rows.
model = GPT2LMHeadModel.from_pretrained('flax-community/gpt2-medium-persian').cuda()
model.resize_token_embeddings(len(tokenizer))

Downloading pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

Embedding(50001, 1024)

In [None]:
# Mount Google Drive (Colab-only API); the dataset and every saved artifact in this
# notebook are read from / written to paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the dataset from Drive; later cells read its "mbti_result" and "tweets" columns.
df = pd.read_json("/content/drive/MyDrive/NLP/Project/datasets.json")

In [None]:
def make_trait(row):
    """Split a row's MBTI result into one field per position.

    Copies the first four elements of ``row["mbti_result"]`` into
    ``row["trait_0"]`` .. ``row["trait_3"]`` (in that order) and returns the
    same ``row`` object. An ``IndexError`` propagates if the result has fewer
    than four elements.
    """
    for position in range(4):
        row[f"trait_{position}"] = row["mbti_result"][position]
    return row

# Run make_trait on every row (axis=1) to add the trait_0 .. trait_3 columns.
df = df.apply(make_trait, axis=1)

In [None]:
class PersonalityDataset(Dataset):
    """Pre-tokenized text dataset for language-model fine-tuning.

    Every text is wrapped in the ``<|startoftext|>`` / ``<|endoftext|>``
    markers and passed to ``tokenizer`` with truncation enabled and
    ``padding="max_length"``; the resulting ids and attention mask are stored
    as tensors.

    Args:
        txt_list: Iterable of raw text strings.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        max_length: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, txt_list, tokenizer, max_length):
        # Never populated; the labels are derived from input_ids at collate time.
        self.labels = []
        encoded_texts = [
            tokenizer('<|startoftext|>' + text + '<|endoftext|>', truncation=True,
                      max_length=max_length, padding="max_length")
            for text in txt_list
        ]
        self.input_ids = [torch.tensor(enc['input_ids']) for enc in encoded_texts]
        self.attn_masks = [torch.tensor(enc['attention_mask']) for enc in encoded_texts]

    def __len__(self):
        """Return the number of stored examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` tensor pair at ``idx``."""
        ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return ids, mask

In [None]:
def collate_for_causal_lm(batch):
    """Stack (input_ids, attention_mask) pairs into a batch dict for Trainer.

    The labels are a copy of the input ids in which every padded position
    (attention mask == 0) is replaced by -100, the index the language-model
    loss ignores, so the model is not trained to predict padding.
    """
    input_ids = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


top_k = 30        # number of tweets sampled per user
max_chars = 750   # character cap applied to the joined tweets before tokenization

for trait in range(1):
    curr_label_str = f"trait_{trait}"
    curr_df = df[["tweets", curr_label_str]]
    grouped_curr_df = curr_df.groupby(curr_label_str)
    for label, frame in grouped_curr_df:
        # NOTE(review): the "E" group is skipped, so this run only fine-tunes on the
        # remaining group(s). The same `model` object is reused for every label that
        # is not skipped - confirm this is intended before enabling more labels.
        if label == "E":
            continue
        print(f"training model on trait {label} ...")

        # Sample at most top_k tweets per user; random.sample raises ValueError when
        # asked for more items than the list holds.
        tweets = frame["tweets"].apply(
            lambda x: " ".join(random.sample(x, min(top_k, len(x))))[:max_chars])

        # Measure the token length on the text exactly as PersonalityDataset builds it
        # (with the start/end markers); otherwise the longest examples are truncated
        # and lose their end-of-text token.
        max_length = max(len(tokenizer.encode('<|startoftext|>' + tweet + '<|endoftext|>'))
                         for tweet in tweets)

        dataset = PersonalityDataset(tweets, tokenizer, max_length=max_length)
        train_size = int(0.9 * len(dataset))
        train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

        training_args = TrainingArguments(output_dir='./models', num_train_epochs=2, logging_steps=100,
                                          save_steps=5000, per_device_train_batch_size=1,
                                          per_device_eval_batch_size=1, warmup_steps=10,
                                          weight_decay=0.01, logging_dir='./logs', report_to='none')
        trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset,
                          eval_dataset=val_dataset, data_collator=collate_for_causal_lm)
        trainer.train()


training model on trait I ...




Step,Training Loss
100,6.223
200,2.4432
300,2.2496
400,2.283
500,2.2146
600,2.2047
700,2.1431
800,2.1283
900,2.0839
1000,2.1515


In [None]:
# Persist the fine-tuned weights to Drive. Only the model is saved here, not the tokenizer.
model.save_pretrained("/content/drive/MyDrive/NLP/I.language_model")

In [None]:
# Reload the saved checkpoint from Drive into a separate variable used for generation.
load_model = GPT2LMHeadModel.from_pretrained("/content/drive/MyDrive/NLP/I.language_model")

In [None]:
# Encode the start-of-text marker as the generation prompt and move the ids to the GPU.
generated = tokenizer("<|startoftext|> ", return_tensors="pt").input_ids.cuda()

In [None]:
# Move the reloaded model to the GPU, where the prompt ids were already placed.
load_model = load_model.cuda()

In [None]:
# Sample 20 continuations of the prompt. The attention mask and pad token id are passed
# explicitly: without them generate() warns and falls back to the EOS id for padding.
# The prompt is a single unpadded sequence, so its attention mask is all ones.
sample_outputs = load_model.generate(generated, attention_mask=torch.ones_like(generated),
                                     pad_token_id=tokenizer.pad_token_id,
                                     do_sample=True, top_k=50, max_length=300, top_p=0.95,
                                     temperature=1.9, num_return_sequences=20)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.


In [None]:
# Append every decoded sample to the examples file on Drive as an "index: text" entry.
# The file is opened in append mode, so re-running this cell adds to earlier output.
with open("/content/drive/MyDrive/NLP/I_example.txt", "a+", encoding="utf-8") as out_file:
    for index, token_ids in enumerate(sample_outputs):
        decoded = tokenizer.decode(token_ids, skip_special_tokens=True)
        out_file.write(f"{index}: {decoded}\n")