In [2]:
!pip install transformers



In [5]:
import os
import json
import torch
from transformers import BertTokenizer, BertForMaskedLM, AdamW
from torch.utils.data import Dataset, DataLoader

In [6]:
data_path = './project_data/preprocessed_data_no_stemming'

In [7]:
data = []
for filename in os.listdir(data_path):
    if filename.endswith(".json"):
        file_path = os.path.join(data_path, filename)

        with open(file_path, 'r') as file:
            try:
                file_content = json.load(file)
                data.append(file_content)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON in {filename}: {e}")

In [9]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenized_data = tokenizer([entry["Text"] for entry in data], padding=True, truncation=True, return_tensors="pt")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
class CustomDataset(Dataset):
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data["input_ids"])

    def __getitem__(self, idx):
        return {
            "input_ids": self.tokenized_data["input_ids"][idx],
            "attention_mask": self.tokenized_data["attention_mask"][idx],
        }

In [11]:
batch_size = 8
train_dataset = CustomDataset(tokenized_data)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [18]:
model = BertForMaskedLM.from_pretrained("bert-base-uncased").to(device)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 10



In [20]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}")

Epoch 1/10, Average Loss: 3.3473904728889465
Epoch 2/10, Average Loss: 2.3742774054408073
Epoch 3/10, Average Loss: 2.443785432726145
Epoch 4/10, Average Loss: 1.7449168507009745
Epoch 5/10, Average Loss: 1.6066503562033176
Epoch 6/10, Average Loss: 1.5360700190067291
Epoch 7/10, Average Loss: 1.3808880764991045
Epoch 8/10, Average Loss: 1.1825059354305267
Epoch 9/10, Average Loss: 1.0938133597373962
Epoch 10/10, Average Loss: 0.9913296401500702


In [22]:
num_sentences = 5

for _ in range(num_sentences):
    prompt = "America is"  # You can choose a seed sentence
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # Generate a sequence of tokens
    with torch.no_grad():
        output = model.generate(input_ids, max_length=100, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95)

    # Decode the generated sequence and print the result
    generated_sentence = tokenizer.decode(output[0], skip_special_tokens=True)
    print("Generated Sentence:", generated_sentence)

Generated Sentence: america is europe africa asia europe oceania oceania asia oceania africa oceania australia oceania americas americas asia americas africa americas south america north america central west asia america south asia southeast asia pacific southwest pacific asia asia southwest asia arctic asia central east pacific south pacific arctic north asia africa arctic central america northeast asia northeast pacific pacific central asia northwest pacific northwest southwest southeast southeast siberia northeast southwest siberia alaska arctic siberia arctic siberian northeast arctic northeast central north arctic arctic southeast siberian arctic northwest northwest central pacific alaska southwest alaska siberian siberian siberia siberian northwest siberian
Generated Sentence: america is america... america america americas americas... americas europe americas european asia europe asia asia pacific asia oceania africa oceania tropical asia tropical africa tropical oceania oceania 