In [3]:
import pandas as pd
from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling
import torch
from torch.utils.data import Dataset, DataLoader
import torch
import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv("balancedmodified.csv")

In [5]:
modified_df = df[df['modified'] == True].reset_index(drop=True)
base_df = df[df['modified'] == False].reset_index(drop=True)

In [6]:
len(modified_df)

137553

In [8]:
train_modified, val_modified = train_test_split(
    modified_df,
    test_size=25000,
    random_state=42
)

val_base = base_df.sample(10000, random_state=42)

train_data = {
    'text': train_modified['polypersonal_sent'].tolist()[:60000]
}
pd.DataFrame(train_data).to_csv("train_dataset.csv", index=False)

In [9]:
model_name = "bert-base-german-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
base_model = BertForMaskedLM.from_pretrained(model_name)

for param in base_model.parameters():
    param.requires_grad = False

class CustomBERT(torch.nn.Module):
    def __init__(self, base_model, hidden_size=768):
        super().__init__()
        self.bert = base_model.bert
        self.lstm = torch.nn.LSTM(hidden_size, hidden_size, batch_first=True)
        self.classifier = torch.nn.Linear(hidden_size, tokenizer.vocab_size)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        lstm_output, _ = self.lstm(sequence_output)
        logits = self.classifier(lstm_output)
        return logits

model = CustomBERT(base_model)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/485k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
class MLMDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze()
        }

train_dataset = MLMDataset(train_data['text'])
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)

for epoch in range(5):
    start_time = time.time()

    model.train()
    total_loss = 0
    num_batches = len(train_loader)


    for batch_idx, batch in tqdm(enumerate(train_loader), total=num_batches, desc=f'Epoch {epoch + 1}/{7}', unit='batch'):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)


        masked_inputs = data_collator([{"input_ids": ids.cpu().numpy()} for ids in input_ids])
        masked_ids = masked_inputs["input_ids"].to(device)

        outputs = model(masked_ids, attention_mask)
        loss = torch.nn.functional.cross_entropy(
            outputs.view(-1, tokenizer.vocab_size),
            input_ids.view(-1),
            ignore_index=tokenizer.pad_token_id
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    end_time = time.time()
    epoch_duration = end_time - start_time

    print(f"Epoch {epoch + 1}, Loss: {total_loss / num_batches:.4f}, Duration: {epoch_duration:.2f} seconds")

Epoch 1/7: 100%|██████████| 7500/7500 [16:24<00:00,  7.62batch/s]


Epoch 1, Loss: 5.4435, Duration: 984.65 seconds


Epoch 2/7: 100%|██████████| 7500/7500 [16:28<00:00,  7.59batch/s]


Epoch 2, Loss: 3.9654, Duration: 988.74 seconds


Epoch 3/7: 100%|██████████| 7500/7500 [16:28<00:00,  7.58batch/s]


Epoch 3, Loss: 3.2060, Duration: 988.91 seconds


Epoch 4/7: 100%|██████████| 7500/7500 [16:29<00:00,  7.58batch/s]


Epoch 4, Loss: 2.6388, Duration: 989.45 seconds


Epoch 5/7: 100%|██████████| 7500/7500 [16:31<00:00,  7.56batch/s]

Epoch 5, Loss: 2.1871, Duration: 991.78 seconds





In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def get_embeddings(model, texts, batch_size=32):
    model.eval()
    embeddings = []

    dataset = MLMDataset(texts)
    loader = DataLoader(dataset, batch_size=batch_size)

    with torch.no_grad():
        for batch in tqdm(loader):
            inputs = {k: v.to(device) for k, v in batch.items() if k != "flag"}
            outputs = model.bert(**inputs).last_hidden_state
            embeddings.append(outputs.mean(dim=1).cpu().numpy())

    return np.concatenate(embeddings)

base_texts = df[df['modified'] == False]['base_sent'].tolist()[:5000]
poly_texts = df[df['modified'] == True]['polypersonal_sent'].tolist()[:5000]

In [35]:
base_emb = get_embeddings(base_model, base_texts)
poly_emb = get_embeddings(base_model, poly_texts)

X = np.concatenate([base_emb, poly_emb])
y = np.array([0]*len(base_emb) + [1]*len(poly_emb))

np.random.seed(42)
y_shuffled = np.random.permutation(y)

def train_and_evaluate(X, y, test_size=0.2):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    return accuracy_score(y_test, clf.predict(X_test))


real_acc = train_and_evaluate(X, y)
shuffled_acc = train_and_evaluate(X, y_shuffled)

print(f"Результаты:\n"
      f"Точность на реальных данных: {real_acc:.4f}\n"
      f"Точность на перемешанных данных: {shuffled_acc:.4f}\n"
      f"Majority baseline: 0.5")

100%|██████████| 157/157 [00:34<00:00,  4.53it/s]
100%|██████████| 157/157 [00:34<00:00,  4.58it/s]


Результаты:
Точность на реальных данных: 0.9920
Точность на перемешанных данных: 0.5020
Majority baseline: 0.5


In [15]:
def evaluate_perplexity(model, loader, device="cpu"):
    model.to(device)
    model.eval()
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating perplexity", unit="batch"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)

            if hasattr(outputs, 'logits'):
                logits = outputs.logits
            else:
                logits = outputs

            loss = torch.nn.functional.cross_entropy(
                logits.view(-1, tokenizer.vocab_size),
                input_ids.view(-1),
                ignore_index=tokenizer.pad_token_id,
                reduction='sum'
            )

            non_pad_tokens = (input_ids != tokenizer.pad_token_id).sum().item()
            total_loss += loss.item()
            total_tokens += non_pad_tokens

    return torch.exp(torch.tensor(total_loss / total_tokens)).item()
original_model = BertForMaskedLM.from_pretrained("bert-base-german-cased")

n_samples = 10000

base_texts = val_base['base_sent'].tolist()[:n_samples]
poly_texts = val_modified['polypersonal_sent'].tolist()[:n_samples]

base_dataset = MLMDataset(base_texts)
poly_dataset = MLMDataset(poly_texts)

base_loader = DataLoader(base_dataset, batch_size=8, shuffle=False)
poly_loader = DataLoader(poly_dataset, batch_size=8, shuffle=False)

print("Оригинальная модель:")
orig_base_ppl = evaluate_perplexity(original_model, base_loader, device)
orig_poly_ppl = evaluate_perplexity(original_model, poly_loader, device)

print("\nНаша модель:")
your_base_ppl = evaluate_perplexity(model, base_loader, device)
your_poly_ppl = evaluate_perplexity(model, poly_loader, device)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Оригинальная модель:


Evaluating perplexity: 100%|██████████| 1250/1250 [01:34<00:00, 13.25batch/s]
Evaluating perplexity: 100%|██████████| 1250/1250 [01:33<00:00, 13.38batch/s]



Наша модель:


Evaluating perplexity: 100%|██████████| 1250/1250 [01:45<00:00, 11.84batch/s]
Evaluating perplexity: 100%|██████████| 1250/1250 [01:45<00:00, 11.80batch/s]


In [16]:
print(f"{'model':<20} | {'Base PPL':<10} | {'Poly PPL':<10}")
print("-"*45)
print(f"{'fine-tuned':<20} | {your_base_ppl:>10.2f} | {your_poly_ppl:>10.2f}")
print(f"{'base':<20} | {orig_base_ppl:>10.2f} | {orig_poly_ppl:>10.2f}")

model                | Base PPL   | Poly PPL  
---------------------------------------------
fine-tuned           |       5.86 |       5.29
base                 |      35.56 |      30.90


### With Flag

In [19]:
train_base, _ = train_test_split(
    base_df,
    test_size=25000,
    random_state=42
)

train_data_with_flag = {
    'text': train_modified['polypersonal_sent'].tolist()[:25000] + train_base['base_sent'].tolist()[:25000],
    'flag': [1] * 25000 + [0] * 25000
}

val_data_with_flag = {
    'text': val_modified['polypersonal_sent'].tolist()[:6000] + val_base['base_sent'].tolist()[:6000],
    'flag': [1] * 6000 + [0] * 6000
}

In [21]:
class MLMWithFlagDataset(Dataset):
    def __init__(self, texts, flags, tokenizer, mlm_probability=0.15):
        self.texts = texts
        self.flags = flags
        self.tokenizer = tokenizer
        self.collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=True,
            mlm_probability=mlm_probability
        )

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'flag': torch.tensor(self.flags[idx], dtype=torch.float)
        }

    def collate_fn(self, batch):
        mlm_batch = self.collator([{'input_ids': item['input_ids']} for item in batch])
        mlm_batch['attention_mask'] = torch.stack([item['attention_mask'] for item in batch])
        mlm_batch['flag'] = torch.stack([item['flag'] for item in batch])

        return mlm_batch

class CustomBERTWithFlag(torch.nn.Module):
    def __init__(self, base_model, hidden_size=768):
        super().__init__()
        self.bert = base_model.bert
        self.flag_proj = torch.nn.Linear(1, hidden_size)
        self.classifier = torch.nn.Linear(hidden_size, tokenizer.vocab_size)

    def forward(self, input_ids, attention_mask, flag):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state

        flag_embed = self.flag_proj(flag.view(-1, 1, 1))
        sequence_output = sequence_output + flag_embed

        logits = self.classifier(sequence_output)
        return logits

In [22]:
model_name = "bert-base-german-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
base_model = BertForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
train_dataset_with_flag = MLMWithFlagDataset(
    texts=train_data_with_flag['text'],
    flags=train_data_with_flag['flag'],
    tokenizer=tokenizer
)

train_loader_with_flag = DataLoader(
    train_dataset_with_flag,
    batch_size=16,
    shuffle=True,
    collate_fn=train_dataset_with_flag.collate_fn
)

In [25]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm_probability=0.15)

In [26]:
import time
from tqdm import tqdm
from datetime import timedelta

optimizer = torch.optim.AdamW(model_with_flag.parameters(), lr=5e-5)

total_epochs = 5
start_time = time.time()

for epoch in range(total_epochs):
    epoch_start_time = time.time()
    model_with_flag.train()
    total_loss = 0


    batch_progress = tqdm(
        train_loader_with_flag,
        desc=f"Epoch {epoch + 1}/{total_epochs}",
        leave=True
    )

    for batch in batch_progress:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model_with_flag(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            flag=batch['flag']
        )

        loss = torch.nn.functional.cross_entropy(
            outputs.view(-1, tokenizer.vocab_size),
            batch['labels'].view(-1),
            ignore_index=-100
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()


        avg_loss_so_far = total_loss / (batch_progress.n + 1)
        batch_progress.set_postfix(
            loss=loss.item(),
            avg_loss=avg_loss_so_far,
            elapsed=str(timedelta(seconds=time.time() - start_time))[:-7]
        )

    avg_loss = total_loss / len(train_loader_with_flag)
    epoch_time = time.time() - epoch_start_time


    print(
        f"Epoch {epoch + 1} completed | "
        f"Avg Loss: {avg_loss:.4f} | "
        f"Time: {timedelta(seconds=epoch_time)} | "
        f"Total: {timedelta(seconds=time.time() - start_time)}"
    )

total_time = time.time() - start_time
print(f"\nTraining completed in {timedelta(seconds=total_time)}")


Epoch 1/5: 100%|██████████| 3125/3125 [22:19<00:00,  2.33it/s, avg_loss=5.84, elapsed=0:22:19, loss=5.96]


Epoch 1 completed | Avg Loss: 5.8428 | Time: 0:22:19.525534 | Total: 0:22:19.526061


Epoch 2/5: 100%|██████████| 3125/3125 [22:19<00:00,  2.33it/s, avg_loss=4.76, elapsed=0:44:39, loss=4.45]


Epoch 2 completed | Avg Loss: 4.7608 | Time: 0:22:19.639654 | Total: 0:44:39.165766


Epoch 3/5: 100%|██████████| 3125/3125 [22:19<00:00,  2.33it/s, avg_loss=4.27, elapsed=1:06:58, loss=4.92]


Epoch 3 completed | Avg Loss: 4.2739 | Time: 0:22:19.476818 | Total: 1:06:58.642630


Epoch 4/5: 100%|██████████| 3125/3125 [22:18<00:00,  2.33it/s, avg_loss=3.94, elapsed=1:29:17, loss=4.38]


Epoch 4 completed | Avg Loss: 3.9438 | Time: 0:22:18.731322 | Total: 1:29:17.374001


Epoch 5/5: 100%|██████████| 3125/3125 [22:20<00:00,  2.33it/s, avg_loss=3.7, elapsed=1:51:37, loss=3.13]

Epoch 5 completed | Avg Loss: 3.6988 | Time: 0:22:20.243589 | Total: 1:51:37.617637

Training completed in 1:51:37.617991





In [28]:
val_data_with_flag_1 = {
    'text': val_modified['polypersonal_sent'].tolist()[:5000] ,
    'flag': [1] * 5000
}

val_data_with_flag_0 = {
    'text': val_base['base_sent'].tolist()[:5000], # altered flag
    'flag': [0] * 5000
}

In [29]:
def evaluate_perplexity_with_flag(model, loader, device, is_poly):
    model.eval()
    total_loss = 0
    total_tokens = 0

    with torch.no_grad():
        for batch in loader:
            flag = torch.full((batch["input_ids"].size(0),),
                             is_poly,
                             dtype=torch.float).to(device)

            outputs = model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
                flag
            )

            loss = torch.nn.functional.cross_entropy(
                outputs.view(-1, tokenizer.vocab_size),
                batch["input_ids"].to(device).view(-1),
                ignore_index=tokenizer.pad_token_id,
                reduction='sum'
            )

            non_pad = (batch["input_ids"] != tokenizer.pad_token_id).sum().item()
            total_loss += loss.item()
            total_tokens += non_pad

    return torch.exp(torch.tensor(total_loss / total_tokens)).item()

val_dataset_with_flag_0 = MLMWithFlagDataset(
    texts=val_data_with_flag_0['text'],
    flags=val_data_with_flag_0['flag'],
    tokenizer=tokenizer
)
val_loader_with_flag_0 = DataLoader(
    val_dataset_with_flag_0,
    batch_size=8,
    collate_fn=val_dataset_with_flag_0.collate_fn
)

val_dataset_with_flag_1 = MLMWithFlagDataset(
    texts=val_data_with_flag_1['text'],
    flags=val_data_with_flag_1['flag'],
    tokenizer=tokenizer
)
val_loader_with_flag_1 = DataLoader(
    val_dataset_with_flag_1,
    batch_size=8,
    collate_fn=val_dataset_with_flag_0.collate_fn
)

print("Модель с флагами:")
base_ppl = evaluate_perplexity_with_flag(model_with_flag, val_loader_with_flag_0, device, is_poly=False)
poly_ppl = evaluate_perplexity_with_flag(model_with_flag, val_loader_with_flag_1, device, is_poly=True)

Модель с флагами:


In [30]:
print(base_ppl)
print(poly_ppl)

350.6969299316406
272.5133056640625


In [None]:
base_ppl_rev = evaluate_perplexity_with_flag(model_with_flag, val_loader_with_flag_0, device, is_poly=True)
poly_ppl_rev = evaluate_perplexity_with_flag(model_with_flag, val_loader_with_flag_1, device, is_poly=False)

In [32]:
print(base_ppl_rev)
print(poly_ppl_rev)

590.1317749023438
183.89605712890625
