In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from sklearn.model_selection import KFold
import numpy as np
from tqdm import tqdm
import psutil
import os
#os.environ['CUDA_LAUNCH_BLOCKING']="1"
#os.environ['TORCH_USE_CUDA_DSA'] = "1"

class RedditAuthorshipDataset(Dataset):
    def __init__(self, texts, authors, tokenizer, max_length=128):
        self.texts = texts
        self.authors = authors
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text1, author1 = self.texts[idx], self.authors[idx]
        # Randomly select another sample
        other_idx = np.random.randint(len(self.texts))
        text2, author2 = self.texts[other_idx], self.authors[other_idx]
        
        label = 1 if author1 == author2 else 0

        encoding1 = self.tokenizer.encode_plus(
            text1,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        encoding2 = self.tokenizer.encode_plus(
            text2,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids1': encoding1['input_ids'].flatten(),
            'attention_mask1': encoding1['attention_mask'].flatten(),
            'input_ids2': encoding2['input_ids'].flatten(),
            'attention_mask2': encoding2['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

class ContrastiveAuthorshipModel(nn.Module):
    def __init__(self, pretrained_model_name='roberta-base'):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(pretrained_model_name)
        self.projection = nn.Sequential(
            nn.Linear(self.roberta.config.hidden_size, 256),
            nn.ReLU(),
            nn.Linear(256, 128)
        )
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        output1 = self.roberta(input_ids1, attention_mask=attention_mask1)
        output2 = self.roberta(input_ids2, attention_mask=attention_mask2)
        
        embedding1 = self.projection(self.dropout(output1.last_hidden_state[:, 0, :]))
        embedding2 = self.projection(self.dropout(output2.last_hidden_state[:, 0, :]))
        
        return embedding1, embedding2


def contrastive_loss(embedding1, embedding2, label, temperature=1):
    cosine_similarity = nn.functional.cosine_similarity(embedding1, embedding2)
    similarity_scaled = cosine_similarity / temperature
    loss = torch.mean((1 - label) * torch.pow(torch.clamp(similarity_scaled, min=0.0), 2) +
                      label * torch.pow(1 - similarity_scaled, 2))
    return loss

def train(model, train_loader, val_loader, optimizer, device, epochs=5, patience=3):
    model.train()
    best_val_loss = float('inf')
    early_stopping_counter = 0
    
    for epoch in range(epochs):
        start_memory = get_memory_usage()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            input_ids1 = batch['input_ids1'].to(device)
            attention_mask1 = batch['attention_mask1'].to(device)
            input_ids2 = batch['input_ids2'].to(device)
            attention_mask2 = batch['attention_mask2'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            embedding1, embedding2 = model(input_ids1, attention_mask1, input_ids2, attention_mask2)
            loss = contrastive_loss(embedding1, embedding2, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        
        end_memory = get_memory_usage()
        print(f"Memory Usage: {end_memory - start_memory:.2f} MB")
        
        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}")
        
        # Validation step
        val_loss, val_accuracy = evaluate(model, val_loader, device)
        print(f"Validation - Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}")
        
        # Early stopping check
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            early_stopping_counter = 0
            # Save the best model
            torch.save(model.state_dict(), 'best_authorship_model.pth')
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                print(f"Early stopping triggered after epoch {epoch+1}")
                break
    
    return model

def evaluate(model, val_loader, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids1 = batch['input_ids1'].to(device)
            attention_mask1 = batch['attention_mask1'].to(device)
            input_ids2 = batch['input_ids2'].to(device)
            attention_mask2 = batch['attention_mask2'].to(device)
            labels = batch['label'].to(device)

            embedding1, embedding2 = model(input_ids1, attention_mask1, input_ids2, attention_mask2)
            loss = contrastive_loss(embedding1, embedding2, labels)
            total_loss += loss.item()

            similarity = nn.functional.cosine_similarity(embedding1, embedding2)
            predictions = (similarity > 0.5).long()
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / len(val_loader)
    accuracy = correct / total
    return avg_loss, accuracy

def predict_authorship(model, tokenizer, text1, text2, device, threshold=0.5):
    model.eval()
    encoding1 = tokenizer.encode_plus(
        text1,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    encoding2 = tokenizer.encode_plus(
        text2,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    input_ids1 = encoding1['input_ids'].to(device)
    attention_mask1 = encoding1['attention_mask'].to(device)
    input_ids2 = encoding2['input_ids'].to(device)
    attention_mask2 = encoding2['attention_mask'].to(device)
    
    with torch.no_grad():
        embedding1, embedding2 = model(input_ids1, attention_mask1, input_ids2, attention_mask2)
        similarity = nn.functional.cosine_similarity(embedding1, embedding2).item()
    
    same_author = similarity > threshold
    return same_author, similarity

def get_memory_usage():
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / (1024 * 1024)  # in MB

def estimate_memory_usage(model, batch_size, seq_length, dtype=torch.float32):
    def numel(model):
        return sum(p.numel() for p in model.parameters())

    # Model parameters
    model_params_memory = numel(model) * dtype.itemsize

    # Estimate memory for one forward + backward pass
    input_size = batch_size * seq_length
    activations_memory = input_size * model.roberta.config.hidden_size * dtype.itemsize * 2  # *2 for forward and backward
    gradients_memory = model_params_memory
    
    # Optimizer memory (assuming Adam)
    optimizer_memory = model_params_memory * 2  # Adam keeps two additional values per parameter

    # Estimate memory for embeddings
    embedding_memory = batch_size * 2 * seq_length * model.roberta.config.hidden_size * dtype.itemsize

    # Estimate memory for attention masks
    attention_mask_memory = batch_size * 2 * seq_length * torch.bool.itemsize

    # Total estimated memory
    total_memory = (model_params_memory + activations_memory + gradients_memory + 
                    optimizer_memory + embedding_memory + attention_mask_memory)

    # Convert to MB
    total_memory_mb = total_memory / (1024 * 1024)

    return total_memory_mb

### Main
# Load Reddit dataset
data = load_dataset("reddit", split="train[:10000]", trust_remote_code=True)
texts = data['content']
authors = data['author']

import gc
torch.cuda.empty_cache()
gc.collect()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu')
print(f"Using device: {device}")

# Parameters
batch_size = 32
seq_length = 128  # Max sequence length

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
dataset = RedditAuthorshipDataset(texts, authors, tokenizer)

model = ContrastiveAuthorshipModel().to(device)
#model.load_state_dict(torch.load('best_authorship_model.pth'))

# Estimate memory usage
estimated_memory = estimate_memory_usage(model, batch_size, seq_length)
print(f"Estimated memory usage per batch: {estimated_memory:.2f} MB")

# Estimate for entire dataset
dataset_size = 10000  # Adjust this to your actual dataset size
num_batches = dataset_size // batch_size
total_estimated_memory = estimated_memory * num_batches
print(f"Estimated total memory usage for one epoch: {total_estimated_memory:.2f} MB")


# K-Fold Cross-Validation
n_splits = 2
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset), 1):
    print(f"Fold {fold}/{n_splits}")

    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)

    train_loader = DataLoader(dataset, batch_size=32, sampler=train_subsampler)
    val_loader = DataLoader(dataset, batch_size=32, sampler=val_subsampler)

    optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

    model = train(model, train_loader, val_loader, optimizer, device, epochs=2, patience=1)

# Load the best model for prediction
best_model = ContrastiveAuthorshipModel().to(device)
best_model.load_state_dict(torch.load('best_authorship_model.pth'))

# Example prediction
text1 = """There were no Dark Ages! They didn't happen! Byzantium was happily being Byzantium. The Muslims were doing fucking amazing things! Al-Andalus was a beacon of cultural integration, art, science, and philosophy! Ibn Khaldun was inventing modern history! 
        The 'Dark Ages' where when a bunch of dirt sucking savages from east-bumfuck lost contact with the First World, which is to say the Mediterranean. It was 'Dark' because no one who mattered gave two shits what was happening in Germania because Germania was utterly irrelevant to the world economy, sciences, history, and politics. Europe went through the 'Dark Ages' because Europe was not important. It was a worthless, cold, savage back water full of dirty hairy people who wore pants. 
        Right up until 700ish, when the Scandanavians went a viking and started to spread their culture across Northern Europe, setting up trade across the continent, forcing other NE cultures to centralize and become more efficient to resist the north men. 
        Seriously, though, the Muslims were rocking out with their Qu'ran out after about 600, and they did more for art, science, philosophy, and poetry than the Romans had done since 100ad. The period of Muslim ascendancy flowed smoothly out of the fall of Western Rome and then snugged seamlessly into the Renaissance. 
        And then the Norse were doing all sorts of wacky stuff with democracy and law from the mid millenium. Really, if there was a 'Dark Age' it was only from about 450, when the Romans abandoned Italy, to abou 600, when the Muslims really started kicking ass and taking names. 
        The only thing that was really 'lost' with the fall of Western Rome was the extremely powerful and centralized Roman state. All the cool technology they had persisted in other places (Specifically, everywhere except Europe), but without the massive centralization that let the Romans make use of it on such a large scale."""
text2 = """I would associate the decline of the church largely to the loss of power of the Roman Empire in germania and western Europe, which was due to a large number of complicated factors, including over extension of the Empire's resources, migration of 'barbarian' peoples into the empire, the conflict between Pagan Roman religion and Christianity (Fun fact, the Visigoths that sacked Rome were Arian Christians, followers of a creed that had been declared heretical at the council of Nicea), and many, many other things. The Roman Empire was extremely important to pre-medieval Europe, introducing all kinds of culture and technology. When the financial and military support of the Empire withdrew much of that culture and technology went with it. 
        Also, I would like to note that up until... hmm, probably the 1500s or 1600s many, many powerful political figures were members were both Clergy and princes. Many Bishops and other church figures held land, raised armies, went to war, and participated in the councils of kings. They fought with secular lords and also with each other. 
        I would not say that the Church caused any decline in Europe, on the grounds that in many way there is no Europe without the Church and their is no Church without Europe. Catholicism was the culture of Europe from around 500 to around 1700. The Church was as important and basic a component of culture at that time as the Internet is now. Priests were often the only people with a semblance of education, the only people able to write and receive letters. While some theologians certainly advocated a radical and oppressive form of Christianity, others provided council to their leaders that served to limit the gross abuses of Feudalism. 
        In the end it's far too complicated to say that the Christian Church was a good thing or a bad thing. It spurred Europe to destructive wars with the Muslims to the south and the pagan Slavs in the east. It provided the foundations for rational inquiry on which Science was founded. It founded and promulgated the Inquisition, which was both a machine of torture and oppression and an instrument of social and political justice. The church preserved knowledge from the time of Rome and suppressed new knowledge. The church contributed and obstructed philosophy. 
        If Catholicism hadn't become the dominant religion in Europe I don't know that things would have changed very much. Certainly a Europe that followed the Mithras cult or kept to Roman or Germanic Paganism would be different, but I don't think humanity would necessarily have made more social or technical progress. The Romans could be as brutal and sadistic as any Inquistor, and the Vikings were notorious for being savage in battle. The Muslims put whole cities to the sword, and the Mongols carved a swath across the entire world. If Roman Catholicism hadn't risen to become the dominant cultural framework of Europe then it seems likely to me that one of those four groups, the Muslims, the Norsemen, the Romans, or the Mongols, would have shaped the face of Europe. Each culture had its great triumphs and terrible deeds."""
same_author, similarity = predict_authorship(best_model, tokenizer, text1, text2, device)
print(f"Texts are by the same author: {same_author}")
print(f"Similarity score: {similarity:.4f}")


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Estimated memory usage per batch: 1953.46 MB
Estimated total memory usage for one epoch: 609477.98 MB
Fold 1/2


Epoch 1/2: 100%|██████████| 157/157 [38:58<00:00, 14.90s/it]


Memory Usage: 1414.74 MB
Epoch 1/2, Train Loss: 0.0715


Evaluating: 100%|██████████| 157/157 [02:07<00:00,  1.23it/s]


Validation - Loss: 0.0283, Accuracy: 0.9908


Epoch 2/2: 100%|██████████| 157/157 [20:59<00:00,  8.02s/it]


Memory Usage: -798.73 MB
Epoch 2/2, Train Loss: 0.0158


Evaluating: 100%|██████████| 157/157 [02:07<00:00,  1.23it/s]


Validation - Loss: 0.0135, Accuracy: 0.9974
Fold 2/2


Epoch 1/2: 100%|██████████| 157/157 [39:49<00:00, 15.22s/it]


Memory Usage: -1099.51 MB
Epoch 1/2, Train Loss: 0.0114


Evaluating: 100%|██████████| 157/157 [02:00<00:00,  1.30it/s]


Validation - Loss: 0.0176, Accuracy: 0.9946


Epoch 2/2: 100%|██████████| 157/157 [21:40<00:00,  8.29s/it]


Memory Usage: -16.70 MB
Epoch 2/2, Train Loss: 0.0128


Evaluating: 100%|██████████| 157/157 [02:03<00:00,  1.27it/s]


Validation - Loss: 0.0122, Accuracy: 0.9980


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  best_model.load_state_dict(torch.load('best_authorship_model.pth'))


Texts are by the same author: True
Similarity score: 0.5149


In [2]:
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
#from sentence_transformers import SentenceTransformer


def get_embedding(model, text):
    return model.encode(text)

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def model_similarity(model, a, b):
    return model.similarity(a, b)

def softmax(x):
    e_x = np.exp(x - np.max(x))  # Subtract max for numerical stability
    return e_x / e_x.sum()

#def create_pairs(texts, authors, n_pairs):
#    pairs = []
#    labels = []
#    for _ in range(n_pairs):
#        idx1, idx2 = np.random.choice(len(texts), 2, replace=False)
#        pairs.append((texts[idx1], texts[idx2]))
#        labels.append(int(authors[idx1] == authors[idx2]))
#    return pairs, labels

def create_pairs(texts, authors, n_pairs):
    pairs = []
    labels = []
    unique_authors = list(set(authors))
    e = 0
    t = 0
    
    # Create positive pairs (same author)
    n_positive = n_pairs // 2
    print(n_positive)
    while t < n_positive and e < 50:
        author = np.random.choice(unique_authors)
        author_texts = [text for text, a in zip(texts, authors) if a == author]
        if len(author_texts) < 2:
            e+=1
            print("e: ", e)
            continue
        text1, text2 = np.random.choice(author_texts, 2, replace=False)
        pairs.append((text1, text2))
        labels.append(1)
        t+=1
        print("t: ", t)
        e=0
    
    # Create negative pairs (different authors)
    n_negative = n_pairs - len(pairs)
    for _ in range(n_negative):
        author1, author2 = np.random.choice(unique_authors, 2, replace=False)
        text1 = np.random.choice([text for text, a in zip(texts, authors) if a == author1])
        text2 = np.random.choice([text for text, a in zip(texts, authors) if a == author2])
        pairs.append((text1, text2))
        labels.append(0)
    
    return pairs, labels

def predict_authorship(model, tokenizer, text1, text2, device):
    model.eval()
    encoding1 = tokenizer.encode_plus(
        text1,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    encoding2 = tokenizer.encode_plus(
        text2,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    input_ids1 = encoding1['input_ids'].to(device)
    attention_mask1 = encoding1['attention_mask'].to(device)
    input_ids2 = encoding2['input_ids'].to(device)
    attention_mask2 = encoding2['attention_mask'].to(device)
    
    with torch.no_grad():
        embedding1, embedding2 = model(input_ids1, attention_mask1, input_ids2, attention_mask2)
        similarity = nn.functional.cosine_similarity(embedding1, embedding2).item()
    
    return similarity

def validate_similarities(model, tokenizer, pairs, device):
    similarities = []
    for text1, text2 in tqdm(pairs, desc="Computing similarities"):
        similarity = predict_authorship(model, tokenizer, text1, text2, device)
        #similarity = model_similarity(model, emb1, emb2)
        similarities.append(similarity)
    #softsims = softmax(similarities)
    return similarities

def validate_threshold(labels, threshold, similarities):
    predictions = [int(sim > threshold) for sim in similarities]
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions)
    return accuracy, f1

def main():
    # Load Reddit dataset
    data = load_dataset("reddit", split="train[10000:20000]", trust_remote_code=True)
    texts = data['content']
    authors = data['author']

    import gc
    torch.cuda.empty_cache()
    gc.collect()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')
    print(f"Using device: {device}")

    #load tokenizer
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    #Load embedding model
    #model = SentenceTransformer("all-mpnet-base-v2")
    model = ContrastiveAuthorshipModel().to(device)
    model.load_state_dict(torch.load('best_authorship_model.pth'))

    # Create pairs for validation
    pairs, labels = create_pairs(texts, authors, n_pairs=100)

    # Validate thresholds
    thresholds = np.arange(0.1, 0.8, 0.1)
    accuracies = []
    f1_scores = []

    similarities = validate_similarities(model, tokenizer, pairs, device)
    for threshold in thresholds:
        print(f"Validating threshold: {threshold}")
        accuracy, f1 = validate_threshold(labels, threshold, similarities)
        accuracies.append(accuracy)
        f1_scores.append(f1)
        print(f"Accuracy: {accuracy}, F1 Score: {f1}")

    # Visualize results
    plt.figure(figsize=(10, 6))
    plt.plot(thresholds, accuracies, label='Accuracy', marker='o')
    plt.plot(thresholds, f1_scores, label='F1 Score', marker='s')
    plt.xlabel('Similarity Threshold')
    plt.ylabel('Score')
    plt.title('Accuracy and F1 Score vs Similarity Threshold')
    plt.legend()
    plt.grid(True)
    plt.savefig('validation_results.png')
    plt.show()

    # Find best threshold
    best_threshold = thresholds[np.argmax(f1_scores)]
    print(f"Best threshold: {best_threshold}")
    print(f"Best F1 Score: {max(f1_scores)}")

if __name__ == "__main__":
    main()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('best_authorship_model.pth'))


50
e:  1
e:  2
e:  3
e:  4
e:  5
e:  6
e:  7
e:  8
e:  9
e:  10
e:  11
e:  12
e:  13
e:  14
e:  15
e:  16
e:  17
e:  18
e:  19
e:  20
e:  21
e:  22
e:  23
e:  24
e:  25
e:  26
t:  1
e:  1
e:  2
e:  3
e:  4
e:  5
e:  6
e:  7
e:  8
e:  9
e:  10
e:  11
e:  12
e:  13
e:  14
e:  15
e:  16
t:  2
e:  1
e:  2
e:  3
e:  4
e:  5
e:  6
e:  7
e:  8
e:  9
e:  10
e:  11
e:  12
e:  13
e:  14
e:  15
e:  16
e:  17
e:  18
e:  19
e:  20
e:  21
e:  22
e:  23
e:  24
t:  3
e:  1
t:  4
e:  1
e:  2
e:  3
e:  4
e:  5
e:  6
e:  7
e:  8
e:  9
e:  10
e:  11
e:  12
e:  13
e:  14
e:  15
e:  16
e:  17
e:  18
e:  19
e:  20
e:  21
t:  5
t:  6
e:  1
e:  2
e:  3
e:  4
e:  5
e:  6
e:  7
e:  8
e:  9
e:  10
e:  11
e:  12
e:  13
e:  14
e:  15
e:  16
e:  17
e:  18
e:  19
e:  20
e:  21
e:  22
t:  7
e:  1
e:  2
e:  3
e:  4
e:  5
e:  6
e:  7
e:  8
e:  9
e:  10
e:  11
e:  12
e:  13
e:  14
e:  15
e:  16
e:  17
e:  18
e:  19
e:  20
e:  21
e:  22
e:  23
e:  24
e:  25
e:  26
e:  27
e:  28
e:  29
e:  30
e:  31
e:  32
e:  33
e:  34
e:

Computing similarities:   0%|          | 0/100 [00:00<?, ?it/s]


AttributeError: 'ContrastiveAuthorshipModel' object has no attribute 'encode'