In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Load the TweetEval dataset ("sentiment" configuration)
dataset = load_dataset("tweet_eval", "sentiment")

# Prepare the data: pool the train and test splits into one corpus.
# list(...) materialises each column first, so the `+` below is plain list
# concatenation whatever container type column access returns.
texts = list(dataset["train"]["text"]) + list(dataset["test"]["text"])
labels = list(dataset["train"]["label"]) + list(dataset["test"]["label"])

# Split the pooled data 80/20 into training and validation sets (fixed seed).
# NOTE(review): the test split is pooled with train before re-splitting, so no
# untouched held-out set remains after this step — confirm this is intended.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of raw texts.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in this notebook).
        max_length: Length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of examples, taken from the text collection."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for example ``idx``."""
        raw_text = str(self.texts[idx])
        raw_label = self.labels[idx]

        # Same tokenizer settings for every example: fixed-length, PyTorch tensors out.
        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_length,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(raw_text, **tokenizer_options)

        # flatten() turns each returned tensor into a 1-D tensor for this single example.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(raw_label, dtype=torch.long)
        return sample

# Create datasets
train_dataset = TweetDataset(train_texts, train_labels, tokenizer)
val_dataset = TweetDataset(val_texts, val_labels, tokenizer)

# Create data loaders (only the training loader shuffles)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Initialize the model with a 3-way classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Set up the optimizer.
# PyTorch's own AdamW is used because the AdamW class exported by `transformers`
# is deprecated in favour of it. eps and weight_decay are set explicitly to the
# values the transformers implementation used by default, so the optimisation
# settings stay the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training loop: fine-tune for num_epochs passes over train_loader and run a
# full validation pass after every epoch.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    total_loss = 0  # running sum of per-batch losses for this epoch

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        # Move the batch tensors to the same device as the model.
        # NOTE: `labels` rebinds the notebook-level `labels` list created during data preparation.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients left over from the previous step before the new forward pass.
        optimizer.zero_grad()
        # labels are passed to the model, and the loss is then read from outputs.loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses over the epoch
    avg_train_loss = total_loss / len(train_loader)
    print(f"Average train loss: {avg_train_loss:.4f}")

    # Validation
    model.eval()
    val_preds, val_true = [], []

    # Gradient tracking is switched off for the evaluation pass.
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # No labels are passed here; only the logits are used.
            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit along dim 1
            preds = torch.argmax(outputs.logits, dim=1)

            # Collect predictions and targets on the CPU for the scikit-learn metrics below.
            val_preds.extend(preds.cpu().numpy())
            val_true.extend(labels.cpu().numpy())

    val_accuracy = accuracy_score(val_true, val_preds)
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print(classification_report(val_true, val_preds))

# Save the fine-tuned model and its tokenizer into the same directory.
# NOTE(review): the directory is named "tweet_authorship_model", but the model
# trained in this cell is the 3-label TweetEval sentiment classifier — confirm
# the name is intended.
model.save_pretrained("tweet_authorship_model")
tokenizer.save_pretrained("tweet_authorship_model")

print("Training completed and model saved.")

In [None]:
#!pip install transformers datasets torch scikit-learn pandas nltk

import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Download NLTK data.
# Newer NLTK releases load the word_tokenize models from 'punkt_tab' rather than
# 'punkt'; requesting both keeps word_tokenize usable on either release line.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the Reddit dataset (first 100k training rows only, to keep this example small).
# NOTE(security): trust_remote_code=True allows the dataset repository's loading
# script to run locally — keep it only for sources you trust.
dataset = load_dataset("reddit", split="train[:100000]", trust_remote_code=True)

# Convert to pandas DataFrame for easier preprocessing.
# Dataset.to_pandas() is the library's own conversion and avoids building the
# frame from one Python dict per row.
df = dataset.to_pandas()

# Preprocessing function
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of on every preprocess_text call.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Lowercase ``text``, strip non-letter characters and drop English stop words.

    Args:
        text: Raw input string. Non-string values (e.g. missing cells coming
            from a DataFrame column) are treated as empty text.

    Returns:
        The remaining tokens joined by single spaces ('' for non-string input).
    """
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers (anything that is not a letter or whitespace)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords, using the cached set
    stop_words = _get_stop_words()
    # Join the surviving tokens back into a string
    return ' '.join(word for word in tokens if word not in stop_words)

# Text column used for modelling.
# NOTE(review): preprocess_text is defined above but is not applied here — the
# raw 'content' column is used unchanged. Confirm this is intentional.
df['processed_text'] = df['content']

# Use 'author' as our target for authorship attribution
# Keep only authors with at least MIN_POSTS_PER_AUTHOR rows
MIN_POSTS_PER_AUTHOR = 8
author_counts = df['author'].value_counts()
authors_to_keep = author_counts[author_counts >= MIN_POSTS_PER_AUTHOR].index
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the original frame (SettingWithCopyWarning).
df = df[df['author'].isin(authors_to_keep)].copy()

# Encode author labels as consecutive integer ids
le = LabelEncoder()
df['author_encoded'] = le.fit_transform(df['author'])

# Hold out 20% as the test set, stratified by author
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['processed_text'], df['author_encoded'],
    test_size=0.2, random_state=42, stratify=df['author_encoded']
)

# Further split train into train and validation (10% of the remaining rows).
# NOTE(review): a stratified split needs the validation set to hold at least one
# row per author; at 10% of the training rows that requires roughly 12.5 or more
# rows per author on average in df — confirm this holds for the filtered data.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels,
    test_size=0.1, random_state=42, stratify=train_labels
)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize ``examples`` with the notebook-level ``tokenizer``, padded/truncated to 128 tokens."""
    tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(examples, **tokenizer_options)


# Encode the three splits in train -> validation -> test order
train_encodings, val_encodings, test_encodings = (
    tokenize_function(split_texts.tolist())
    for split_texts in (train_texts, val_texts, test_texts)
)

# Dataset class
class RedditDataset(torch.utils.data.Dataset):
    """Dataset over pre-computed tokenizer encodings and a parallel sequence of labels."""

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return every encoding field at ``idx`` as a tensor, plus a ``labels`` tensor."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each split in a RedditDataset. The label Series are converted to plain
# lists because RedditDataset indexes its labels by position.
train_dataset = RedditDataset(train_encodings, train_labels.tolist())
val_dataset = RedditDataset(val_encodings, val_labels.tolist())
test_dataset = RedditDataset(test_encodings, test_labels.tolist())

# Model: one output class per author known to the label encoder
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(le.classes_))

# Training arguments
# Evaluation runs every 500 steps and checkpoints are written every 1000 steps
# (a multiple of eval_steps); logs go to ./logs every 10 steps.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    load_best_model_at_end=True,
)

# Trainer: trains on train_dataset and evaluates on val_dataset.
# No compute_metrics callback is supplied here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Evaluate the model on the held-out test split: predicted class = argmax over the last axis
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(test_labels, preds))
# Pass the full list of label ids explicitly. Without `labels`,
# classification_report infers them from the values present in test_labels and
# preds and raises a ValueError when that count differs from len(target_names).
# zero_division=0 reports 0 for metrics that are undefined for a class instead
# of emitting a warning per class.
all_label_ids = list(range(len(le.classes_)))
print(classification_report(test_labels, preds, labels=all_label_ids,
                            target_names=le.classes_, zero_division=0))

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the test split and take the highest-scoring class per example
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

# Class ids that actually occur in the test labels
unique_classes = np.unique(test_labels)

# Map each of those ids back to its author name via the label encoder
label_map = dict((class_id, le.classes_[class_id]) for class_id in unique_classes)

# Accuracy first, then a per-author report restricted to the ids found above
print(accuracy_score(test_labels, preds))
print(classification_report(
    test_labels,
    preds,
    labels=sorted(label_map),
    target_names=[label_map[class_id] for class_id in sorted(label_map)],
))

# List which authors are present in the test set
print("Authors in test set:")
for i, author in label_map.items():
    print(f"Label {i}: {author}")

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Replace the BERT tokenizer with RoBERTa
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

def tokenize_function(examples):
    """Tokenize ``examples`` with the notebook-level ``tokenizer``, padded/truncated to 128 tokens."""
    return tokenizer(examples, padding="max_length", truncation=True, max_length=128)

# NOTE: this cell does not re-tokenize the texts or rebuild the datasets, and it
# creates no Trainer. Any existing train/val/test encodings, datasets and
# `trainer` object therefore still come from earlier cells until those steps
# are re-run with the new tokenizer and model.

# Replace the BERT model with RoBERTa (one output class per encoded author)
model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=len(le.classes_))

# Adjust training arguments for the larger model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # 3 epochs here; the BERT configuration in this notebook uses 2
    per_device_train_batch_size=8,  # Reduced batch size due to larger model
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,  # a multiple of eval_steps
    load_best_model_at_end=True,
    gradient_accumulation_steps=2,  # 2 accumulated batches of 8 -> 16 examples per optimizer step per device
)

In [2]:
# Convert to pandas DataFrame for easier preprocessing.
# Dataset.to_pandas() is the library's own conversion and avoids building the
# frame from one Python dict per row.
df = dataset.to_pandas()

# Preprocessing function
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of on every preprocess_text call.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Lowercase ``text``, strip non-letter characters and drop English stop words.

    Args:
        text: Raw input string. Non-string values (e.g. missing cells coming
            from a DataFrame column) are treated as empty text.

    Returns:
        The remaining tokens joined by single spaces ('' for non-string input).
    """
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers (anything that is not a letter or whitespace)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords, using the cached set
    stop_words = _get_stop_words()
    # Join the surviving tokens back into a string
    return ' '.join(word for word in tokens if word not in stop_words)

# Run preprocess_text over every entry of the 'content' column
df['processed_text'] = df['content'].map(preprocess_text)

In [3]:
# Use the raw 'content' column as the model input: this assignment copies the
# column unchanged (preprocess_text is not called) and overwrites any
# 'processed_text' column already present in df.
df['processed_text'] = df['content']

In [None]:
# Use 'author' as our target for authorship attribution
# Keep only authors with at least MIN_POSTS_PER_AUTHOR rows
MIN_POSTS_PER_AUTHOR = 8
author_counts = df['author'].value_counts()
authors_to_keep = author_counts[author_counts >= MIN_POSTS_PER_AUTHOR].index
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the original frame (SettingWithCopyWarning).
df = df[df['author'].isin(authors_to_keep)].copy()

# Encode author labels as consecutive integer ids
le = LabelEncoder()
df['author_encoded'] = le.fit_transform(df['author'])

# Split the data: hold out 20% as the test set.
# NOTE(review): neither split below is stratified, so an author can end up with
# no rows in the test or validation set.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['processed_text'], df['author_encoded'], test_size=0.2, random_state=42
)

# Further split train into train and validation (10% of the remaining rows)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1, random_state=42
)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` on ``examples`` with fixed-length (128) padding and truncation."""
    encoded = tokenizer(examples, max_length=128, truncation=True, padding="max_length")
    return encoded


# Tokenize the train, validation and test texts (in that order)
train_encodings, val_encodings, test_encodings = map(
    tokenize_function,
    (train_texts.tolist(), val_texts.tolist(), test_texts.tolist()),
)

# Dataset class
class RedditDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with labels and serves them one example at a time."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Size of the dataset, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tensor-ise each encoding field at ``idx`` and attach the matching ``labels`` tensor."""
        fields = dict(
            (name, torch.tensor(column[idx]))
            for name, column in self.encodings.items()
        )
        fields.update(labels=torch.tensor(self.labels[idx]))
        return fields

# Wrap each split in a RedditDataset. The label Series are converted to plain
# lists because RedditDataset indexes its labels by position.
train_dataset = RedditDataset(train_encodings, train_labels.tolist())
val_dataset = RedditDataset(val_encodings, val_labels.tolist())
test_dataset = RedditDataset(test_encodings, test_labels.tolist())

# Model: one output class per author known to the label encoder
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(le.classes_))

# Training arguments
# Evaluation runs every 500 steps and checkpoints are written every 1000 steps
# (a multiple of eval_steps); logs go to ./logs every 10 steps.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    load_best_model_at_end=True,
)

# Trainer: trains on train_dataset and evaluates on val_dataset.
# No compute_metrics callback is supplied here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

# Evaluate the model on the held-out test split: predicted class = argmax over the last axis
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(test_labels, preds))
# Pass the full list of label ids explicitly. The splits in this cell are not
# stratified, so some authors may be missing from test_labels and preds; without
# `labels`, classification_report infers the classes from the data and raises a
# ValueError when their count differs from len(target_names).
# zero_division=0 reports 0 for metrics that are undefined for a class instead
# of emitting a warning per class.
all_label_ids = list(range(len(le.classes_)))
print(classification_report(test_labels, preds, labels=all_label_ids,
                            target_names=le.classes_, zero_division=0))

In [None]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Replace the BERT tokenizer with RoBERTa
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")


def tokenize_function(examples):
    """Tokenize ``examples`` with the notebook-level ``tokenizer``, padded/truncated to 128 tokens."""
    tokenizer_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(examples, **tokenizer_options)


# Re-encode all three splits with the new tokenizer (train -> validation -> test)
train_encodings, val_encodings, test_encodings = [
    tokenize_function(split_texts.tolist())
    for split_texts in (train_texts, val_texts, test_texts)
]

# Dataset class
class RedditDataset(torch.utils.data.Dataset):
    """Serves (encoding fields, label) examples from pre-tokenized data."""

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Example count, equal to the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors: each encoding field at ``idx`` plus ``labels``."""
        label_tensor_source = self.labels
        return {
            **{key: torch.tensor(val[idx]) for key, val in self.encodings.items()},
            'labels': torch.tensor(label_tensor_source[idx]),
        }

# Wrap each split in a RedditDataset. The label Series are converted to plain
# lists because RedditDataset indexes its labels by position.
train_dataset = RedditDataset(train_encodings, train_labels.tolist())
val_dataset = RedditDataset(val_encodings, val_labels.tolist())
test_dataset = RedditDataset(test_encodings, test_labels.tolist())

# Replace the BERT model with RoBERTa (one output class per encoded author)
model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=len(le.classes_))

# Adjust training arguments for the larger model
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # 3 epochs here; the BERT configuration in this notebook uses 2
    per_device_train_batch_size=8,  # Reduced batch size due to larger model
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,  # a multiple of eval_steps
    load_best_model_at_end=True,
    gradient_accumulation_steps=2,  # 2 accumulated batches of 8 -> 16 examples per optimizer step per device
)

# Trainer: trains on train_dataset and evaluates on val_dataset.
# No compute_metrics callback is supplied here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

from sklearn.metrics import accuracy_score, classification_report

# Predict on the test split and take the highest-scoring class per example
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

# Class ids that actually occur in the test labels
unique_classes = np.unique(test_labels)

# Map each of those ids back to its author name via the label encoder
label_map = {}
for class_id in unique_classes:
    label_map[class_id] = le.classes_[class_id]
del class_id  # keep the notebook namespace the same as before this loop

# Accuracy first, then a per-author report restricted to the ids found above
print(accuracy_score(test_labels, preds))
print(classification_report(
    test_labels,
    preds,
    target_names=[label_map[key] for key in sorted(label_map.keys())],
    labels=sorted(label_map.keys()),
))

# List which authors are present in the test set
print("Authors in test set:")
for i, author in label_map.items():
    print(f"Label {i}: {author}")

In [None]:
import time 
# Fib
def fibonacci_of(n):
    """Return the n-th Fibonacci number by plain double recursion (no caching)."""
    if n == 0 or n == 1:  # Base case
        return n
    # Recursive case: sum of the two preceding Fibonacci numbers
    one_back = fibonacci_of(n - 1)
    two_back = fibonacci_of(n - 2)
    return one_back + two_back

def ultra_fib(n, lookup):
    """Return the n-th Fibonacci number, using ``lookup`` as a memo table.

    Args:
        n: Non-negative index into the Fibonacci sequence (F(0) = 0, F(1) = 1).
        lookup: List used as the memo, where ``lookup[i]`` holds F(i). Pass
            ``[]`` to start fresh, or a list filled by an earlier call to reuse
            its values. It is extended in place up to index ``n``.

    Returns:
        F(n).

    Raises:
        ValueError: If ``n`` is negative (a negative list index would otherwise
            silently read from the end of the table).
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    seed = (0, 1, 1)
    # Fill in only the seed entries that are still missing; appending the whole
    # seed on every call would put junk after an already-populated table.
    lookup.extend(seed[len(lookup):])
    # Grow the table just far enough; entries computed earlier are reused, and
    # the loop avoids the recursion-depth limit for large n.
    while len(lookup) <= n:
        lookup.append(lookup[-1] + lookup[-2])
    return lookup[n]

def basic_fib(n):
    """Return the n-th Fibonacci number with a two-variable iterative loop."""
    # 0 and 1 are their own Fibonacci numbers
    if n == 0 or n == 1:
        return n

    # Slide the (older, newer) pair forward once for each index from 2 up to n
    older, newer = 0, 1
    for _ in range(n - 1):
        older, newer = newer, older + newer

    return newer

def super_ultra_fib(n):
    """Return the n-th Fibonacci number using an internal lookup table.

    The table is created inside every call, so the function needs no external
    state. (Previously ``lookup`` was only assigned in the ``n <= 2`` branch,
    which made every call with ``n > 2`` fail with ``UnboundLocalError``.)

    Args:
        n: Non-negative index into the Fibonacci sequence (F(0) = 0, F(1) = 1).

    Returns:
        F(n).

    Raises:
        ValueError: If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    lookup = [0, 1, 1]  # F(0), F(1), F(2)
    # Extend the table one entry at a time until index n exists
    for i in range(3, n + 1):
        lookup.append(lookup[i - 1] + lookup[i - 2])
    return lookup[n]
    
# Time each implementation on the first 40 Fibonacci numbers.
# time.perf_counter() is used because it is the clock meant for measuring short
# durations. Each result line is labelled with the function that was timed, so
# the three measurements can be told apart in the output.
start = time.perf_counter()
print([fibonacci_of(n) for n in range(40)])
end = time.perf_counter()
print(f"fibonacci_of: \tTime taken: {(end-start)*10**3:.09f}ms")

# A fresh, empty lookup list is handed to ultra_fib for every n
start2 = time.perf_counter()
print([ultra_fib(n, []) for n in range(40)])
end2 = time.perf_counter()
print(f"ultra_fib: \tTime taken: {(end2-start2)*10**3:.09f}ms")

start3 = time.perf_counter()
print([basic_fib(n) for n in range(40)])
end3 = time.perf_counter()
print(f"basic_fib: \tTime taken: {(end3-start3)*10**3:.09f}ms")