In [None]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset, DatasetDict


In [None]:
# Read the processed review CSV into a DataFrame
REVIEWS_CSV = "../data/processed/top_150_fantasy_reviews_cleaned_balanced.csv"
df = pd.read_csv(REVIEWS_CSV)

# Translate a raw recommendation value into a sentiment label
def map_recommendation(rec):
    """Return the sentiment label for a single recommendation value.

    'Recommended' becomes 'positive', 'Not Recommended' becomes 'negative',
    and every other value (including missing values) becomes 'neutral'.
    """
    if rec == 'Recommended':
        return 'positive'
    return 'negative' if rec == 'Not Recommended' else 'neutral'

# Derive a readable sentiment label from each recommendation value
df["label_text"] = df["recommendation"].map(map_recommendation)

# Encode labels as integers; LabelEncoder assigns ids in sorted class order
# (e.g., negative → 0, neutral → 1, positive → 2)
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label_text"])

# Show class mappings (the position in this array is the integer id)
print("Label classes:", label_encoder.classes_)

# Keep only the model inputs. Rows with missing review text are dropped because
# the tokenizer only accepts strings and fails on None/NaN entries. The index is
# reset so Dataset.from_pandas does not carry the old index along as an extra
# column.
df = (
    df[["review_sentiment", "label"]]
    .rename(columns={"review_sentiment": "text"})
    .dropna()
    .reset_index(drop=True)
)


In [None]:
# Wrap the DataFrame in a Hugging Face Dataset and hold out 20% of the rows as
# the test split; the fixed seed keeps the split reproducible.
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)


In [None]:
# Tokenizer matching the pretrained checkpoint that is fine-tuned below
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(example):
    """Tokenize the "text" field of a batch of examples.

    Inputs are truncated and padded to the tokenizer's maximum length, so
    every encoded example has the same length.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Apply the tokenizer to both splits, batch by batch
tokenized_dataset = dataset.map(tokenize_function, batched=True)


In [None]:
# One output class per encoded sentiment label
# (usually 3: positive, neutral, negative)
num_labels = len(label_encoder.classes_)

# Pretrained BERT encoder with a freshly initialised classification head
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)


In [None]:
# Explicit CPU device handle (training itself is pinned to the CPU through
# use_cpu=True in the arguments below).
device = torch.device("cpu")

# Configuration for a short, CPU-only fine-tuning run with no evaluation,
# checkpointing or external reporting during training.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="no",
    save_strategy="no",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_steps=1000000,  # far more steps than this run takes, so no step logs
    logging_dir="./logs",
    disable_tqdm=True,
    report_to="none",
    # Load batches in the main process: spawning 16 workers for a few hundred
    # pre-tokenized rows only adds overhead and can cause multiprocessing
    # problems inside a notebook.
    dataloader_num_workers=0,
    remove_unused_columns=True,
    fp16=False,
    use_cpu=True,  # replaces the deprecated no_cuda=True; disables CUDA entirely
)


In [None]:
# Work on small slices of each split to keep CPU training time manageable
train_subset = tokenized_dataset["train"].select(range(300))
eval_subset = tokenized_dataset["test"].select(range(100))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)

# Fine-tune the model on the training slice
trainer.train()


In [None]:
# Run inference over the whole held-out split (not only the 100-row slice that
# was handed to the Trainer as eval_dataset).
predictions = trainer.predict(tokenized_dataset["test"])
labels = torch.tensor(predictions.label_ids)
preds = torch.tensor(predictions.predictions).argmax(dim=1)

# Accuracy = share of examples whose top-scoring class matches the true label
num_correct = (preds == labels).sum().item()
accuracy = num_correct / len(labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
def predict_sentiment(text):
    """Predict the sentiment label(s) of review text with the fine-tuned model.

    Args:
        text: A single review string, or a list/tuple of review strings.

    Returns:
        The decoded label for a single string. When a list/tuple is given,
        a list with one decoded label per input, in the same order.
    """
    is_single = isinstance(text, str)

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Keep the inputs on the same device as the model weights.
    inputs = inputs.to(model.device)

    # Switch off dropout so predictions are deterministic even when this is
    # called straight after training (which leaves the model in train mode).
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Take the best class per row; an argmax without `dim` would flatten the
    # whole batch into one index.
    class_ids = logits.argmax(dim=-1).tolist()
    decoded = label_encoder.inverse_transform(class_ids)
    return decoded[0] if is_single else list(decoded)

# Try it out on two sample reviews
for sample_review in (
    "The plot was amazing, I enjoyed every moment.",
    "I found it boring and hard to follow.",
):
    print(predict_sentiment(sample_review))


In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from datasets import Dataset
import numpy as np

# Read the processed review CSV into a DataFrame
REVIEWS_CSV = "../data/processed/top_150_fantasy_reviews_cleaned_balanced.csv"
df = pd.read_csv(REVIEWS_CSV)

# Map recommendation to sentiment labels
def map_recommendation(rec):
    """Return the sentiment label that corresponds to a recommendation value.

    'Recommended' gives 'positive' and 'Not Recommended' gives 'negative';
    anything else (including missing values) gives 'neutral'.
    """
    known_values = (
        ('Recommended', 'positive'),
        ('Not Recommended', 'negative'),
    )
    for raw_value, sentiment in known_values:
        if rec == raw_value:
            return sentiment
    return 'neutral'

# Attach a sentiment label to every row based on its recommendation value
df["label_text"] = df["recommendation"].apply(map_recommendation)

# Encode the sentiment labels as integer class ids
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label_text"])

print("Label classes:", label_encoder.classes_)

# Prepare data: keep only text and label, and discard rows containing NaN
df = (
    df[["review_sentiment", "label"]]
    .rename(columns={"review_sentiment": "text"})
    .dropna()
)

# Create the dataset and hold out 20% of the rows for testing (fixed seed)
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)

# Tokenizer matching the pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every input is truncated and padded to exactly 512 tokens.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize both splits, batch by batch
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Model setup: pretrained BERT with one output class per encoded label
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
)

# Training arguments, gathered in one place before building the config object
training_config = dict(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="no",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=50,
    logging_dir="./logs",
    disable_tqdm=False,
    report_to="none",
    dataloader_num_workers=0,  # Set to 0 to avoid multiprocessing issues
    remove_unused_columns=True,
    fp16=False,
    use_cpu=True,
)
training_args = TrainingArguments(**training_config)

# Metric callback used by the Trainer during evaluation
def compute_metrics(eval_pred):
    """Compute accuracy from a (predictions, labels) pair.

    The predicted class of each row is the index of its highest score along
    axis 1. Returns a dict with a single "accuracy" entry.
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    accuracy = accuracy_score(labels, predicted_ids)
    return {"accuracy": accuracy}

# Trainer with small subset for fast training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(50)),
    eval_dataset=tokenized_dataset["test"].select(range(20)),
    compute_metrics=compute_metrics,
)

# Train
print("Starting training...")
trainer.train()

# Evaluate (on the 20-row eval slice given to the Trainer above)
print("Evaluating...")
results = trainer.evaluate()
print(f"Test Accuracy: {results['eval_accuracy']:.4f}")

# Store the class names in the model config so the saved checkpoint can decode
# its own predictions; otherwise the id -> label mapping exists only in the
# in-memory LabelEncoder and is lost once the kernel stops.
model.config.id2label = {
    class_id: str(class_name)
    for class_id, class_name in enumerate(label_encoder.classes_)
}
model.config.label2id = {
    class_name: class_id
    for class_id, class_name in model.config.id2label.items()
}

# Save model and tokenizer side by side so they can be reloaded together
model.save_pretrained("./bert_sentiment_model")
tokenizer.save_pretrained("./bert_sentiment_model")

  from .autonotebook import tqdm as notebook_tqdm


Label classes: ['negative' 'neutral' 'positive']


Map: 100%|██████████| 3888/3888 [00:34<00:00, 113.72 examples/s]
Map: 100%|██████████| 972/972 [00:08<00:00, 120.13 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.03791,0.55


Evaluating...


Test Accuracy: 0.5500


('./bert_sentiment_model/tokenizer_config.json',
 './bert_sentiment_model/special_tokens_map.json',
 './bert_sentiment_model/vocab.txt',
 './bert_sentiment_model/added_tokens.json')