In [2]:
import pandas as pd
import torch
from transformers import BertForSequenceClassification, Trainer, DataCollatorWithPadding, BertTokenizer
import json

# Load the fine-tuned classification model from its local directory.
model_path = r"C:\Users\nindi\AppData\Roaming\Python\Python312\site-packages\model_sentiment"
model = BertForSequenceClassification.from_pretrained(model_path)
# The tokenizer is loaded by checkpoint name rather than from model_path.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-base-p2")

# Load entire raw dataset (61,188 rows)
data_path = r"C:\Users\nindi\AppData\Roaming\Python\Python312\site-packages\pandas\io\excel\Final\final_processed_raw_61188.xlsx"
data = pd.read_excel(data_path)

# Maximum number of token ids kept per row.
# NOTE(review): presumably the model's maximum input length — confirm
# against model.config.max_position_embeddings.
MAX_SEQUENCE_LENGTH = 512


def _prepare_token_ids(raw):
    """Return one ``token_ids`` cell as a sequence of at most MAX_SEQUENCE_LENGTH ids.

    Args:
        raw: The cell value. Strings are parsed as JSON; any other value is
            used as-is.

    Returns:
        The leading ``MAX_SEQUENCE_LENGTH`` elements of the parsed value.

    Raises:
        TypeError: If the value cannot be sliced (for example a NaN produced
            by an empty spreadsheet cell).
        json.JSONDecodeError: If a string cell is not valid JSON.
    """
    ids = json.loads(raw) if isinstance(raw, str) else raw
    try:
        # Slicing is already a no-op for short sequences, so no length check
        # is needed before truncating.
        return ids[:MAX_SEQUENCE_LENGTH]
    except TypeError as err:
        raise TypeError(
            f"token_ids entry must be a sequence of token ids, got {raw!r}"
        ) from err


# Parse (if stored as strings) and truncate every row in a single pass.
data['token_ids'] = data['token_ids'].apply(_prepare_token_ids)

# Prepare dataset
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves pre-tokenized rows as ``input_ids`` tensors.

    Args:
        token_ids: Indexable collection of rows; each row is converted with
            ``torch.tensor(..., dtype=torch.long)`` when it is accessed, so
            rows are expected to be sequences of integer token ids.
    """

    def __init__(self, token_ids):
        self.token_ids = token_ids

    def __len__(self):
        """Return the number of stored rows."""
        return len(self.token_ids)

    def __getitem__(self, idx):
        """Return row ``idx`` as a dict with a single int64 ``input_ids`` tensor."""
        row = self.token_ids[idx]
        # torch.int64 is the same dtype as torch.long.
        input_ids = torch.tensor(row, dtype=torch.int64)
        return {"input_ids": input_ids}

# Collator that pads each batch through the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# The Trainer serves purely as an inference driver here; nothing is trained or evaluated
trainer = Trainer(model=model, data_collator=data_collator)

# Wrap every row of the unlabeled data in a dataset the Trainer can consume
unlabeled_dataset = SentimentDataset(data['token_ids'].tolist())

# Predict over the whole dataset, then keep the index of the highest score per row
prediction_output = trainer.predict(unlabeled_dataset)
predictions = prediction_output.predictions.argmax(axis=-1)

# Store the predicted class indices in a new dataframe column
data['bert_sentiment'] = predictions

# Write the dataframe, including the new column, to an Excel file
output_path = r"C:\Users\nindi\finalsentiment_analysis_results.xlsx"
data.to_excel(output_path, index=False)

print("Sentiment classification completed! Results saved as 'finalsentiment_analysis_results.xlsx'.")


Sentiment classification completed! Results saved as 'finalsentiment_analysis_results.xlsx'.
