In [None]:
import torch 

# Select the compute device once. Use the GPU when CUDA is available and fall
# back to the CPU otherwise, so that `device` is defined on every machine.
if torch.cuda.is_available():
    # Perform operations on GPU
    device = torch.device("cuda")
    # Clear PyTorch's cached CUDA memory before loading anything new.
    torch.cuda.empty_cache()
else:
    print("CUDA is not available.")
    device = torch.device("cpu")

In [None]:
# Pin the first GPU when CUDA is usable; otherwise fall back to the CPU so that
# later `.to(device)` calls do not fail on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [None]:
import requests
import json
from typing import Any, List, Tuple
import numpy as np
import torch
from numpy import ndarray
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, BatchEncoding, AutoTokenizer, PreTrainedTokenizerBase
import pandas as pd


In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every lookup.

    Each item is a dict with the raw text under ``'text'`` and the flattened
    ``'input_ids'`` and ``'attention_mask'`` tensors returned by the tokenizer.
    """

    def __init__(self, texts, tokenizer, max_length):
        # Only store the inputs here; tokenization is deferred to __getitem__.
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize ``self.texts[item]`` and return its feature dict."""
        sample = self.texts[item]
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample, **tokenizer_options)

        features = {'text': sample}
        # The tokenizer output is flattened to 1-D for each tensor field.
        for key in ('input_ids', 'attention_mask'):
            features[key] = encoded[key].flatten()
        return features

In [None]:
# Standalone handles to the classifier and its tokenizer, loaded from a local
# folder. NOTE(review): in the cells visible here, ModelSentiment loads its own
# copies from `model_folder`, so `model`/`tokenizer` look unused — confirm
# before relying on them.
model_path = 'needed_e5_large'  # Replace with your model path
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
import pandas as pd
# Load the reviews to score; later cells read the 'Review' column of this frame.
df = pd.read_csv('combined_reviews.csv')

In [None]:

class MyTextDataset(Dataset):
    """Dataset over a list of sentences that yields ``(index, sentence)`` pairs.

    The index travels with the sentence so that a consumer can write results
    back to the position the sentence came from.
    """

    def __init__(self, sentence_list: List[str]) -> None:
        self.sentences = sentence_list

    def __len__(self) -> int:
        """Return how many sentences are stored."""
        return len(self.sentences)

    def __getitem__(self, idx: int) -> Tuple[int, str]:
        """Return the requested position together with its sentence."""
        sentence = self.sentences[idx]
        return (idx, sentence)

class MyCollateBatch:
    """Collate ``(index, sentence)`` pairs into one tokenized batch.

    Args:
        tokenizer: Tokenizer called on the list of sentences in the batch.
        max_length: Value forwarded as the tokenizer's ``max_length``.
            Defaults to 512, the value that was previously hard-coded.
        padding: Value forwarded as the tokenizer's ``padding`` argument.
            Defaults to ``"max_length"`` (previous behavior); callers that
            batch several sentences may prefer ``"longest"``.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_length: int = 512,
        padding: Any = "max_length",
    ) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding

    def __call__(self, batch: List[Tuple[int, str]]) -> BatchEncoding:
        """Tokenize the batch's sentences and attach their indices as ``"idx"``."""
        idx = [pair[0] for pair in batch]
        sentences = [pair[1] for pair in batch]
        text = self.tokenizer(
            sentences,
            max_length=self.max_length,
            padding=self.padding,
            truncation=True,
            return_tensors="pt",
        )
        # Plain Python list of dataset positions; consumers must strip this key
        # before passing the encoding to the model.
        text["idx"] = idx
        return text

class ModelSentiment:
    """Sequence-classification wrapper that scores a list of sentences.

    Loads the tokenizer and model from ``model_folder``, moves the model to
    ``device`` and switches it to evaluation mode. Calling the instance returns
    softmax class probabilities for each input sentence.

    Args:
        model_folder: Path or identifier passed to ``from_pretrained``.
        device: Device the model and every batch are moved to.
    """

    def __init__(self, model_folder: str, device: torch.device) -> None:
        self.device = device
        self.model_folder = model_folder
        self.tokenizer = AutoTokenizer.from_pretrained(model_folder)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_folder, return_dict=True)
        self.collate_fn = MyCollateBatch(self.tokenizer)
        self.model.to(device)
        self.model.eval()

    def __call__(self, sentence_list: List[str], batch_size: int = 1):
        """Return class probabilities for every sentence.

        Args:
            sentence_list: Sentences to classify.
            batch_size: Number of sentences per forward pass. Defaults to 1,
                the value that was previously hard-coded; larger values reduce
                the number of forward passes.

        Returns:
            A NumPy array of shape ``(len(sentence_list), number of classes)``
            whose row ``i`` holds the softmax probabilities for
            ``sentence_list[i]``.
        """
        data_ds = MyTextDataset(sentence_list)
        loader = DataLoader(data_ds, batch_size=batch_size, collate_fn=self.collate_fn)
        result = np.zeros((len(sentence_list), len(self.class_names())))
        print('Processing sentences...')
        # Inference only: no gradients are needed for any batch.
        with torch.no_grad():
            for batch in tqdm(loader):
                # "idx" carries dataset positions and is not a model input.
                idx = batch["idx"]
                inputs = {k: v.to(self.device) for k, v in batch.items() if k != "idx"}
                logits = self.model(**inputs).logits
                predictions = torch.softmax(logits, dim=-1)
                # Write each row back to the position its sentence came from.
                result[idx, :] = predictions.to("cpu").numpy()
        return result

    def class_names(self) -> Any:
        """Return the model config's ``id2label`` mapping."""
        return self.model.config.id2label


In [None]:
# Load the pre-trained model
model_folder = "needed_e5_large"  # Update this to your model folder path
# Use the GPU when CUDA is available; otherwise fall back to the CPU so that
# moving the model to `device` does not fail on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_sentiment = ModelSentiment(model_folder, device)


In [None]:
# Print a summary of the loaded DataFrame before scoring it.
df.info()

In [None]:
# Display the ModelSentiment instance as the cell output (sanity check that it was created).
model_sentiment

In [None]:
# Get the text data from the 'Review' column
# NOTE(review): astype(str) converts missing values to the literal string
# "nan", which is then scored like any other review — confirm this is intended.
reviews = df['Review'].astype(str).tolist()
# Preview only the first few entries instead of dumping the whole list.
reviews[:5]

In [None]:
# Score every review; ModelSentiment.__call__ returns one row of softmax
# class probabilities per input sentence, in input order.
predictions = model_sentiment(reviews)

In [None]:
# 'Sentiment' is the index of the most probable class for each row and
# 'Confidence' is that class's probability.
df['Sentiment'] = predictions.argmax(axis=1)
df['Confidence'] = predictions.max(axis=1)

In [None]:
# Write the frame, including the 'Sentiment' and 'Confidence' columns added
# above, to CSV without the index column.
output_csv_path = 'sentiment_results.csv'
df.to_csv(output_csv_path, index=False)

print(f"Sentiment analysis results saved to {output_csv_path}")