In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))


In [None]:
reviews = pd.read_pickle('../Pickle/reviews.pkl')

In [None]:
reviews

In [None]:
#tokenizer and model
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


In [None]:
def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.isalnum() and word.lower() not in stop_words]
    cleaned_text = ' '.join(filtered_words)
    
    return cleaned_text

In [None]:
def truncate_text(text): 
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512) 
    truncated_text = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True) 
    return truncated_text

In [None]:
reviews['cleaned_text'] = reviews['review_text'].progress_apply(preprocess_text)
reviews['truncated_text'] = reviews['cleaned_text'].progress_apply(truncate_text)

In [None]:
reviews = reviews.dropna(subset=['truncated_text'])

In [None]:
import os
import pandas as pd
from tqdm import tqdm

# Initialize your sentiment analysis model
def save_sentiment_incrementally(reviews_df, sentiment_pipeline, interval=300):
    sentiment_file = '../Pickle/review_score.pkl'
    
    # Load existing sentiment data if it exists
    if os.path.exists(sentiment_file):
        reviews_with_sentiment = pd.read_pickle(sentiment_file)
    else:
        reviews_with_sentiment = pd.DataFrame(columns=['review_id', 'sentiment', 'confidence'])
    
    # Ensure that reviews are non-null and reset index
    reviews_df = reviews_df.dropna(subset=['truncated_text']).reset_index(drop=True)

    # Get processed review IDs
    processed_review_ids = set(reviews_with_sentiment['review_id'].values)
    
    new_sentiments = []

    for i in tqdm(range(len(reviews_df)), desc="Processing reviews for sentiment"):
        review_id = reviews_df.at[i, 'review_id']
        
        # Skip if the review_id is already processed
        if review_id in processed_review_ids:
            continue
        
        # Get the review text and perform sentiment analysis
        review_text = reviews_df.at[i, 'truncated_text']
        sentiment_result = sentiment_pipeline(review_text)[0]
        sentiment = sentiment_result['label']
        confidence = sentiment_result['score']
        
        # Append new sentiment result
        new_sentiments.append({'review_id': review_id, 'sentiment': sentiment, 'confidence': confidence})
        
        # Mark this review_id as processed
        processed_review_ids.add(review_id)
        
        # Save periodically after every 'interval' reviews
        if len(new_sentiments) % interval == 0:
            new_sentiments_df = pd.DataFrame(new_sentiments)
            reviews_with_sentiment = pd.concat([reviews_with_sentiment, new_sentiments_df], ignore_index=True)
            reviews_with_sentiment.to_pickle(sentiment_file)
            new_sentiments = []  # Reset the list
            print(f"Processed and saved batch {i + 1}/{len(reviews_df)}.")

    # Save any remaining sentiments after the loop
    if new_sentiments:
        new_sentiments_df = pd.DataFrame(new_sentiments)
        reviews_with_sentiment = pd.concat([reviews_with_sentiment, new_sentiments_df], ignore_index=True)
        reviews_with_sentiment.to_pickle(sentiment_file)

    print(f"Sentiment analysis results saved to {sentiment_file} successfully!")

# Example usage
save_sentiment_incrementally(reviews, sentiment_pipeline)


In [28]:
review_sentiment = pd.read_pickle('../Pickle/review_score.pkl')

In [29]:
review_sentiment['confidence_score'] = [
    1 - row['confidence'] if row['sentiment'] == 0 else row ['confidence']
    for _,row in review_sentiment.iterrows()]

In [31]:
review_sentiment

Unnamed: 0,review_id,sentiment,confidence,confidence_score
0,5cd416f3efc3f944fce4ce2db2290d5e,POSITIVE,0.979499,0.979499
1,dfdbb7b0eb5a7e4c26d59a937e2e5feb,POSITIVE,0.719051,0.719051
2,5e212a62bced17b4dbe41150e5bb9037,POSITIVE,0.991836,0.991836
3,fdd13cad0695656be99828cd75d6eb73,POSITIVE,0.996752,0.996752
4,bd0df91c9d918c0e433b9ab3a9a5c451,POSITIVE,0.999605,0.999605
...,...,...,...,...
1000995,fca575edd179f0d349a0949bb4a2dc42,NEGATIVE,0.900867,0.900867
1000996,4803d91f0555983fc4db7c8b44b3eba8,NEGATIVE,0.811458,0.811458
1000997,cc25c79c14b0b94509b0eee08164fe73,POSITIVE,0.999701,0.999701
1000998,2422ddc0e9e39c6eb72f9af566c27baa,POSITIVE,0.999779,0.999779


In [36]:
review_sentiment.to_pickle('../Pickle/review_score.pkl')