In [None]:
# Imports and one-time setup for the whole notebook.
import pandas as pd
from tqdm import tqdm
tqdm.pandas()  # tqdm's pandas integration; the `.progress_apply` calls below rely on it
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK resources before they are read below.
nltk.download('stopwords')
# NOTE(review): 'punkt' is presumably the data `word_tokenize` needs; recent NLTK
# releases may look for 'punkt_tab' instead - confirm against the installed version.
nltk.download('punkt')
import os
# English stop words as a set; `preprocess_text` tests lower-cased tokens against it.
stop_words = set(stopwords.words('english'))


In [None]:
# Load the reviews table; later cells read its 'review_text' column.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
reviews = pd.read_pickle('../Pickle/reviews.pkl')

In [None]:
# Checkpoint used for sentiment classification; tokenizer and model are loaded
# from the same name so their vocabularies match.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# `truncation=True` is forwarded to the tokenizer on every pipeline call, so an
# input that tokenizes to more than `model_max_length` tokens is clipped instead
# of overflowing the model. Text that was clipped and decoded back to a string
# can re-tokenize to MORE tokens than it was clipped to (e.g. when the cut fell
# in the middle of a word), so pre-truncating the text alone is not a guarantee.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
)


In [None]:
def preprocess_text(text):
    """
    Clean a raw review string.

    Steps:
    1. Remove every run of non-whitespace characters that starts with `http` (URLs).
    2. Tokenize the remaining text with `word_tokenize`.
    3. Keep only tokens that are purely alphanumeric and whose lower-cased form
       is not in the module-level `stop_words` collection.
    4. Join the surviving tokens (original casing kept) with single spaces.

    Parameters:
    text (str): The text to be preprocessed. Any non-string value (for example
        None or a float NaN coming from a missing DataFrame cell) is treated
        as an empty review.

    Returns:
    str: The cleaned and filtered text; '' when `text` is not a string.
    """
    # Missing cells arrive as None/NaN when this is applied over a column;
    # `re.sub` would raise TypeError on them, aborting the whole apply.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+', '', text)
    words = word_tokenize(text)
    # NOTE(review): the stop-word list presumably contains negations such as
    # "not"/"no", and the isalnum() test drops tokens containing apostrophes,
    # so negation may be removed before sentiment scoring - confirm this is intended.
    filtered_words = [
        word for word in words
        if word.isalnum() and word.lower() not in stop_words
    ]
    return ' '.join(filtered_words)


In [None]:
def truncate_text(text, max_length=512):
    """
    Clip text to at most `max_length` tokens of the module-level `tokenizer`.

    The text is tokenized with truncation enabled and the resulting ids are
    decoded back to a string with special tokens skipped.

    Parameters:
    text (str): The text to be truncated.
    max_length (int): Maximum number of tokens to keep, as counted by the
        tokenizer (default 512).

    Returns:
    str: The decoded, truncated text.
    """
    # No padding and no tensor conversion: the ids are decoded straight away
    # with skip_special_tokens=True, so padding every row out to `max_length`
    # and building a tensor only added work without changing the result.
    encoded = tokenizer(text, truncation=True, max_length=max_length)
    # NOTE(review): the cut happens at token level, so the last word may be
    # clipped part-way - confirm that is acceptable downstream.
    return tokenizer.decode(encoded['input_ids'], skip_special_tokens=True)


In [None]:
# Two passes over the reviews, each with a tqdm progress bar:
# 1) clean the raw 'review_text' into 'cleaned_text';
# 2) clip 'cleaned_text' to the tokenizer limit, stored as 'truncated_text'.
# Both columns are added to `reviews` in place.
reviews['cleaned_text'] = reviews['review_text'].progress_apply(preprocess_text)
reviews['truncated_text'] = reviews['cleaned_text'].progress_apply(truncate_text)

In [None]:
# Drop rows whose 'truncated_text' is missing.
# NOTE(review): `truncate_text` returns the decoded string for every row, so this
# probably removes nothing; empty strings are NOT dropped here - confirm intent.
reviews = reviews.dropna(subset=['truncated_text'])

In [None]:
def save_sentiment_incrementally(reviews_df, sentiment_pipeline, interval=300,
                                 sentiment_file='../Pickle/review_score.pkl'):
    """
    Score reviews one by one and checkpoint the results to a pickle file.

    Results already present in `sentiment_file` are loaded first and their
    review ids are skipped, so an interrupted run can be resumed by calling
    the function again. Rows with a missing 'truncated_text' are ignored.

    Parameters:
    reviews_df (pd.DataFrame): Reviews to process; must contain the columns
        'review_id' and 'truncated_text'.
    sentiment_pipeline (callable): Called with a single text; must return a
        sequence whose first item is a mapping with 'label' and 'score' keys.
    interval (int): Number of newly scored reviews to accumulate between
        checkpoints (default is 300). Must be at least 1.
    sentiment_file (str): Path of the pickle file used both as resume source
        and as output (default '../Pickle/review_score.pkl').

    Returns:
    None: The function saves the processed sentiment analysis results to a pickle file.

    Raises:
    ValueError: If `interval` is smaller than 1.
    """
    if interval < 1:
        # The original modulo test divided by `interval`; fail with a clear message instead.
        raise ValueError("interval must be a positive integer")

    result_columns = ['review_id', 'sentiment', 'confidence']

    if os.path.exists(sentiment_file):
        # NOTE(review): unpickling can execute arbitrary code - only resume from trusted files.
        reviews_with_sentiment = pd.read_pickle(sentiment_file)
    else:
        reviews_with_sentiment = pd.DataFrame(columns=result_columns)

    reviews_df = reviews_df.dropna(subset=['truncated_text']).reset_index(drop=True)
    processed_review_ids = set(reviews_with_sentiment['review_id'].values)

    def flush(saved, pending):
        """Append `pending` rows to `saved`, persist the result and return it."""
        pending_df = pd.DataFrame(pending, columns=result_columns)
        # Skip the concat while nothing has been saved yet: concatenating with
        # an empty object-dtype frame would drag the numeric columns to object.
        combined = pending_df if saved.empty else pd.concat([saved, pending_df], ignore_index=True)
        _write_checkpoint(combined, sentiment_file)
        return combined

    total = len(reviews_df)
    rows = zip(reviews_df['review_id'], reviews_df['truncated_text'])
    new_sentiments = []

    for position, (review_id, review_text) in enumerate(tqdm(rows, total=total, desc="Processing"), start=1):
        if review_id in processed_review_ids:
            continue

        sentiment_result = sentiment_pipeline(review_text)[0]
        new_sentiments.append({
            'review_id': review_id,
            'sentiment': sentiment_result['label'],
            'confidence': sentiment_result['score'],
        })
        # Also guards against duplicate ids inside `reviews_df` itself.
        processed_review_ids.add(review_id)

        if len(new_sentiments) >= interval:
            reviews_with_sentiment = flush(reviews_with_sentiment, new_sentiments)
            new_sentiments = []
            print(f"saved batch {position}/{total}.")

    # Persist whatever is left over from the last, partial batch.
    if new_sentiments:
        flush(reviews_with_sentiment, new_sentiments)


def _write_checkpoint(frame, path):
    """Pickle `frame` to a sibling temp file, then rename it over `path`."""
    directory, filename = os.path.split(path)
    # Prefix rather than suffix the temp name so the file extension is unchanged.
    tmp_path = os.path.join(directory, f"tmp_{filename}")
    frame.to_pickle(tmp_path)
    # The previous checkpoint stays intact until the new file is fully written.
    os.replace(tmp_path, path)



In [28]:
# Load the scored reviews from the same path `save_sentiment_incrementally` writes to.
# NOTE(review): no visible cell calls `save_sentiment_incrementally`, so on a fresh
# kernel this file must already exist on disk - confirm or add the call.
review_sentiment = pd.read_pickle('../Pickle/review_score.pkl')

In [None]:
"""
Adjusts the confidence score in the 'review_sentiment' dataframe based on the sentiment label.

For rows where the sentiment label is 0, the confidence score is inverted (1 - confidence).
Otherwise, the confidence score remains unchanged.

Parameters:
- review_sentiment (pd.DataFrame): A dataframe containing the 'sentiment' and 'confidence' columns.

Returns:
- pd.DataFrame: The dataframe with an updated 'confidence_score' column.
"""


review_sentiment['confidence_score'] = [
    1 - row['confidence'] if row['sentiment'] == 0 else row ['confidence']
    for _,row in review_sentiment.iterrows()]

In [36]:
# Write the frame (now including 'confidence_score') back over the file it was loaded from.
# NOTE(review): this is also the file `save_sentiment_incrementally` resumes from, so
# rows it appends later will lack 'confidence_score' - consider a separate output path.
review_sentiment.to_pickle('../Pickle/review_score.pkl')