In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
# Fetch the NLTK resources used below: the stop-word corpus and the
# 'punkt' tokenizer models.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# English stop words as a set, so membership tests are cheap.
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the reviews DataFrame from a local pickle file.
# NOTE(review): the last cell of this notebook writes back to this same path,
# so re-running the notebook starts from its own previous output.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
reviews = pd.read_pickle('Pickle/reviews.pkl')

In [None]:
# Inspect the frame right after loading.
reviews

In [None]:
# Checkpoint shared by the tokenizer and the classification model.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Load the classifier weights and the matching tokenizer (capped at 512 tokens).
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)

# Bundle both into a ready-to-call sentiment-analysis pipeline.
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    tokenizer=tokenizer,
    model=model,
)


In [None]:
def preprocess_text(text):
    """Strip URLs, non-alphanumeric tokens and English stop words from ``text``.

    The text is tokenized with ``word_tokenize`` after every ``http...`` run
    has been removed. A token is kept only if it is purely alphanumeric and
    its lowercase form is not in ``stop_words``. Kept tokens retain their
    original casing and are joined with single spaces.

    NOTE(review): NLTK's English stop-word list includes negations such as
    "not"; removing them may alter the sentiment of a review - confirm this
    is intended.
    """
    without_urls = re.sub(r'http\S+', '', text)

    kept_tokens = []
    for token in word_tokenize(without_urls):
        if not token.isalnum():
            continue  # punctuation, contraction fragments, etc.
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

In [None]:
def truncate_text(text, max_length=512):
    """Cut ``text`` down to at most ``max_length`` tokens of the model tokenizer.

    The text is encoded with the module-level ``tokenizer`` using truncation,
    then decoded back to a string with special tokens removed.

    Parameters
    ----------
    text : str
        Text to truncate.
    max_length : int, optional
        Maximum number of tokens (special tokens included) to keep.
        Defaults to 512.

    Returns
    -------
    str
        The decoded, possibly shortened text.
    """
    # Padding and tensor conversion are deliberately omitted: padded positions
    # would be dropped again by skip_special_tokens=True, so they only add
    # per-row work without changing the decoded string.
    encoded = tokenizer(text, truncation=True, max_length=max_length)
    return tokenizer.decode(encoded['input_ids'], skip_special_tokens=True)

In [None]:
# Step 1: clean every raw review (progress bar via tqdm's pandas integration).
cleaned_reviews = reviews['review_text'].progress_apply(preprocess_text)
reviews['cleaned_text'] = cleaned_reviews

# Step 2: truncate the cleaned text so it fits the model's token limit.
reviews['truncated_text'] = cleaned_reviews.progress_apply(truncate_text)

In [None]:
# Inspect the truncated text that will be fed to the sentiment pipeline.
reviews['truncated_text']

In [None]:
# Function to get sentiment for each preprocessed and truncated review
def get_review_sentiment(review_text):
    """Classify a single review with the module-level ``sentiment_pipeline``.

    Parameters
    ----------
    review_text : object
        The review to classify. Values that are not ``str`` are skipped.

    Returns
    -------
    tuple
        ``(label, score)`` from the first pipeline result, or ``(None, None)``
        when the input is not a string or the pipeline raises.
    """
    # Non-string values (e.g. missing reviews) cannot be classified.
    if not isinstance(review_text, str):
        return None, None

    try:
        # truncation=True makes the pipeline's own tokenizer enforce the model
        # limit, so an over-long input is shortened instead of raising and
        # being silently recorded as (None, None).
        result = sentiment_pipeline(review_text, truncation=True)[0]
        return result['label'], result['score']
    except Exception as e:
        # Best effort: report the failing review and keep processing the rest.
        print(f"Error processing review_text: {review_text}")
        print(e)
        return None, None

# Classify every truncated review in order, with a tqdm progress bar.
sentiment_results = [
    get_review_sentiment(truncated_review)
    for truncated_review in tqdm(reviews['truncated_text'])
]

# Split the (label, score) pairs into two parallel lists.
sentiments = [pair[0] for pair in sentiment_results]
confidence_scores = [pair[1] for pair in sentiment_results]

# Store both as new columns on the reviews frame.
reviews['sentiment'] = sentiments
reviews['confidence'] = confidence_scores


In [8]:
# Turn (predicted label, confidence in that label) into one score on a
# positive-sentiment scale: negative predictions become 1 - confidence,
# all other rows keep their confidence unchanged.
#
# The sentiment column may hold the pipeline's string labels (what
# get_review_sentiment stores on a fresh run) or a 0/1 encoding (what the
# previously pickled frame contains). Comparing only against 0 would leave
# every string-labelled negative review unflipped, so both are accepted.
# NOTE(review): 'NEGATIVE' is assumed to be the model's negative label name -
# confirm against the checkpoint's id2label mapping.
NEGATIVE_LABELS = [0, 'NEGATIVE']

is_negative = reviews['sentiment'].isin(NEGATIVE_LABELS)
# to_numeric turns any None left by failed classifications into NaN so the
# arithmetic below is well defined.
label_confidence = pd.to_numeric(reviews['confidence'])

# Vectorized replacement for a row-by-row iterrows() pass.
reviews['Confidence Score'] = label_confidence.where(~is_negative, 1 - label_confidence)

In [9]:
# Inspect the frame with the sentiment, confidence and Confidence Score columns.
reviews

Unnamed: 0,review_id,rating,review_text,n_votes,n_comments,user_id,book_id,cleaned_text,sentiment,confidence,Confidence Score
0,5cd416f3efc3f944fce4ce2db2290d5e,5,Mind blowingly cool. Best science fiction I've...,16,0,0,14,Mind blowingly cool Best science fiction read ...,1,0.979499,0.979499
1,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,28,1,0,21,special book started slow first third middle t...,1,0.719050,0.719050
2,5e212a62bced17b4dbe41150e5bb9037,3,I haven't read a fun mystery book in a while a...,6,0,0,22,read fun mystery book sure ever read Poirot lo...,1,0.991836,0.991836
3,fdd13cad0695656be99828cd75d6eb73,4,"Fun, fast paced, and disturbing tale of murder...",22,4,0,23,Fun fast paced disturbing tale murder Great be...,1,0.996752,0.996752
4,bd0df91c9d918c0e433b9ab3a9a5c451,4,A fun book that gives you a sense of living in...,8,0,0,24,fun book gives sense living Paris expat apprec...,1,0.999605,0.999605
...,...,...,...,...,...,...,...,...,...,...,...
999995,d53ddb7d0ee121a47e8d1ed144f04709,0,I GIVE THIS BOOK: 3 1/2 out of 5 stars \n This...,0,0,26619,34363,GIVE BOOK 3 5 stars book gives lot storyline d...,1,0.696703,0.696703
999996,ecfec5fc89629da466d5c49358e55a33,4,I read a .pdf version of this book that was se...,0,0,26619,125749,read version book sent author giveaway drawing...,1,0.957435,0.957435
999997,6afb14b8dc8e42dbef89964bce8586cc,4,"""Every now and then, you have an encounter wit...",0,0,26619,248053,Every encounter someone simply changes life co...,1,0.998328,0.998328
999998,884fb68ca4e27d3a255d37620fc438e5,0,I GIVE THIS BOOK: 4 1/2 out of 5 stars \n This...,0,0,26619,35425,GIVE BOOK 4 5 stars first official book parano...,1,0.939541,0.939541


In [10]:
# Persist the enriched frame.
# NOTE(review): this is the same path the notebook loads from at the top, so
# the input file is overwritten in place - consider a separate output file.
reviews.to_pickle('Pickle/reviews.pkl')