In [11]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
# Fetch the NLTK resources used below: the stop-word lists and the Punkt tokenizer models.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Keep the English stop words in a set so membership checks are cheap.
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# Load the reviews table from a pickle file addressed relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
reviews = pd.read_pickle('Pickle/reviews.pkl')

In [13]:
# Display the loaded frame to check its columns and size.
reviews

Unnamed: 0,review_id,rating,review_text,n_votes,n_comments,user_id,book_id
0,5cd416f3efc3f944fce4ce2db2290d5e,5,Mind blowingly cool. Best science fiction I've...,16,0,0,14
1,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,28,1,0,21
2,5e212a62bced17b4dbe41150e5bb9037,3,I haven't read a fun mystery book in a while a...,6,0,0,22
3,fdd13cad0695656be99828cd75d6eb73,4,"Fun, fast paced, and disturbing tale of murder...",22,4,0,23
4,bd0df91c9d918c0e433b9ab3a9a5c451,4,A fun book that gives you a sense of living in...,8,0,0,24
...,...,...,...,...,...,...,...
999995,d53ddb7d0ee121a47e8d1ed144f04709,0,I GIVE THIS BOOK: 3 1/2 out of 5 stars \n This...,0,0,26619,34363
999996,ecfec5fc89629da466d5c49358e55a33,4,I read a .pdf version of this book that was se...,0,0,26619,125749
999997,6afb14b8dc8e42dbef89964bce8586cc,4,"""Every now and then, you have an encounter wit...",0,0,26619,248053
999998,884fb68ca4e27d3a255d37620fc438e5,0,I GIVE THIS BOOK: 4 1/2 out of 5 stars \n This...,0,0,26619,35425


In [14]:
# Tokenizer and model: DistilBERT checkpoint fine-tuned for SST-2 sentiment classification.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
# model_max_length=512 sets the limit the tokenizer applies when truncation is requested.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Run inference on the first GPU when one is available; -1 keeps the pipeline on CPU,
# which matches the previous behaviour on machines without CUDA.
device = 0 if torch.cuda.is_available() else -1
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)


In [15]:
def preprocess_text(text):
    """Clean a raw review before it is tokenized for the sentiment model.

    Removes URLs, splits the text with NLTK's ``word_tokenize``, and keeps only
    alphanumeric tokens whose lowercase form is not in ``stop_words``. The
    casing of the kept tokens is left unchanged.

    Parameters
    ----------
    text : str
        Raw review text. Non-string values (e.g. ``None`` or ``NaN`` for a
        missing review) are treated as empty.

    Returns
    -------
    str
        The kept tokens joined by single spaces; ``''`` when no token survives
        or when the input is not a string.
    """
    # Missing reviews reach this function as None/NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    text = re.sub(r'http\S+', '', text)
    words = word_tokenize(text)
    # NOTE(review): this filter also drops negations (e.g. "not" when it is in stop_words,
    # and "n't", which is not alphanumeric), which can change the sentiment the model
    # sees — confirm this is intended.
    filtered_words = [word for word in words if word.isalnum() and word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [16]:
def truncate_text(text, max_length=512):
    """Cut ``text`` down to what fits in ``max_length`` tokenizer tokens.

    The text is encoded with the module-level ``tokenizer`` (truncating at
    ``max_length`` tokens, special tokens included) and decoded back to a
    string with special tokens removed. The result is the tokenizer's
    normalized form of the text, so it may differ from the input in casing
    and spacing.

    Parameters
    ----------
    text : str
        Text to truncate. Non-string values are treated as empty.
    max_length : int, optional
        Maximum number of tokens to keep. Defaults to 512.

    Returns
    -------
    str
        The decoded, truncated text; ``''`` when the input is not a string.
    """
    if not isinstance(text, str):
        return ''

    # No padding and no tensor conversion: padding tokens would be skipped again on
    # decode, so producing them (and a torch tensor) for every review is wasted work.
    encoded = tokenizer(text, truncation=True, max_length=max_length)
    return tokenizer.decode(encoded['input_ids'], skip_special_tokens=True)

In [17]:
# Run the two text-preparation passes in order, each with a tqdm progress bar:
# raw review -> cleaned text -> tokenizer-truncated text.
for source_column, target_column, transform in (
    ('review_text', 'cleaned_text', preprocess_text),
    ('cleaned_text', 'truncated_text', truncate_text),
):
    reviews[target_column] = reviews[source_column].progress_apply(transform)

  2%|▏         | 17556/1000000 [00:10<09:46, 1675.47it/s]


KeyboardInterrupt: 

In [None]:
# Inspect the truncated text column that is fed to the sentiment model.
# NOTE(review): the saved output reports 500000 rows, while the progress bar of the
# preceding cell counts 1000000 — this output looks like it comes from an earlier run
# on a different frame; re-run to confirm.
reviews['truncated_text']

0         mind blowingly cool best science fiction read ...
2         read fun mystery book sure ever read poirot lo...
6         giving high rating heard organizer long founda...
7         decided give eating processed sugar month janu...
9         kevin kelly wired lays technological trends in...
                                ...                        
999991    admit quite understand stories like book found...
999992    author weaved tale romance daily family life w...
999997    every encounter someone simply changes life co...
999998    give book 4 5 stars first official book parano...
999999    easy tidy read sometimes wonder grew different...
Name: truncated_text, Length: 500000, dtype: object

In [10]:
# Function to get sentiment for each preprocessed and truncated review
def get_review_sentiment(review_text):
    """Classify one review with the module-level ``sentiment_pipeline``.

    Parameters
    ----------
    review_text : str
        Preprocessed review text. Non-string values and blank strings are
        skipped rather than sent to the model.

    Returns
    -------
    tuple
        ``(label, score)`` taken from the first pipeline result, or
        ``(None, None)`` when the input is skipped or the pipeline raises.
    """
    # Non-strings (None/NaN) and blank strings carry no text to classify; scoring an
    # empty string would only produce an arbitrary label.
    if not isinstance(review_text, str) or not review_text.strip():
        return None, None

    try:
        # truncation=True guards against inputs that tokenize past the model's maximum
        # length, instead of relying on the text having been shortened beforehand.
        result = sentiment_pipeline(review_text, truncation=True)[0]
        return result['label'], result['score']
    except Exception as e:
        # Best effort: report the failing review and keep going with the rest.
        print(f"Error processing review_text: {review_text}")
        print(e)
        return None, None

# Score every truncated review (tqdm shows progress), collecting one
# (label, score) pair per row in the order of the column.
review_results = [get_review_sentiment(text) for text in tqdm(reviews['truncated_text'])]

# Split the pairs into two parallel lists and attach them as new columns.
sentiments = [label for label, _ in review_results]
confidence_scores = [score for _, score in review_results]

reviews['sentiment'] = sentiments
reviews['confidence'] = confidence_scores


  2%|▏         | 8582/500000 [09:38<9:12:33, 14.82it/s] 


KeyboardInterrupt: 

In [None]:
# Display the frame with the added sentiment and confidence columns.
# NOTE(review): the saved output includes an `embeddings` column that none of the cells
# shown here creates — presumably left over from an earlier session; verify before
# relying on it.
reviews

Unnamed: 0,review_id,rating,review_text,n_votes,n_comments,user_id,book_id,embeddings,cleaned_text,truncated_text,sentiment,confidence
292265,a5b74bfc7d877e0b6bbbdc7a6caaf19e,4,"I only read ""Dead Sleep"" out of this 3 pack. F...",0,0,7587,662027,"[-0.0054971282, -0.058966663, -0.07498373, 0.0...",read Dead Sleep 3 pack reason particular title...,read dead sleep 3 pack reason particular title...,NEGATIVE,0.866768
247357,2824edc9861b0728abd34e39931d9863,5,"A rich, hilarious commentary on American life ...",0,0,6378,18789,"[-0.043407183, -0.021091254, -0.064387016, 0.0...",rich hilarious commentary American life time,rich hilarious commentary american life time,POSITIVE,0.999865
934030,7e956964da7e818b93b5bdbc371b8e0b,4,This book connected with me in several ways. A...,0,0,24481,46486,"[-0.00627925, -0.0045137024, -0.038882494, 0.0...",book connected several ways academician dazzle...,book connected several ways academician dazzle...,POSITIVE,0.547672
983180,7d6ad0c2285ac67da795a28ae3bfdd9b,2,"Review from Backchatting Books \n ""By Proxy"" i...",0,0,26062,11563,"[-0.07532537, -0.048045207, 0.019663217, 0.091...",Review Backchatting Books Proxy debut novel Ka...,review backchatting books proxy debut novel ka...,NEGATIVE,0.993422
859448,0b5d62ecdd7bef683a9fda60271ac5da,4,Invasion of Privacy is a collection of short s...,1,0,22534,864957,"[-0.01699472, -0.026205948, -0.01679936, -0.02...",Invasion Privacy collection short stories stor...,invasion privacy collection short stories stor...,POSITIVE,0.993304
...,...,...,...,...,...,...,...,...,...,...,...,...
682789,eea0a83ebf2e08c55b8f768528903084,4,Excellent read! Kaitlin and Beckett have alway...,0,0,17844,64761,"[-0.0140506495, 0.019863551, -0.032689232, 0.0...",Excellent read Kaitlin Beckett always verbal s...,excellent read kaitlin beckett always verbal s...,POSITIVE,0.997185
541785,aa48c575c42560a9efe8b61d26d6da5e,3,The Pact is definitely a love story. It tells ...,0,0,14255,270087,"[-0.020619314, 0.02975724, 0.006408088, -0.018...",Pact definitely love story tells compelling ta...,pact definitely love story tells compelling ta...,POSITIVE,0.901370
309356,dc9976808ddeb0d2404d1466f5fc9443,3,So I do these kind of notes/stream of consciou...,0,0,8065,45577,"[-0.12945415, -0.056942053, -0.05114944, 0.003...",kind consciousness reviews usually hope mind M...,kind consciousness reviews usually hope mind m...,NEGATIVE,0.971502
242581,ff8d3d55345351d83eb3af764737924b,3,"3+ stars! That was fun. Pretty over the top, b...",0,0,6193,103537,"[-0.031406842, -0.03604886, -0.0071208486, 0.0...",stars fun Pretty top fun nonetheless,stars fun pretty top fun nonetheless,POSITIVE,0.999871
