## STRUCTURE
- Read Scraped Reviews & Products
    - Create the asin list
    - Create reviews lists, parse by ASIN
    - Create products lists, parse by ASIN
- Run sentiment analysis on reviews
- Save results



In [1]:
import pandas as pd

from dotenv import load_dotenv
import os

load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

if os.getenv("HUGGINGFACEHUB_API_TOKEN") is not None:
    print ("HUGGINGFACEHUB_API_TOKEN is ready")
else:
    print ("HUGGINGFACEHUB_API_TOKEN environment variable not found")

OPENAI_API_KEY is ready


In [None]:
asin_list_path = './data/external/asin_list.csv'
asin_list = pd.read_csv(asin_list_path)['asin'].tolist()
# ASINS = asin_list

In [4]:
def read_data(folder_path):
    reviews = pd.DataFrame()
    products = pd.DataFrame()
    asins = pd.DataFrame()
    
    for file_name in os.listdir(folder_path):
        if file_name.startswith("reviews"):
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path)
            reviews = pd.concat([reviews, df])
        elif file_name.startswith("asin"):
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path)
            products = pd.concat([products, df])
        elif file_name.startswith("products"):
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path)
            asins = pd.concat([asins, df])
    
    return reviews, products, asins


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the pre-trained BERT model for sentiment analysis
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def get_sentiment_probabilities(text):
    inputs = tokenizer.encode_plus(text, return_tensors="pt")
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=-1)
    
    # Combine probabilities for positive (4-5 stars) and negative (1-2 stars) sentiment
    positive = probabilities[0, 3] + probabilities[0, 4]
    negative = probabilities[0, 0] + probabilities[0, 1]

    return positive.item(), negative.item()

# Assuming your DataFrame is named "reviews"
def process_review(row):
    review_text = row["review"]
    print(f"Review text: {review_text}")

    # Check if review_text is a valid string
    if not isinstance(review_text, str):
        return pd.Series([0.5, 0.5])

    positive, negative = get_sentiment_probabilities(review_text)
    print(f"Sentiment allocation - Positive: {positive}, Negative: {negative}")
    
    return pd.Series([positive, negative])

In [8]:
reviews, products, asins = read_data("/Users/vladbordei/Documents/Development/OaieAmazoniana/data/MagneticBeadsBoardFreestyle")
# Apply the sentiment analysis to the "review" column
reviews[["positive_sentiment", "negative_sentiment"]] = reviews.apply(process_review, axis=1)

Review text: My granddaughter who is 2 1/2 loves this product. The minute she opened it. She started playing with that. Perfect for the car and long airplane trips.
Sentiment allocation - Positive: 0.9696183800697327, Negative: 0.009823201224207878
Review text: It is a nice car toy, non messy, good for any kid over 5,a bit loud and rattly at first, but worth it.
Sentiment allocation - Positive: 0.856742799282074, Negative: 0.0038723137695342302
Review text: Gift for our grandson
Sentiment allocation - Positive: 0.901494026184082, Negative: 0.022252976894378662
Review text: This product is great however can get annoying after time.
Sentiment allocation - Positive: 0.5012603402137756, Negative: 0.06428365409374237
Review text: Needed something for my grandson while he was recovering from surgery for an unknown amount of time at the time. I was trying to find something that he could play with that wouldn't get lost or go all over the place while he could remain fairly stationary while he 

KeyboardInterrupt: 

In [None]:
products.to_csv("products.csv", index=False)
asins.to_csv("asins.csv", index=False)
reviews.to_csv('reviews_with_sentiment.csv', index=False)