## STRUCTURE
- Read Scraped Reviews & Products
    - Create the asin list
    - Create reviews lists, parse by ASIN
    - Create products lists, parse by ASIN
- Run sentiment analysis on reviews
- Save results



In [1]:
import pandas as pd

from dotenv import load_dotenv
import os

# load_dotenv() comes from python-dotenv; it is expected to copy entries from a
# local .env file into the process environment before the token is looked up.
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.environ.get('HUGGINGFACEHUB_API_TOKEN')

# Report only whether the token is present; its value is never printed.
token_status = (
    "HUGGINGFACEHUB_API_TOKEN environment variable not found"
    if HUGGINGFACEHUB_API_TOKEN is None
    else "HUGGINGFACEHUB_API_TOKEN is ready"
)
print(token_status)

HUGGINGFACEHUB_API_TOKEN is ready


In [2]:
# Location of the ASIN list CSV.
# NOTE(review): this is a machine-specific absolute path; the previously used
# relative alternative was './data/external/asin_list.csv'.
asin_list_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/external/asin_list.csv'

# Read the file, then keep its 'asin' column as a plain Python list.
asin_frame = pd.read_csv(asin_list_path)
asin_list = asin_frame['asin'].tolist()

In [4]:
def read_data(folder_path):
    """Load every file in ``folder_path`` whose name starts with "reviews".

    Args:
        folder_path: Directory to scan. Only its direct entries are considered.

    Returns:
        A single DataFrame with the rows of all matching files stacked in
        file-name order. Each file keeps its own row index (no re-indexing).
        An empty DataFrame is returned when no file matches.
    """
    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order of the result stable between runs and machines.
    review_files = sorted(
        name for name in os.listdir(folder_path) if name.startswith("reviews")
    )

    # Read everything first and concatenate once: calling pd.concat inside the
    # loop re-copies all previously loaded rows on every iteration.
    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in review_files]

    # pd.concat raises on an empty list, so keep the "no files" result explicit.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint used for sentiment analysis; the tokenizer and the classifier
# are both loaded from the same pre-trained model name so they stay in sync.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def get_sentiment_probabilities(text):
    """Score ``text`` with the module-level ``tokenizer`` and ``model``.

    Args:
        text: The review text to classify.

    Returns:
        A ``(positive, negative)`` tuple of Python floats. ``positive`` is the
        summed probability of class indices 3 and 4 (4-5 stars) and
        ``negative`` that of indices 0 and 1 (1-2 stars). Index 2 is counted
        in neither, so the two values do not have to sum to 1.
    """
    # Tokenize the text; anything beyond 512 tokens is truncated.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Inference only: disabling gradient tracking avoids building an autograd
    # graph for every review, which saves memory and time without changing
    # the resulting probabilities.
    with torch.no_grad():
        logits = model(**inputs).logits
        probabilities = torch.softmax(logits, dim=-1)

    # Combine probabilities for positive (4-5 stars) and negative (1-2 stars) sentiment
    positive = probabilities[0, 3] + probabilities[0, 4]
    negative = probabilities[0, 0] + probabilities[0, 1]

    return positive.item(), negative.item()


def process_review(row, verbose=True):
    """Compute the sentiment scores for one row of the reviews table.

    Args:
        row: A mapping-like row (e.g. a DataFrame row) with a "review" entry.
        verbose: When True (the default, matching the previous behaviour) the
            review text and its scores are printed. Pass ``verbose=False``,
            e.g. ``reviews.apply(process_review, axis=1, verbose=False)``, to
            avoid printing every review when scoring a large table.

    Returns:
        ``pd.Series([positive, negative])``. Rows whose "review" value is not
        a string (for example a missing value) get the fallback ``[0.5, 0.5]``
        and are not sent to the model.
    """
    review_text = row["review"]
    if verbose:
        print(f"Review text: {review_text}")

    # Check if review_text is a valid string; anything else gets the 0.5/0.5 fallback.
    if not isinstance(review_text, str):
        return pd.Series([0.5, 0.5])

    positive, negative = get_sentiment_probabilities(review_text)
    if verbose:
        print(f"Sentiment allocation - Positive: {positive}, Negative: {negative}")

    return pd.Series([positive, negative])


In [3]:
def read_data_from_filtered_h10_folder(folder_path):
    """Load all files found directly inside ``folder_path`` into one DataFrame.

    Args:
        folder_path: Directory whose files are each read with ``pd.read_csv``.

    Returns:
        A single DataFrame with the rows of all files stacked in file-name
        order. Each file keeps its own row index (no re-indexing). An empty
        DataFrame is returned when the folder holds no readable entries.
    """
    file_paths = []
    # os.listdir() returns names in arbitrary order; sorting keeps the row
    # order of the result stable between runs and machines.
    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)
        # Hidden entries (such as the ".DS_Store" files macOS creates) and
        # sub-directories are not CSV data; pd.read_csv would fail on them.
        if file_name.startswith(".") or not os.path.isfile(file_path):
            continue
        file_paths.append(file_path)

    # Read everything first and concatenate once: calling pd.concat inside the
    # loop re-copies all previously loaded rows on every iteration.
    frames = [pd.read_csv(file_path) for file_path in file_paths]

    # pd.concat raises on an empty list, so keep the "no files" result explicit.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


In [8]:
# Folder holding the review exports to load.
# NOTE(review): machine-specific absolute path.
reviews_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/raw/RaisedGardenBed/h10reviews'

# Every file in the folder is loaded here; read_data(reviews_path) is the
# alternative loader that only picks up files named "reviews*".
reviews = read_data_from_filtered_h10_folder(reviews_path)

In [11]:
# The sentiment step reads the text from a column named "review", while this
# data carries it under "Body". DataFrame.rename ignores labels that are not
# present by default, so no try/except is needed; the former bare `except`
# only hid real problems such as `reviews` not being defined yet.
# Rebinding instead of inplace=True keeps the cell safe to re-run.
reviews = reviews.rename(columns={'Body': 'review'})

In [18]:
# Score each row's "review" text; process_review returns a two-element Series
# per row, so the result is a two-column frame aligned with `reviews`.
sentiment_scores = reviews.apply(process_review, axis=1)

# Store the two score columns under their final names.
reviews[["positive_sentiment", "negative_sentiment"]] = sentiment_scores

Review text: We researched materials for our raised beds for a very long time.  My wife had a vision for her vegetable garden and was incredibly disappointed when we finally decided her first choice material of cedar was going to be just too expensive.  We thought galvanized would be our next option but she really didn't want the look of the steel so we opted for the black.  We saved (literally)hundreds by going with these black galvanized beds and we were both pleasantly surprised and impressed with the entire experience.  While waiting for the order we spent a lot of time watching youtube videos about "mistakes" and the "hassles" of putting together galvanized beds, and honestly, made ourselves anxious for no reason.Putting these beds tother IS time-consuming but NOT hard.  The larger 4X8 beds are easier with two people (just because of the number of screws and bolts you have to use and lining up the longer side pieces) but my wife actually did the 4X4 beds herself in under an hour e

In [19]:
# Destination for the scored reviews.
# NOTE(review): machine-specific absolute path; the previously used relative
# alternative was './data/interim/reviews_with_sentiment.csv'.
save_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/reviews_with_sentiment.csv'

# Make sure the target folder exists first: to_csv does not create missing
# directories, and failing here would throw away the finished scoring run.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# The row index is not written to the file.
reviews.to_csv(save_path, index=False)