In [8]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [9]:
# Fetch the NLTK resources this notebook relies on:
#   - 'punkt' / 'punkt_tab': tokenizer models used by nltk.word_tokenize
#     (newer NLTK releases load 'punkt_tab' instead of the pickled 'punkt',
#     so both are requested to work across versions)
#   - 'vader_lexicon': lexicon used by SentimentIntensityAnalyzer
# nltk.download is a no-op for packages that are already up to date.
for nltk_resource in ('punkt', 'punkt_tab', 'vader_lexicon'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/muhammadusman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/muhammadusman/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [10]:
# Load the product table that every later cell works on.
df = pd.read_csv(filepath_or_buffer="amazon_products.csv")

In [11]:
# Keep only the rows whose 'Reviews' value is present; the tokenizer
# further down cannot handle missing entries.
df = df.dropna(axis="index", subset=["Reviews"])

In [12]:
# Split each review into word/punctuation tokens with NLTK's tokenizer.
# The result is a Series of token lists that shares df's index.
tokenized_reviews = df['Reviews'].apply(
    lambda review_text: nltk.word_tokenize(review_text)
)

In [13]:
# Create the VADER analyzer once; it is reused for every token scored below.
# Uses the class imported at the top of the notebook.
sia = SentimentIntensityAnalyzer()

In [14]:
# Score every token of every review with VADER and average the per-token
# 'pos' / 'neg' / 'neu' values over the review.
# `sentiment_scores` ends up with one (pos, neg, neu) tuple per review, in the
# same order as `tokenized_reviews`.
#
# NOTE(review): VADER is designed to score whole sentences (it looks at
# negation, intensifiers, punctuation); scoring isolated tokens discards that
# context. Consider scoring the full review text instead - left unchanged here
# so the existing features keep their meaning.
sentiment_scores = []
for review_tokens in tokenized_reviews:
    total_tokens = len(review_tokens)
    if total_tokens == 0:
        # A review that tokenizes to nothing (e.g. whitespace only) would
        # otherwise divide by zero; record an all-zero score for it instead.
        sentiment_scores.append((0.0, 0.0, 0.0))
        continue

    token_scores = [sia.polarity_scores(token) for token in review_tokens]
    pos_score = sum(score['pos'] for score in token_scores) / total_tokens
    neg_score = sum(score['neg'] for score in token_scores) / total_tokens
    neu_score = sum(score['neu'] for score in token_scores) / total_tokens

    sentiment_scores.append((pos_score, neg_score, neu_score))


In [15]:
# Build the model inputs:
#   X - the (pos, neg, neu) score tuples computed per review
#   y - a sentiment class derived from the 'Rating' column
def _rating_to_label(rating):
    """Map a numeric rating to 'Positive' (>= 4), 'Neutral' (>= 3) or 'Negative'.

    Thresholds are ranges rather than an exact `== 3` check so that fractional
    ratings such as 3.5 fall into 'Neutral' instead of 'Negative'.
    """
    if rating >= 4:
        return 'Positive'
    if rating >= 3:
        return 'Neutral'
    return 'Negative'


# One score tuple is expected per dataframe row; fail loudly if they drifted apart.
assert len(sentiment_scores) == len(df), "sentiment_scores is out of sync with df"

# A missing rating carries no label. Leave those rows out of X and y rather
# than letting NaN fall through the comparisons and count as 'Negative'.
has_rating = df['Rating'].notna()
X = [scores for scores, keep in zip(sentiment_scores, has_rating) if keep]
y = df.loc[has_rating, 'Rating'].apply(_rating_to_label)


In [16]:
# Hold out 20% of the samples for evaluation. The fixed random_state makes
# the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# --- Rule-based sentiment label for every review -------------------------
# This step only needs `sentiment_scores`, not the classifier, so it runs
# first and unconditionally: the CSV is written even when the classifier
# below cannot be trained (e.g. only one rating class in the data).
#
# Only the positive and negative shares are compared. The neutral share is
# deliberately ignored: most tokens carry no sentiment, so 'neu' is almost
# always the largest value and would label nearly every review "Neutral".
sentiments = [
    "Positive" if pos_score > neg_score
    else "Negative" if neg_score > pos_score
    else "Neutral"
    for pos_score, neg_score, _neu_score in sentiment_scores
]

# Add a new column with sentiment analysis results to the dataframe
df['Sentiment'] = sentiments

# Save the updated dataframe to a new CSV file
df.to_csv("amazon_products_with_sentiment.csv", index=False)

# --- Supervised model: predict the rating class from the VADER scores ----
# Logistic regression needs at least two classes to fit, so check first.
unique_classes = y.unique()

if len(unique_classes) == 1:
    print("Only one class present in the data:", unique_classes[0])
else:
    # Errors are intentionally not caught here: a failure should surface with
    # its full traceback instead of being reduced to a one-line message.

    # Split data into train and test sets (same seed as the earlier split)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a Logistic Regression classifier
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train, y_train)

    # Predict on the test set
    y_pred = classifier.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, y_pred))

Only one class present in the data: Positive
