In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv('Final Amazon Products Data.csv')

# Handling missing values
df['Reviews'].fillna('', inplace=True)

# Preprocessing
# Assume 'Rating' is numerical, so no additional preprocessing is needed

# Tokenization and feature extraction using TF-IDF for the 'Reviews'
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X_text = tfidf_vectorizer.fit_transform(df['Reviews'])

# Include 'Rating' as a feature
X_rating = df['Rating'].values.reshape(-1, 1)

# Concatenate the features
X = pd.concat([pd.DataFrame(X_text.toarray()), pd.DataFrame(X_rating)], axis=1)

# Define thresholds for positive, neutral, and negative ratings
positive_threshold = 4
negative_threshold = 2

# Convert 'Rating' column to categorical based on thresholds
df['Sentiment'] = pd.cut(df['Rating'], bins=[-float('inf'), negative_threshold, positive_threshold, float('inf')], labels=['Negative', 'Neutral', 'Positive'])

# Target variable
y = df['Sentiment']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the model (Logistic Regression example)
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

# Predicting sentiment for the entire dataset based on 'Rating' column
df['Predicted_Sentiment'] = classifier.predict(X)

# Assessing accuracy
accuracy = accuracy_score(y_test, classifier.predict(X_test))
print(f"Accuracy: {accuracy}")

# Save the updated DataFrame to a new CSV file
df.to_csv('amazon_products_with_sentiment.csv', index=False)


Accuracy: 0.9807692307692307
