In [5]:
#Naive Bayes classifier and a count vectorizer in possitive negative word count feature

import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import re


# Set the file path to your dataset
file_path = r'C:\Users\nh013\Desktop\amazaon dataset\amazon_reviews.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Remove any rows with missing values
df.dropna(inplace=True)

# Remove any duplicate rows
df.drop_duplicates(inplace=True)

# Remove any URLs
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'http\S+', '', x))

# Remove any special characters
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Convert all text to lowercase
df['reviewText'] = df['reviewText'].apply(lambda x: x.lower())

# Remove stop words
stop_words = set(stopwords.words('english'))
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Apply stemming
stemmer = PorterStemmer()
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# Initialize SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Positive and Negative word counts
positive_count = []
negative_count = []

# Perform sentiment analysis using VADER
for index, row in df.iterrows():
    review = row['reviewText']
    sentiment_scores = sia.polarity_scores(review)
    positive_count.append(sum([1 for word in review.split() if word in sia.lexicon and sia.lexicon[word] > 0]))
    negative_count.append(sum([1 for word in review.split() if word in sia.lexicon and sia.lexicon[word] < 0]))

  # Print the preprocessed reviewText column
print(df['reviewText'])


# Add positive and negative word count columns to the DataFrame
df['Positive Word Count'] = positive_count
df['Negative Word Count'] = negative_count

# Create a count vectorizer object
vectorizer = CountVectorizer()

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'], df['Positive Word Count'], test_size=0.2, random_state=42)

# Fit the vectorizer to the training data and transform the training data
X_train_vectorized = vectorizer.fit_transform(X_train)

# Initialize the Naive Bayes classifier
classifier = MultinomialNB()

# Train the classifier
classifier.fit(X_train_vectorized, y_train)

# Transform the test data
X_test_vectorized = vectorizer.transform(X_test)

# Predict the positive word counts for the test data
predicted_counts = classifier.predict(X_test_vectorized)


# Print the predicted positive word counts
print("Predicted Positive Word Counts:", predicted_counts)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


1       purchas devic work advertis never much phone m...
2       work expect sprung higher capac think made bit...
3       think work greathad diff bran 64gb card went s...
4       bought retail packag arriv legit orang envelop...
5       mini storag doesnt anyth els suppos purchas ad...
                              ...                        
4910    bought sandisk 16gb class 10 use htc inspir 3 ...
4911    use extend capabl samsung galaxi note 10 great...
4912    great card fast reliabl come option adapt sd s...
4913           good amount space stuff want fit gopro say
4914    ive heard bad thing 64gb micro sd card crap we...
Name: reviewText, Length: 4913, dtype: object
Predicted Positive Word Counts: [ 1  2  1  1  1  1  1  1  2  2  2  2  1  1  1  2  1  2  1  1  3  1  1  1
  2  1  1  1  1  2  2  2  1  2  1  1  1  1  1  3  1  1  1  2  3  2  1  1
  2  1  2  2  2  1  1  1  2  2  2  1  1  1  2  3  1  1  2  1  1  1  2  2
  2  1 13  2  2  2  2  1  2  1  1  2  2  1  2  1  1  2  2  1  2  

In [4]:
import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import re
from sklearn.metrics import accuracy_score

# Set the file path to your dataset
file_path = r'C:\Users\nh013\Desktop\amazaon dataset\amazon_reviews.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Remove any rows with missing values
df.dropna(inplace=True)

# Remove any duplicate rows
df.drop_duplicates(inplace=True)

# Remove any URLs
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'http\S+', '', x))

# Remove any special characters
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Convert all text to lowercase
df['reviewText'] = df['reviewText'].apply(lambda x: x.lower())

# Remove stop words
stop_words = set(stopwords.words('english'))
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Apply stemming
stemmer = PorterStemmer()
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# Initialize SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Positive and Negative word counts
positive_count = []
negative_count = []

# Perform sentiment analysis using VADER
for index, row in df.iterrows():
    review = row['reviewText']
    sentiment_scores = sia.polarity_scores(review)
    positive_count.append(sum([1 for word in review.split() if word in sia.lexicon and sia.lexicon[word] > 0]))
    negative_count.append(sum([1 for word in review.split() if word in sia.lexicon and sia.lexicon[word] < 0]))

# Print the preprocessed reviewText column
print(df['reviewText'])

# Add positive and negative word count columns to the DataFrame
df['Positive Word Count'] = positive_count
df['Negative Word Count'] = negative_count

# Create a count vectorizer object
vectorizer = CountVectorizer()

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'], df['Positive Word Count'], test_size=0.2, random_state=42)

# Fit the vectorizer to the training data and transform the training data
X_train_vectorized = vectorizer.fit_transform(X_train)

# Initialize the Naive Bayes classifier
classifier = MultinomialNB()

# Train the classifier
classifier.fit(X_train_vectorized, y_train)

# Transform the test data
X_test_vectorized = vectorizer.transform(X_test)

# Predict the positive word counts for the test data
predicted_counts = classifier.predict(X_test_vectorized)

# Calculate accuracy score of the model
accuracy = accuracy_score(y_test, predicted_counts)
print("Accuracy of Naive Bayes model:", accuracy)

# Print the predicted positive word counts
print("Predicted Positive Word Counts:", predicted_counts)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


1       purchas devic work advertis never much phone m...
2       work expect sprung higher capac think made bit...
3       think work greathad diff bran 64gb card went s...
4       bought retail packag arriv legit orang envelop...
5       mini storag doesnt anyth els suppos purchas ad...
                              ...                        
4910    bought sandisk 16gb class 10 use htc inspir 3 ...
4911    use extend capabl samsung galaxi note 10 great...
4912    great card fast reliabl come option adapt sd s...
4913           good amount space stuff want fit gopro say
4914    ive heard bad thing 64gb micro sd card crap we...
Name: reviewText, Length: 4913, dtype: object
Accuracy of Naive Bayes model: 0.31943031536113936
Predicted Positive Word Counts: [ 1  2  1  1  1  1  1  1  2  2  2  2  1  1  1  2  1  2  1  1  3  1  1  1
  2  1  1  1  1  2  2  2  1  2  1  1  1  1  1  3  1  1  1  2  3  2  1  1
  2  1  2  2  2  1  1  1  2  2  2  1  1  1  2  3  1  1  2  1  1  1  2  2
  2  1 13  2  

In [6]:
#unsing naive bays model in this N-gram feature and sentiment polarity to predict overall(rating or score)

import pandas as pd
import nltk
from nltk.util import ngrams
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import re

# Set the file path to your dataset
file_path = r'C:\Users\nh013\Desktop\amazaon dataset\amazon_reviews.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Remove any rows with missing values
df.dropna(inplace=True)

# Remove any duplicate rows
df.drop_duplicates(inplace=True)

# Remove any URLs
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'http\S+', '', x))

# Remove any special characters
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Convert all text to lowercase
df['reviewText'] = df['reviewText'].apply(lambda x: x.lower())

# Remove stop words
stop_words = set(stopwords.words('english'))
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Apply stemming
stemmer = PorterStemmer()
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# Perform sentiment analysis using VADER
sia = SentimentIntensityAnalyzer()

# Function to generate N-grams
def generate_ngrams(text, n):
    tokenized_text = word_tokenize(text)
    ngrams_list = list(ngrams(tokenized_text, n))
    return [' '.join(grams) for grams in ngrams_list]

# Add N-grams and sentiment polarity as new columns
n = 2  # Set n as desired (e.g., n=2 for bigrams)
df['ngrams'] = df['reviewText'].apply(lambda x: generate_ngrams(x, n=n))
df['sentiment_polarity'] = df['reviewText'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Prepare data for Naive Bayes classification
X = df['reviewText']
y = df['overall']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Naive Bayes model
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = nb.predict(X_test)

# Evaluate the model's accuracy and performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.8097660223804679
Classification Report:
               precision    recall  f1-score   support

         1.0       0.72      0.43      0.53        54
         2.0       0.00      0.00      0.00        14
         3.0       0.00      0.00      0.00        25
         4.0       0.31      0.05      0.08       110
         5.0       0.83      0.98      0.90       780

    accuracy                           0.81       983
   macro avg       0.37      0.29      0.30       983
weighted avg       0.73      0.81      0.75       983



In [9]:
# using XGBOOST model to predict overall rating in sentiment polarity and N-grams feature

import pandas as pd
import nltk
from nltk.util import ngrams
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

# Set the file path to your dataset
file_path = r'C:\Users\nh013\Desktop\amazaon dataset\amazon_reviews.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Remove any rows with missing values
df.dropna(inplace=True)

# Remove any duplicate rows
df.drop_duplicates(inplace=True)

# Remove any URLs
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'http\S+', '', x))

# Remove any special characters
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Convert all text to lowercase
df['reviewText'] = df['reviewText'].apply(lambda x: x.lower())

# Remove stop words
stop_words = set(stopwords.words('english'))
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Apply stemming
stemmer = PorterStemmer()
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# Perform sentiment analysis using VADER
sia = SentimentIntensityAnalyzer()

# Function to generate N-grams
def generate_ngrams(text, n):
    tokenized_text = word_tokenize(text)
    ngrams_list = list(ngrams(tokenized_text, n))
    return [' '.join(grams) for grams in ngrams_list]

# Add N-grams and sentiment polarity as new columns
n = 2  # Set n as desired (e.g., n=2 for bigrams)
df['ngrams'] = df['reviewText'].apply(lambda x: generate_ngrams(x, n=n))
df['sentiment_polarity'] = df['reviewText'].apply(lambda x: sia.polarity_scores(x)['compound'])

# Define features and target variable
x = df['reviewText']
y = df['overall']

# Map target variable values to start from 0
y_mapped = y - 1

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y_mapped.astype(int), test_size=0.2, random_state=42)

# Create a count vectorizer object
vectorizer = CountVectorizer()

# Fit and transform the training data
x_train_vectorized = vectorizer.fit_transform(x_train)

# Transform the test data
x_test_vectorized = vectorizer.transform(x_test)

# Initialize the XGBoost classifier
classifier = xgb.XGBClassifier()

# Train the classifier
classifier.fit(x_train_vectorized, y_train)

# Predict the overall ratings for the test data
predicted_ratings = classifier.predict(x_test_vectorized)

# Calculate accuracy score of the model
accuracy = accuracy_score(y_test, predicted_ratings)
print("Accuracy of XGBoost model:", accuracy)


# Map the predicted ratings back to the original scale
predicted_ratings_mapped = predicted_ratings + 1

# Print the predicted overall ratings
print("Predicted Overall Ratings:", predicted_ratings_mapped)

Accuracy of XGBoost model: 0.8107833163784334
Predicted Overall Ratings: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5
 4 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 4 5 5 5 5 5 5 5 5 5 4 1 5 4 4 5 5 5
 5 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 1 4 5 5 5 3 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 4 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 5 5 5 5 5 4
 5 5 5 5 1 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 1 5 5 5 5 5 5 5 5 5 5 