In [3]:
#unsing naive bays model in this N-gram feature and sentiment polarity to predict overall(rating or score)

import pandas as pd
import nltk
from nltk.util import ngrams
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import re



#DATASET

file_path = r'C:\Users\nh013\Desktop\amazaon dataset\amazon_reviews.csv'

df = pd.read_csv(file_path)

# DROP ROWS WITH MISSING VALUES
df.dropna(inplace=True)

# REMOVE DUBLICATE ROWS
df.drop_duplicates(inplace=True)

# REMOVE ANY URL
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'http\S+', '', x))

# REMOVE SPECIAL CHARACTERS
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# CONVERT ALL TEXT TO LOWERCASE
df['reviewText'] = df['reviewText'].apply(lambda x: x.lower())

# REMOVE STOP WORDS
stop_words = set(stopwords.words('english'))
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# STEMMING
stemmer = PorterStemmer()
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# SENTIMENT ANALYSIS USING VADER
sia = SentimentIntensityAnalyzer()

# BUILD FUNCTION TO GENERATE N GRMAS

def generate_ngrams(text, n):
    tokenized_text = word_tokenize(text)
    ngrams_list = list(ngrams(tokenized_text, n))
    return [' '.join(grams) for grams in ngrams_list]


# N-GRAMS AS A NEW COLUMN
n = 2  #  (EXAMPLE n=2 FOR BIGRAMS)
df['ngrams'] = df['reviewText'].apply(lambda x: generate_ngrams(x, n=n))
df['sentiment_polarity'] = df['reviewText'].apply(lambda x: sia.polarity_scores(x)['compound'])

# SPLIT DATA
X = df['reviewText']
y = df['overall']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TRAIN MODEL
nb = MultinomialNB()
nb.fit(X_train, y_train)

# PREDICTION
y_pred = nb.predict(X_test)

# EVALUATE THE MODEL ACCURACY AND PERFORMENCE
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8097660223804679
Classification Report:
               precision    recall  f1-score   support

         1.0       0.72      0.43      0.53        54
         2.0       0.00      0.00      0.00        14
         3.0       0.00      0.00      0.00        25
         4.0       0.31      0.05      0.08       110
         5.0       0.83      0.98      0.90       780

    accuracy                           0.81       983
   macro avg       0.37      0.29      0.30       983
weighted avg       0.73      0.81      0.75       983



In [4]:
# using XGBOOST model to predict overall rating in sentiment polarity and N-grams feature

import pandas as pd
import nltk
from nltk.util import ngrams
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import re
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score


#DATASET

file_path = r'C:\Users\nh013\Desktop\amazaon dataset\amazon_reviews.csv'

df = pd.read_csv(file_path)

# DROP ROWS WITH MISSING VALUES
df.dropna(inplace=True)

# REMOVE DUBLICATE ROWS
df.drop_duplicates(inplace=True)

# REMOVE ANY URL
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'http\S+', '', x))

# REMOVE SPECIAL CHARACTERS
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# CONVERT ALL TEXT TO LOWERCASE
df['reviewText'] = df['reviewText'].apply(lambda x: x.lower())

# REMOVE STOP WORDS
stop_words = set(stopwords.words('english'))
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# STEMMING
stemmer = PorterStemmer()
df['reviewText'] = df['reviewText'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

# SENTIMENT ANALYSIS USING VADER
sia = SentimentIntensityAnalyzer()

# BUILD FUNCTION TO GENERATE N GRMAS

def generate_ngrams(text, n):
    tokenized_text = word_tokenize(text)
    ngrams_list = list(ngrams(tokenized_text, n))
    return [' '.join(grams) for grams in ngrams_list]


# N-GRAMS AS A NEW COLUMN
n = 2  #  (EXAMPLE n=2 FOR BIGRAMS)
df['ngrams'] = df['reviewText'].apply(lambda x: generate_ngrams(x, n=n))
df['sentiment_polarity'] = df['reviewText'].apply(lambda x: sia.polarity_scores(x)['compound'])

# SPLIT DATA
x = df['reviewText']
y = df['overall']

# MAP TARGET VARIABLES VALUES START FROM 0
y_mapped = y - 1


x_train, x_test, y_train, y_test = train_test_split(x, y_mapped.astype(int), test_size=0.2, random_state=42)


# COUNT VECTORIZE OBJECT

vectorizer = CountVectorizer()

# FIT THE MODEL
x_train_vectorized = vectorizer.fit_transform(x_train)

# TTRANSFORM THE TEST DATA
x_test_vectorized = vectorizer.transform(x_test)

# XGBOOST CLASSIFIER
classifier = xgb.XGBClassifier()

# TRAIN  MODEL
classifier.fit(x_train_vectorized, y_train)

# PREDICT OVERALL RATING ON TEST DATA
predicted_ratings = classifier.predict(x_test_vectorized)

# ACCURACY SCORE
accuracy = accuracy_score(y_test, predicted_ratings)
print("Accuracy of XGBoost model:", accuracy)


# MAP THE PREDICTED RATING BACK TO THE ORGINAL FORM
predicted_ratings_mapped = predicted_ratings + 1


print("Predicted Overall Ratings:", predicted_ratings_mapped)


Accuracy of XGBoost model: 0.8107833163784334
Predicted Overall Ratings: [5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5
 4 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 4 5 5 5 5 5 5 5 5 5 4 1 5 4 4 5 5 5
 5 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 1 4 5 5 5 3 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 4 5
 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 5 5 5 5 4 5 5 5 5 5 5 5 5 5 5 4
 5 5 5 5 1 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
 5 5 1 5 5 5 5 5 5 5 5 5 5 