In [2]:
import os
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [3]:
# Download the NLTK resources used by preprocess_text.
# 'punkt' serves older NLTK releases; newer releases load the tokenizer
# models from the separate 'punkt_tab' package, so fetch both to keep
# word_tokenize working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists read by stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\momin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\momin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Location of the fake-reviews CSV. The default is the original author's
# local path; set the FAKE_REVIEWS_CSV environment variable to point at the
# file on another machine without editing the notebook.
DATASET_PATH = os.environ.get(
    "FAKE_REVIEWS_CSV",
    "C:\\Users\\momin\\Desktop\\Product_Review_Authentication\\Sentiment-Analysis-Product-Reviews\\fake reviews dataset.csv",
)
df = pd.read_csv(DATASET_PATH)

In [6]:
# Keep only the rows whose category is 'Electronics_5'.
df = df.loc[df['category'].eq('Electronics_5')]

In [7]:
# Resources shared by every preprocess_text call; filled in on first use so
# the stop-word list is read and the stemmer is built only once.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return (stop_words, stemmer, punctuation_chars), building them on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
        _PREPROCESS_RESOURCES['punctuation'] = frozenset(string.punctuation)
    return (
        _PREPROCESS_RESOURCES['stop_words'],
        _PREPROCESS_RESOURCES['stemmer'],
        _PREPROCESS_RESOURCES['punctuation'],
    )


def preprocess_text(text):
    """Normalize a piece of text into a space-separated string of stemmed tokens.

    Steps, in order: tokenize with NLTK's word_tokenize, lowercase, drop
    punctuation-only tokens, drop English stop words, Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw text. Missing values (None / NaN / pd.NA) yield an empty string;
        any other non-string value is converted with str() first.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # word_tokenize only accepts strings; treat missing cells as empty
        # documents instead of raising in the middle of a DataFrame.apply.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)

    stop_words, stemmer, punctuation = _get_preprocess_resources()

    # Tokenization and conversion to lowercase
    tokens = [word.lower() for word in word_tokenize(text)]
    # Remove punctuation: a token is dropped when every character is
    # punctuation. (A plain `word not in string.punctuation` is a substring
    # test, which lets multi-character tokens such as "..." slip through.)
    tokens = [word for word in tokens if not all(ch in punctuation for ch in word)]
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    # Join tokens back into text
    return ' '.join(tokens)

In [9]:
# Clean every review, then prefix it with its rating so both reach the
# vectorizer as one document per row.
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so a single-digit rating such as "5" or "5.0"
# presumably produces no feature at all. Confirm whether the rating is meant
# to influence the model before changing the feature format.
df['preprocessed_text'] = df['text_'].map(preprocess_text)
df['features'] = df['rating'].astype(str).str.cat(df['preprocessed_text'], sep=' ')

# Split the dataset into training and testing sets (20% held out, fixed seed)
X, y = df['features'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the vocabulary on the training documents only, then apply that same
# vocabulary to the test documents.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train the Naive Bayes classifier on the token counts
naive_bayes_classifier = MultinomialNB().fit(X_train_vectorized, y_train)

# Evaluate the model on the held-out documents
y_pred = naive_bayes_classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8383458646616542
Classification Report:
               precision    recall  f1-score   support

          CG       0.83      0.87      0.85       419
          OR       0.85      0.81      0.83       379

    accuracy                           0.84       798
   macro avg       0.84      0.84      0.84       798
weighted avg       0.84      0.84      0.84       798



In [10]:
import joblib

# Persist the two fitted objects to the current working directory. Both are
# needed together at prediction time: the vectorizer turns text into the
# count matrix the classifier was trained on.
model_file = 'naive_bayes_model.pkl'
vectorizer_file = 'count_vectorizer.pkl'

# Trained classifier
joblib.dump(naive_bayes_classifier, model_file)

# Fitted vectorizer (kept as the final expression so the cell displays the
# list of written files)
joblib.dump(vectorizer, vectorizer_file)

['count_vectorizer.pkl']