In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
import dill
import re
warnings.filterwarnings("ignore")

sns.set_theme(style="whitegrid")
plt.style.use("Solarize_Light2")

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# ! py -3.11 -m pip install streamlit

# Problem Statement

* The goal is to perform sentiment analysis on the reviews.

* Instead of directly predicting ratings (1–5), we map ratings into 3 sentiment categories:

#### 1. Ratings 1–2 → Negative 

#### 2. Rating 3 → Neutral 

#### 3. Ratings 4–5 → Positive 

* This converts the task into a 3-class text classification problem.



# Input Features

#### Title (short text)

#### Body (detailed review text)

#### These will be combined or individually used as text input to the NLP model.

# Target Feature

* The target variable is Sentiment, which is derived from the Rating.

## Mapping logic :

#### 1 or 2 → Negative

#### 3 → Neutral

#### 4 or 5 → Positive

# Loading Dataset from Excel into Pandas DataFrame

In [None]:
excel_sheets = pd.ExcelFile("dataset -P582.xlsx")

In [None]:
excel_sheets.sheet_names

In [None]:
df = pd.read_excel("dataset -P582.xlsx", sheet_name="Sheet1")

In [None]:
df.head(5)

In [None]:
df.shape # There are 1440 rows and 3 columns

# Deriving the Sentiment Feature from Ratings Feature
* We derived a new column sentiment from the original rating column at the beginning of the workflow. This ensures that the dataset directly reflects the 3-class sentiment classification problem (Negative, Neutral, Positive), which is the actual objective of this project.

In [None]:
def map_sentiment(rating):
    """Map a 1-5 star rating to one of three sentiment labels.

    Parameters
    ----------
    rating : int or float
        Star rating attached to a review.

    Returns
    -------
    str or None
        "Negative" for 1-2, "Neutral" for 3, "Positive" for 4-5.
        ``None`` for anything else (missing/NaN or out-of-range values), so
        bad rows show up as nulls instead of being silently labelled
        "Positive".
    """
    if rating in (1, 2):
        return "Negative"
    if rating == 3:
        return "Neutral"
    if rating in (4, 5):
        return "Positive"
    # NaN compares unequal to every number, so missing ratings also end up here.
    return None

df["sentiment"] = df["rating"].apply(map_sentiment)

In [None]:
df = df.drop("rating", axis=1)   # Dropped the rating column: the sentiment column derived from it replaces it

# Combining Title and Body for Text Analysis
* We combined the title and body into a single review column since both convey the same review. This reduces extra preprocessing and gives the model full context, often improving accuracy.

In [None]:
# String concatenation propagates NaN: a review with a missing title OR body
# would become NaN as a whole. Treat missing parts as empty text, then trim
# the separator that is left over when one side is empty.
df["review"] = (
    df["title"].fillna("").astype(str) + " " + df["body"].fillna("").astype(str)
).str.strip()

In [None]:
df = df.drop(columns=["title","body"],axis = 1) # Dropped title and body: they are now combined into the new review column

# Exploratory Data Analysis (EDA)

# Step 1: Initial Exploration 

In [None]:
df.head(5)

In [None]:
df.tail(5)

In [None]:
df.shape  # There are 1440 rows and 2 columns

In [None]:
df.info()

In [None]:
df.dtypes

In [None]:
df.columns

In [None]:
df.index

In [None]:
print(f"In sentment feature have {df.sentiment.nunique()} uniques values and those are {df.sentiment.unique()}")

# Step 2: Checking Null Values: 

In [None]:
df.isnull().sum()

# Step 3: Checking Duplicates: 

In [None]:
df[df.duplicated()]

# Step 4: Data type conversion 

In [None]:
df["sentiment"] = df["sentiment"].astype("category")

In [None]:
df.dtypes

# Step 5: Univariate analysis

## 1.  Sentiment Class Distribution

In [None]:
df.sentiment.value_counts()

In [None]:
palette = sns.color_palette("Set2", n_colors=df['sentiment'].nunique())
plt.figure(figsize=(8, 5))
sns.countplot(x = "sentiment", data = df,palette = palette, hue="sentiment" ,edgecolor="black")
plt.show()

## 2. Review Length Distribution Analysis

In [None]:
df["review_length"] = df["review"].apply(lambda x: len(str(x).split()))

plt.figure(figsize=(8,5))
sns.histplot(df["review_length"], bins=30, kde=True, color="skyblue")
plt.title("Review Length Distribution (in words)")
plt.xlabel("Review Length")
plt.ylabel("Frequency")
plt.show()


# Step 6: Multivariate analysis: 

## 1. Review Length Distribution per Sentiment Class

In [None]:
plt.figure(figsize=(8,5))
sns.boxplot(data=df, x='sentiment', y='review_length', palette="Set2", hue = "sentiment")
plt.title("Review Length vs Sentiment")
plt.xlabel("Sentiment")
plt.ylabel("Review Length (in words)")
plt.show()

# Insights from EDA : 
* The raw dataset has 1440 rows and 3 columns.
* Feature 1: `title` (dtype object) — a short summary of the review.
* Feature 2: `rating` — a discrete numerical column holding the product rating given with the review.
* Feature 3: `body` (dtype object) — the long text explaining the review.
* We combined the `title` and `body` features into a new feature called `review`.
* We also derived a new feature, `sentiment`, from the `rating` feature.
* The dataset has no null values and no duplicated rows.
* The `sentiment` feature has 3 unique values: ['Negative' 'Neutral' 'Positive']

# Text Preprocessing for Sentiment Analysis

# Step 1: Lowercasing & Removing Noise

In [None]:
import re

In [None]:

def clean_text(text):
    """Normalise one raw review into lowercase, letters-only text.

    Steps, in order: lowercase, strip HTML tags, strip URLs, drop every
    character that is not a-z or whitespace, collapse runs of whitespace.

    Parameters
    ----------
    text : str or scalar
        Raw review text. Missing values (None / NaN) are treated as an empty
        review; any other non-string scalar is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    if not isinstance(text, str):
        # A missing cell arrives as None/NaN and has no .lower(); it carries no text.
        if pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()                                   # lowercase
    text = re.sub(r'<.*?>', '', text)                     # remove HTML
    text = re.sub(r'http\S+|www\S+', '', text)            # remove URLs
    text = re.sub(r'[^a-z\s]', '', text)                  # keep only letters
    text = re.sub(r'\s+', ' ', text).strip()              # remove extra spaces
    return text

In [None]:
df["clean_text"] = df["review"].apply(clean_text)

In [None]:
df.head()

# Step 2: Tokenization + Stopword Removal

## What stop words are and why we remove them
* We removed stopwords (common words like ‘the’, ‘is’, ‘and’) because they do not carry meaningful information for sentiment analysis and only add noise to the model.

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from collections import Counter

# Download the NLTK resources used below (skipped by NLTK when already up to date)
nltk.download('punkt')      # tokenizer models used by word_tokenize on older NLTK releases
nltk.download('punkt_tab')  # tokenizer tables that newer NLTK releases look up instead of 'punkt'
nltk.download('stopwords')  # the English stop-word list

In [None]:
stop_words = set(stopwords.words("english"))

In [None]:
stop_words

In [None]:
def tokenize_and_remove_stopwords(review):
    """Tokenize a cleaned review and drop every token found in ``stop_words``.

    Parameters
    ----------
    review : str
        Review text to split with NLTK's ``word_tokenize``.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(review):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [None]:
df['tokens'] = df['clean_text'].apply(tokenize_and_remove_stopwords)

In [None]:
df[['clean_text','tokens']].head()

# Word Frequency Analysis of Reviews

In [None]:
# Flatten the per-review token lists into one corpus-wide list of words
tokens = df['tokens'].tolist()
all_words = [word for words_list in tokens for word in words_list]

# Keep the 20 most common words and unpack them into two parallel tuples
word_freq = Counter(all_words).most_common(20)
words, counts = zip(*word_freq)

for word, count in zip(words, counts):
    print(f"{word} - {count}")

# Visualizing Most Frequent Words Using Bar Plots

In [None]:
fig, ax = plt.subplots(figsize=(8, 5))
# Seaborn deprecates `palette` without `hue`; colour each bar by its own word
# and hide the legend, which would only repeat the y-axis labels.
sns.barplot(
    x=list(counts),
    y=list(words),
    hue=list(words),
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title("Top 20 Most Frequent Words")
ax.set_xlabel("Count")
ax.set_ylabel("Words")
plt.show()

# Visualizing Most Frequent Words by Sentiment -> Word Clouds 

In [None]:
from wordcloud import WordCloud

for sentiment in df["sentiment"].unique():
    text = " ".join(df[df["sentiment"] == sentiment]["review"])
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text)

    print(f"Word Cloud for {sentiment} Reviews: ")
    plt.figure(figsize=(8,5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"Word Cloud for {sentiment} Reviews")
    plt.show()
    print()
    print()

# Step 3: Text Normalization (Lemmatization/Stemming)
* We applied lemmatization to reduce words to their base form (e.g., ‘running’ → ‘run’). This helps the model focus on meaning rather than word variations.
  
#### "J" → Adjective

#### "N" → Noun

#### "V" → Verb

#### "R" → Adverb

In [None]:
# ! py -3.11 -m pip install spacy

In [None]:
# ! py -3.11 -m spacy download en_core_web_lg

In [None]:
import spacy

# Load English model
nlp = spacy.load("en_core_web_lg")

def lemmatize_tokens_spacy(tokens):
    """Lemmatize a list of tokens with the module-level spaCy pipeline ``nlp``.

    The tokens are joined with single spaces and run through spaCy as one
    text, so the result follows spaCy's own tokenization of that text.

    Parameters
    ----------
    tokens : list of str

    Returns
    -------
    list of str
        ``lemma_`` of every spaCy token, in order.
    """
    sentence = " ".join(tokens)
    lemmas = []
    for spacy_token in nlp(sentence):
        lemmas.append(spacy_token.lemma_)
    return lemmas

# Example: if you already have a column "tokens"
df["review_lemmatized"] = df["tokens"].apply(lemmatize_tokens_spacy)

In [None]:
df[["tokens","review_lemmatized"]].head(5)

In [None]:
df.shape

# VADER-based sentiment analysis

In [None]:
# !py -3.11 -m pip install vaderSentiment 

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
analyzer = SentimentIntensityAnalyzer()

def get_vader_sentiment(token):
    """Classify a token list with VADER's compound polarity score.

    Parameters
    ----------
    token : list of str
        Tokens of one review; they are joined with spaces before scoring.

    Returns
    -------
    str
        "Positive" when compound >= 0.05, "Negative" when compound <= -0.05,
        otherwise "Neutral".
    """
    compound = analyzer.polarity_scores(" ".join(token))['compound']
    if compound <= -0.05:
        return "Negative"
    if compound >= 0.05:
        return "Positive"
    return "Neutral"

In [None]:
# VADER is a rule-based scorer that uses punctuation, capitalisation and
# negation words such as "not". The cleaning, stop-word removal and
# lemmatization steps strip those cues, so score the raw review text instead.
# Each review is wrapped in a one-element list because get_vader_sentiment
# joins its argument with spaces.
df['vader_sentiment'] = df['review'].fillna("").apply(
    lambda review: get_vader_sentiment([str(review)])
)

# TF-IDF (Term Frequency – Inverse Document Frequency) Vectorization:
* We converted the preprocessed reviews into numerical vectors using TF-IDF, where each word is represented by its importance in the review relative to the corpus, resulting in a DataFrame ready for machine learning.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Join lemmatized tokens back into a sentence
df["review_text"] = df["review_lemmatized"].apply(lambda x: " ".join(x))

In [None]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=2000)  # you can adjust max_features

In [None]:
# Fit and transform the text
# NOTE(review): the vectorizer is fitted on every review here, before the
# train/test split further down, so the vocabulary and IDF weights are learned
# from rows that later land in the test set. Consider fitting on the training
# split only and calling transform() on the test split - TODO confirm.
X = tfidf.fit_transform(df["review_text"]).toarray()

In [None]:
# Convert to DataFrame with feature names as columns
tfidf_df = pd.DataFrame(X, columns=tfidf.get_feature_names_out())

In [None]:
# Add the target sentiment column back.
# tfidf_df was built from a bare array, so it has a fresh 0..n-1 index. Assigning
# the Series directly would align on index labels and misplace (or NaN-fill)
# labels if df's index ever stops being 0..n-1, so assign by position instead.
tfidf_df["sentiment"] = df["sentiment"].values

In [None]:
tfidf_df.head()

In [None]:
tfidf_df.shape

In [None]:
# with open("../models/tfidf_vectorizer.pkl", "wb") as f:
#     pickle.dump(tfidf, f)

# Encoding the Target Variable

In [None]:
# Map sentiment to numerical values in the original DataFrame
sentiment_mapping = {"Negative": 0, "Neutral": 1, "Positive": 2}
tfidf_df["sentiment_num"] = tfidf_df["sentiment"].map(sentiment_mapping)

# Check
tfidf_df[["sentiment", "sentiment_num"]].head()

# Final Preprocessed Data: 

In [None]:
tfidf_df.head(5)

In [None]:
tfidf_df.shape

# Splitting Dataset into Training and Testing Sets

In [None]:
X = tfidf_df.drop(columns=['sentiment','sentiment_num'], axis = 1)

In [None]:
X.head(5)

In [None]:
X.shape

In [None]:
y = tfidf_df['sentiment_num']

In [None]:
y.head(5)

In [None]:
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the label so the train and test sets keep the same class
# proportions; with uneven classes a purely random 20% split can leave the
# smallest class under-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
X_train.head(5)

In [None]:
X_train.shape , X_test.shape

# Model Development

# Model 1: Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [None]:
model1 = LogisticRegression(max_iter=1000, random_state=42, class_weight="balanced")

In [None]:
# Train the model
model1.fit(X_train, y_train)

In [None]:
# Predictions
y_pred = model1.predict(X_test)

In [None]:
# Evaluation
print("Accuracy  and classification report of LogisticRegression: ")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# with open("../models/logistic_model.pkl", "wb") as f:
#     dill.dump(model1, f)

# Model 2: SVM (svc)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import dill

# Create the SVM model
# Note: SVC handles multi-class problems with a one-vs-one scheme internally;
# 'ovr' is only the default *shape* of decision_function's output
# (decision_function_shape), not the training strategy.
model_svm = SVC(kernel='linear', probability=True, class_weight='balanced', random_state=42)

# Train the model
model_svm.fit(X_train, y_train)

# Make predictions
y_pred = model_svm.predict(X_test)

# Evaluation
print("Accuracy and classification report of SVM: ")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save the trained SVM model
# with open("../models/svm_model.pkl", "wb") as f:
#     dill.dump(model_svm, f)


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import dill



In [None]:
param_grid = {
    'kernel': ['linear'], # Kernels to try
    'C': [0.1, 1,]                         # Regularization                              
}




In [None]:
svc = SVC(class_weight='balanced', probability=True, random_state=42)

grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',       # Optimize for accuracy
    cv=5,                     # 5-fold cross-validation
    n_jobs=-1,                # Use all cores
    verbose=2
)




In [None]:
grid_search.fit(X_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

best_svc = grid_search.best_estimator_
y_pred = best_svc.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Model 3: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fix the seed (as done for the other models) so the forest's random sampling,
# and therefore the metrics printed below, are the same on every run.
model2_rf = RandomForestClassifier(random_state=42)

model2_rf.fit(X_train, y_train)

y_pred2 = model2_rf.predict(X_test)

print("Accuracy  and classification report of Random forest: ")
print(f"Accuracy: {accuracy_score(y_test,y_pred2)}")

print("\nClassification Report:\n", classification_report(y_test, y_pred2))

# Sentiment Prediction on New Input Reviews

In [None]:
import re
import dill
class TextCleaner:
    """Text normaliser used at prediction time.

    ``clean`` lowercases the input and then applies the substitutions in
    ``_STEPS`` one after another.
    """

    # (regex, replacement) pairs, applied in this order after lowercasing
    _STEPS = (
        (r'<.*?>', ''),             # remove HTML
        (r'http\S+|www\S+', ''),    # remove URLs
        (r'[^a-z\s]', ''),          # keep only letters
        (r'\s+', ' '),              # collapse runs of whitespace
    )

    def __init__(self):
        pass

    def clean(self, text):
        """Return ``text`` lowercased, stripped of HTML/URLs/non-letters and trimmed."""
        cleaned = text.lower()
        for pattern, replacement in self._STEPS:
            cleaned = re.sub(pattern, replacement, cleaned)
        return cleaned.strip()


In [None]:
class TextTokenizerStopwordsRemover:
    """Tokenizes text with NLTK and filters out English stop words."""

    def __init__(self):
        # Each instance builds its own stop-word set from NLTK's English list
        english_words = stopwords.words('english')
        self.stop_words = set(english_words)

    def tokenize_and_remove_stopwords(self, text):
        """Return the tokens of ``text`` whose lowercase form is not a stop word."""
        kept = []
        for token in word_tokenize(text):
            if token.lower() in self.stop_words:
                continue
            kept.append(token)
        return kept

In [None]:
import spacy
class Lemmatization:
    """Wraps a spaCy English pipeline to turn tokens into lemmas."""

    def __init__(self):
        # The spaCy model is loaded once per instance
        self.nlp = spacy.load("en_core_web_lg")

    def lemmatize_tokens_spacy(self,tokens):
        """Join ``tokens`` with spaces, run spaCy on the text and return each token's lemma."""
        sentence = " ".join(tokens)
        lemmas = []
        for spacy_token in self.nlp(sentence):
            lemmas.append(spacy_token.lemma_)
        return lemmas


In [None]:
# The preprocessing objects (including the spaCy model) and the unpickled
# vectorizer/classifier are built on first use and reused by later calls,
# instead of being re-created for every prediction.
_SENTIMENT_PIPELINE = {}


def _get_sentiment_pipeline():
    """Return the cached preprocessors, TF-IDF vectorizer and classifier."""
    if not _SENTIMENT_PIPELINE:
        # SECURITY: dill.load can execute arbitrary code from the file; only
        # load artefacts that were produced by this project.
        # NOTE(review): the (commented-out) save cells in this notebook write to
        # "../models/", while these paths read from "../backend/models/" -
        # confirm the files are copied there.
        with open('../backend/models/tfidf_vectorizer.pkl', 'rb') as f:
            vectorizer = dill.load(f)
        with open('../backend/models/logistic_model.pkl', 'rb') as f:
            classifier = dill.load(f)

        _SENTIMENT_PIPELINE.update(
            cleaner=TextCleaner(),
            tokenizer=TextTokenizerStopwordsRemover(),
            lemmatizer=Lemmatization(),
            tfidf=vectorizer,
            model=classifier,
        )
    return _SENTIMENT_PIPELINE


def predict_sentiment(test_text):
    """Predict the sentiment of one review and print every intermediate step.

    Parameters
    ----------
    test_text : str
        Raw review text.

    Returns
    -------
    str
        "Negative", "Neutral" or "Positive".
    """
    print(f"Input Text: [{test_text}]\n")

    # Build the preprocessors and load the pickled vectorizer/model (first call only)
    pipeline = _get_sentiment_pipeline()

    # Step 1: Clean text
    cleaned_text = pipeline["cleaner"].clean(test_text)
    print(f"Cleaned text: [{cleaned_text}]\n")

    # Step 2: Tokenize & remove stopwords
    tokens = pipeline["tokenizer"].tokenize_and_remove_stopwords(cleaned_text)
    print(f"Tokens after stopword removal: [{tokens}]\n")

    # Step 3: Lemmatize tokens
    lemmatized_tokens = pipeline["lemmatizer"].lemmatize_tokens_spacy(tokens)
    print(f"Lemmatized tokens: [{lemmatized_tokens}]\n")

    # Step 4: Join tokens and convert to TF-IDF vector
    final_text = ' '.join(lemmatized_tokens)
    print(f"Final preprocessed text: [{final_text}]\n")
    vector = pipeline["tfidf"].transform([final_text])

    # Step 5: Predict (cast the predicted label to a plain int for the lookup)
    predicted_class = int(pipeline["model"].predict(vector)[0])
    sentiment_labels = {0: "Negative", 1: "Neutral", 2: "Positive"}
    predicted_sentiment = sentiment_labels[predicted_class]

    print(f"Predicted Sentiment: {predicted_sentiment}\n")
    return predicted_sentiment


In [None]:
test_text = "The product was horrible and disappointing."
predict_sentiment(test_text)

In [None]:
test_text = "The product is okay, nothing special but works fine."
predict_sentiment(test_text)

In [None]:
test_text = "This is the best product "
predict_sentiment(test_text)

In [None]:
test_text = "The product is not good okay"
predict_sentiment(test_text)