In [1]:
%%time
import pandas as pd 
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#Load and preprocess data
df = pd.read_csv("Dataset-SA.csv").rename(columns={
    "Review": "text",
    "Summary": "string",
    "Rate": "int",
    "Sentiment": "label"})

#Drop rows where the sentiment (label) is missing
df = df.dropna(subset=['label'])
               
#Map sentiment labels to binary values
df ['label'] = df['label'].map ({"positive": 1,"negative": 0 })
df = df.dropna(subset=['label'])
df['text'] = df ['text'].str.lower().apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Define and fit the vectorizer globally
vectorizer = TfidfVectorizer (stop_words="english", ngram_range=(1,2))
X = vectorizer.fit_transform(df['text'])
y = df['label']

#Train model using Logistic Regression 
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

#Evaluate model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test,y_pred): .2f}")
print("Classification Report:\n", classification_report(y_test,y_pred,zero_division=1))



Accuracy:  0.94
Classification Report:
               precision    recall  f1-score   support

         0.0       0.93      0.65      0.77      7181
         1.0       0.94      0.99      0.97     41523

    accuracy                           0.94     48704
   macro avg       0.94      0.82      0.87     48704
weighted avg       0.94      0.94      0.94     48704

CPU times: total: 5.67 s
Wall time: 6.06 s


In [26]:
#Function to predict sentiment and confidence for multiple reviews
def predict_bulk_sentiments(reviews):
    cleaned_reviews = [re.sub(r'[^\w\s]', '', review.lower()) for review in reviews]
    transformed_reviews = vectorizer.transform(cleaned_reviews)
    # Get prediction probabilities
    probabilities = model.predict_proba(transformed_reviews)
    # Get predicted class (1 = Positive, 0 = Negative)
    predictions = model.predict(transformed_reviews)
    # Convert predictions into readable labels
    results = ["Positive" if pred == 1 else "Negative" for pred in predictions]

    # Print results with confidence
    for review, sentiment, prob in zip(reviews, results, probabilities):
        confidence = max(prob) * 100
        print(f"Review: {review}")
        print(f"Predicted Sentiment: {sentiment} (Confidence: {confidence:.2f}%)\n")
    #List of new reviews to test
new_reviews = ["This product is amazing! I loved it.",
               "Really disappointing, it broke on the first day.",
               "Decent quality for the price.",
               "I absolutely adore this!Highly recommended.",
               "Not as expected, quite poor performance.",
               "Very poor quality for the price>",
               "disappointed"
]
# Run bulk predictions
predict_bulk_sentiments(new_reviews)
    
   


Review: This product is amazing! I loved it.
Predicted Sentiment: Positive (Confidence: 83.28%)

Review: Really disappointing, it broke on the first day.
Predicted Sentiment: Positive (Confidence: 86.14%)

Review: Decent quality for the price.
Predicted Sentiment: Positive (Confidence: 73.73%)

Review: I absolutely adore this!Highly recommended.
Predicted Sentiment: Negative (Confidence: 51.40%)

Review: Not as expected, quite poor performance.
Predicted Sentiment: Negative (Confidence: 61.98%)

Review: Very poor quality for the price>
Predicted Sentiment: Negative (Confidence: 55.98%)

Review: disappointed
Predicted Sentiment: Negative (Confidence: 84.75%)



In [4]:
import pickle
#Save the trained model
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
#Save the Vectorizer
with open ("vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)
print("Model and vectorizer saved!")

Model and vectorizer saved!
