In [2]:
# Step 1: Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
nltk.download('punkt')  # Ensure the Punkt tokenizer is downloaded
import joblib  # For saving and loading the model

# Step 2: Load datasets
# Reviews table; the later steps read its 'Review' and 'Label' columns.
df = pd.read_csv('reviews.csv')

# Read both paragraph files in full, decoding them as UTF-8.
with open('positive.txt', 'r', encoding='utf-8') as pos_fh:
    positive_text = pos_fh.read()
with open('negative.txt', 'r', encoding='utf-8') as neg_fh:
    negative_text = neg_fh.read()

# Step 3: Wrap each paragraph in a one-row DataFrame
# The whole positive file becomes a single sample labelled 1 and the whole
# negative file a single sample labelled 0.
positive_df = pd.DataFrame([{'Review': positive_text, 'Label': 1}])
negative_df = pd.DataFrame([{'Review': negative_text, 'Label': 0}])

# Append the two extra rows after the CSV rows and renumber the index 0..n-1.
extra_rows = [positive_df, negative_df]
df = pd.concat([df, *extra_rows], ignore_index=True)

# Step 4: Train-test split on the raw text
# Split BEFORE vectorizing: fitting the vocabulary on the full corpus would
# let tokens from the held-out reviews shape the feature space (train/test
# leakage). test_size and random_state are unchanged, so the rows assigned
# to each fold are the same as when the already-vectorized matrix was split.
y = df['Label']
text_train, text_test, y_train, y_test = train_test_split(
    df['Review'], y, test_size=0.2, random_state=42
)

# Step 5: Preprocess the data using CountVectorizer's default tokenizer
# The vocabulary is learned from the training fold only; the test fold is
# encoded with that fixed vocabulary, so words unseen in training are ignored.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full corpus encoded with the train-fitted vocabulary. Nothing in this
# section uses it; it is kept so code elsewhere that refers to `X` still runs.
X = vectorizer.transform(df['Review'])

# Step 6: Build and train the model
# Logistic regression with scikit-learn's default settings; fit() returns the
# estimator itself, so construction and training can be chained.
model = LogisticRegression().fit(X_train, y_train)

# Step 7: Evaluate the model
# Score the held-out split: overall accuracy first, then the per-class
# precision / recall / F1 table.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Step 8: Save the model and vectorizer (binary files)
# Both objects are persisted: the classifier can only score text that has been
# encoded by the same fitted vectorizer.
artifacts = (
    (model, 'sentiment_model.pkl'),    # trained classifier
    (vectorizer, 'vectorizer.pkl'),    # fitted vocabulary / encoder
)
for artifact, target_path in artifacts:
    joblib.dump(artifact, target_path)

print("Model and vectorizer saved.")

# Step 9: Load the saved model and vectorizer for later use
# Read back the two files written in step 8, classifier first.
model_loaded, vectorizer_loaded = (
    joblib.load(saved_path)
    for saved_path in ('sentiment_model.pkl', 'vectorizer.pkl')
)

# Example usage: make predictions on new data
new_reviews = [
    "This is a great product!",
    "Terrible experience, will not buy again.",
]
# Encode with the reloaded vectorizer (transform only, no refitting), then classify.
X_new = vectorizer_loaded.transform(new_reviews)
predictions = model_loaded.predict(X_new)

print("Predictions on new reviews:", predictions)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NEHA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Accuracy: 0.9433526011560693
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89       236
           1       0.96      0.97      0.96       629

    accuracy                           0.94       865
   macro avg       0.93      0.92      0.93       865
weighted avg       0.94      0.94      0.94       865

Model and vectorizer saved.
Predictions on new reviews: [1 0]
