In [1]:
!pip install pandas scikit-learn nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
import pandas as pd
df = pd.read_csv("D:\sentiment_analysis_movie-review\Data\IMDB Dataset.csv")  # Replace with your actual dataset filename
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [4]:
df.sentiment.unique()

array(['positive', 'negative'], dtype=object)

In [5]:
# Label Encoding
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

df['cleaned_text'] = df['review'].apply(clean_text)
print(df['cleaned_text'].head())

0    one reviewers mentioned watching 1 oz episode ...
1    wonderful little production br br filming tech...
2    thought wonderful way spend time hot summer we...
3    basically theres family little boy jake thinks...
4    petter matteis love time money visually stunni...
Name: cleaned_text, dtype: object


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

X = df['cleaned_text']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

print("Shape of X_train_vectors:", X_train_vectors.shape)
print("Shape of X_test_vectors:", X_test_vectors.shape)

Shape of X_train_vectors: (40000, 158877)
Shape of X_test_vectors: (10000, 158877)


In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Naive Bayes model
naive_bayes = MultinomialNB()

# Train the model
naive_bayes.fit(X_train_vectors, y_train)

# Make predictions on the test set
y_pred = naive_bayes.predict(X_test_vectors)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.8699
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.88      0.87      4961
           1       0.88      0.86      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [9]:
import pickle

# Save the trained model
with open('sentiment_model.pkl', 'wb') as model_file:
    pickle.dump(naive_bayes, model_file)

# Save the vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

print("Model and vectorizer saved successfully!")

Model and vectorizer saved successfully!


In [None]:
import pickle

# Load the vectorizer
with open("tfidf_vectorizer.pkl", "rb") as file:
    tfidf_vectorizer = pickle.load(file)

print(type(tfidf_vectorizer))  


<class 'sklearn.feature_extraction.text.TfidfVectorizer'>


In [11]:
import pickle

# Load the vectorizer
with open("tfidf_vectorizer.pkl", "rb") as file:
    tfidf_vectorizer = pickle.load(file)

print(type(tfidf_vectorizer))  # Confirm it's TfidfVectorizer
print(tfidf_vectorizer.get_feature_names_out()[:10])  # Show some words from vocabulary


<class 'sklearn.feature_extraction.text.TfidfVectorizer'>
['00' '000' '00000001' '000001' '0001' '00015' '001' '0010' '002'
 '00383042']


In [12]:
import pickle

# Load the sentiment model
with open("sentiment_model.pkl", "rb") as file:
    sentiment_model = pickle.load(file)

# Confirm the model type
print(type(sentiment_model))  

# Check if the model has a `predict` method (indicating it's a classifier)
if hasattr(sentiment_model, "predict"):
    print("The model is ready for predictions.")
else:
    print("This might not be a trained model.")

# If it's a scikit-learn model, check its classes
if hasattr(sentiment_model, "classes_"):
    print("Model classes:", sentiment_model.classes_)


<class 'sklearn.naive_bayes.MultinomialNB'>
The model is ready for predictions.
Model classes: [0 1]


In [13]:
# Example text for prediction
sample_text = ["This movie was amazing! I loved it."]

# Convert text into numerical features using the vectorizer
text_vector = tfidf_vectorizer.transform(sample_text)

# Predict sentiment
prediction = sentiment_model.predict(text_vector)

# Show result
print("Predicted Sentiment:", prediction[0])  # 0 for negative, 1 for positive


Predicted Sentiment: 1


In [16]:


# Sample text for prediction
sample_text = ["This movie was terrible and boring. I regret watching it."]

# Transform the text using the loaded vectorizer
sample_text_tfidf = tfidf_vectorizer.transform(sample_text)

# Predict sentiment
predicted_sentiment = sentiment_model.predict(sample_text_tfidf)

# Print the predicted sentiment
print("Predicted Sentiment:", predicted_sentiment[0])  # Should print 0 for negative sentiment



Predicted Sentiment: 0
