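# Sentiment analysis of movie reviews uses natural language processing (NLP) techniques to determine the sentiment expressed in a textual review. In general the goal is to classify whether a review conveys positive, negative, or neutral sentiment towards a movie; this notebook trains a binary (positive vs. negative) classifier on NLTK's `movie_reviews` corpus using TF-IDF features and a Multinomial Naive Bayes model.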

# Importing necessary libraries

In [1]:
import random

import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Download NLTK resources

In [2]:
# Fetch every NLTK resource the notebook reads later on. Each call is a no-op
# (apart from the log line) when the package is already up to date.
nltk.download('movie_reviews')  # labelled review corpus
nltk.download('punkt')          # tokenizer models used by older NLTK releases
# Newer NLTK releases make word_tokenize load 'punkt_tab' instead of the
# pickled 'punkt' models; without it tokenization raises LookupError on a
# fresh environment. Downloading both keeps the notebook runnable either way.
nltk.download('punkt_tab')
nltk.download('wordnet')        # dictionary behind WordNetLemmatizer
nltk.download('stopwords')      # English stop-word list

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\CNKUTTY\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CNKUTTY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CNKUTTY\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CNKUTTY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load the movie reviews dataset

In [3]:
# Build one (token_list, label) pair per corpus file, walking the corpus
# category by category. The tokens are materialised into a plain list so the
# later cells work on ordinary Python data rather than a corpus view.
documents = [
    (list(movie_reviews.words(review_id)), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]

# Shuffle the documents

In [4]:
# Shuffle with a dedicated, seeded generator so the document order -- and
# therefore the train/test membership and the reported accuracy -- is the same
# on every Restart & Run All. A private Random instance is used so the global
# `random` state is left untouched.
# NOTE: the shuffle is in place; re-running only this cell reshuffles the
# already-shuffled list, so re-run from the loading cell to get the same order.
random.Random(42).shuffle(documents)

# Preprocessing the data

In [5]:
# Shared preprocessing resources, created once and reused for every review.
lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token membership test in preprocessing is cheap.
stop_words = {*stopwords.words('english')}

# Define a function to preprocess text

In [6]:
def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmas.

    Each token produced by ``word_tokenize`` goes through these steps in order:

    1. it is discarded unless ``str.isalnum()`` is true for it;
    2. it is lower-cased;
    3. it is discarded if it appears in the notebook-level ``stop_words`` set;
    4. it is lemmatised with the notebook-level ``lemmatizer``.

    Args:
        text: The review as a single string.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        no token survives).
    """
    lemmas = []
    for token in word_tokenize(text):
        # The alphanumeric check runs on the token as tokenised, before
        # lower-casing.
        if not token.isalnum():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(lowered))
    return " ".join(lemmas)

# Preprocess the documents

In [7]:
# preprocess_text expects a single string, so each review's token list is
# joined back into text first; the label is carried through unchanged.
preprocessed_documents = [
    (preprocess_text(" ".join(tokens)), label)
    for tokens, label in documents
]

# Split the data into training and testing sets

In [8]:
# Separate the (text, label) pairs into parallel feature and target lists.
X = [cleaned_text for cleaned_text, _label in preprocessed_documents]
y = [label for _cleaned_text, label in preprocessed_documents]

# Hold out 20% of the reviews for evaluation; random_state fixes the split
# for a given input order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Vectorize the text data using TF-IDF

In [9]:
# Vocabulary is capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# The vectorizer is fitted on the training texts only; the test texts are
# merely transformed with the already-fitted vocabulary and IDF weights.
# The tuple is evaluated left to right, so fitting happens before transform.
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),
    tfidf_vectorizer.transform(X_test),
)

# Train a Naive Bayes classifier

In [10]:
# Multinomial Naive Bayes with default hyperparameters, fitted on the TF-IDF
# training matrix and its labels.
clf = MultinomialNB()
clf.fit(X=X_train_tfidf, y=y_train)

# Make predictions on the testing data

In [11]:
# Predicted label for every held-out review, in the same order as y_test.
y_pred = clf.predict(X=X_test_tfidf)

# Evaluate the model

In [12]:
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7875
