In [3]:
'''Consider a suitable text dataset. Remove stop words, apply stemming and feature selection techniques to represent documents as vectors.
Classify documents and evaluate precision, recall. (For Ex: Movie Review Dataset).'''
import nltk
from nltk.corpus import stopwords, movie_reviews
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Download necessary resources
nltk.download('stopwords')
nltk.download('movie_reviews')

# Load movie reviews dataset
documents = []
labels = []

# Each fileid corresponds to a review
for fileid in movie_reviews.fileids():
    words = movie_reviews.words(fileid)
    review = ' '.join(words)  # Join words into a single string
    label = movie_reviews.categories(fileid)[0]  # pos or neg
    documents.append(review)
    labels.append(label)

# Preprocessing function
def preprocess(text):
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    
    text = text.lower()
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    return ' '.join(words)

# Apply preprocessing to all documents
processed_reviews = [preprocess(review) for review in documents]

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=3000)  # Limit features if needed
X = vectorizer.fit_transform(processed_reviews)
y = labels

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train classifier
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict
y_pred = classifier.predict(X_test)

# Evaluate
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.



Classification Report:

              precision    recall  f1-score   support

         neg       0.77      0.82      0.80       199
         pos       0.81      0.76      0.78       201

    accuracy                           0.79       400
   macro avg       0.79      0.79      0.79       400
weighted avg       0.79      0.79      0.79       400

