In [1]:
import string  # Add this import to resolve the error
import numpy as np
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fetch the NLTK data packages this notebook relies on.
# 'punkt_tab' and 'punkt' are tokenizer packages (they unpack under tokenizers/);
# presumably only one of them is needed by nltk.word_tokenize depending on the
# installed NLTK version — TODO confirm and drop the unused one.
nltk.download('punkt_tab')

# 'stopwords' is the corpus read through nltk.corpus.stopwords in the preprocessing step.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Load the IMDB dataset bundled with Keras. The reviews are already encoded as
# sequences of integers (word indices) instead of raw text.
# - X_train, X_test: the encoded reviews.
# - y_train, y_test: the corresponding labels (0 = negative review, 1 = positive review).
# - num_words=1000: keeps only the most frequently occurring words (roughly the top 1000);
#   rarer words are not kept as distinct indices, which keeps the vocabulary small.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=1000)

# `word_index` is a dictionary mapping each word to its integer index in the IMDB vocabulary.
# It is needed to convert the encoded reviews (sequences of integers) back into
# human-readable text.
word_index = imdb.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [3]:
# Cache for the index -> word mapping derived from `word_index`. Inverting the
# vocabulary is loop-invariant work, so it is done once instead of once per review.
_reverse_index_cache = {}


def decode_review(encoded_review, reverse_word_index=None):
    """Convert one encoded review (a sequence of integer word indices) back into text.

    Parameters
    ----------
    encoded_review : iterable of int
        Word indices as found in the loaded IMDB data.
    reverse_word_index : dict[int, str], optional
        Mapping from vocabulary index to word. When omitted, the mapping is
        derived from the notebook-level `word_index` and cached for later calls.

    Returns
    -------
    str
        The decoded words joined by single spaces. Any index that has no word
        in the mapping (including the reserved markers) is rendered as '?'.
        An empty review decodes to an empty string.
    """
    if reverse_word_index is None:
        # Rebuild only when `word_index` has been rebound (e.g. the load cell was
        # re-run); in-place edits to the same dict object are not detected.
        if _reverse_index_cache.get('source') is not word_index:
            _reverse_index_cache['mapping'] = {
                index: word for word, index in word_index.items()
            }
            _reverse_index_cache['source'] = word_index
        reverse_word_index = _reverse_index_cache['mapping']

    # Subtract 3 from each index because Keras reserves indices 0, 1, and 2 for
    # special purposes, so real words in the encoded data are shifted by 3.
    return ' '.join(reverse_word_index.get(i - 3, '?') for i in encoded_review)

In [4]:
# Turn every encoded review back into a plain-text string.
#
# Why decode at all: the dataset is provided as integer sequences (word indices)
# for computational efficiency, but the preprocessing that follows (lowercasing,
# removing punctuation) works on text, and readable reviews are far easier to
# inspect when debugging or analysing the data.

# Training reviews: one decoded string per entry of X_train.
X_train_text = list(map(decode_review, X_train))

# Test reviews: decoded the same way from X_test.
X_test_text = list(map(decode_review, X_test))

In [5]:
# Text preprocessing (lowercase, strip punctuation, tokenize, drop stop words)

# Translation table that deletes every character in string.punctuation in one pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Stop-word set, built on first use so the corpus is read only once rather than
# once per review.
_stop_words_cache = None


def _get_stop_words():
    """Return the English stop words as a frozenset, loading them on first use."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = frozenset(nltk.corpus.stopwords.words('english'))
    return _stop_words_cache


def text_preprocessor(text):
    """Normalise a review and split it into tokens with stop words removed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        Lowercased tokens, without punctuation characters, in their original
        order, excluding English stop words. Empty input yields an empty list.
    """
    # Lowercase, then delete punctuation (same characters as string.punctuation).
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    # Tokenize the text
    tokens = nltk.word_tokenize(cleaned)
    # NOTE(review): punctuation is stripped before this filter, so a contraction
    # such as "don't" arrives here as "dont" and will only be removed if that
    # apostrophe-free form is itself in the stop-word list — confirm this is intended.
    stop_words = _get_stop_words()
    return [word for word in tokens if word not in stop_words]