In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download stopwords and WordNetLemmatizer data (only need to do this once)
nltk.download('stopwords')
nltk.download('wordnet')

# Preprocess the text data
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Tokenize text into words
    words = word_tokenize(text)

    # Remove stopwords and non-alphabetic characters
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word.isalpha() and word not in stop_words]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

# Main function to build and train the Naive Bayes classifier
def naive_bayes_classifier(X_train, y_train):
    # Vectorize the text data using CountVectorizer
    vectorizer = CountVectorizer()
    X_train_vec = vectorizer.fit_transform(X_train)

    # Train the Naive Bayes classifier
    nb_classifier = MultinomialNB()
    nb_classifier.fit(X_train_vec, y_train)

    return vectorizer, nb_classifier

# Function to make predictions on new data
def predict_sentiment(vectorizer, nb_classifier, X_new):
    # Preprocess the new data
    X_new_preprocessed = X_new.apply(preprocess_text)

    # Vectorize the new data using the same vectorizer
    X_new_vec = vectorizer.transform(X_new_preprocessed)

    # Make predictions
    y_pred = nb_classifier.predict(X_new_vec)

    return y_pred

if __name__ == '__main__':
    # Read the test data from test_data.csv
    with open('test_data.csv', 'r', encoding='utf-8') as f:
        test_data = f.readlines()

    test_data = [line.strip('\n') for line in test_data]
    test_samples_df = pd.DataFrame(test_data, columns=['text'])

    # Apply preprocessing to 'text' column of DataFrame
    test_samples_df['text'] = test_samples_df['text'].apply(preprocess_text)

    # Load the IMDb reviews dataset using TensorFlow Datasets
    data, info = tfds.load('imdb_reviews', split='train', with_info=True)

    # Access the 'text' and 'label' data directly from the loaded dataset
    X_train = np.array([example['text'].numpy().decode('utf-8') for example in data])
    y_train = np.array([example['label'].numpy() for example in data])

    # Build and train the Naive Bayes classifier
    vectorizer, nb_classifier = naive_bayes_classifier(X_train, y_train)

    # Make predictions on the test data
    X_test_samples = vectorizer.transform(test_samples_df['text'])
    predictions = nb_classifier.predict(X_test_samples)

    # Add the predicted labels to the test dataframe
    test_samples_df['predicted_sentiment'] = predictions

    # Display the test dataframe with predicted labels
    print(test_samples_df)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                 text  predicted_sentiment
0    could stand film acting terrible plot made sense                    0
1   amazing piece cinema performance brilliant sto...                    1
2   offce total gem laugh hard ca even steve carel...                    1
3                        disappointed movie live hype                    0
4   crwn lavish treat costume set gorgeous make fe...                    1
5   found film quite boring lacked excitement fail...                    0
6                movie hilarious laughed start finish                    0
7   black mirror creepy af mess yer mind make ya q...                    0
8   high hope series total letdown plot place char...                    0
9   acting film superb actor delivered powerful pe...                    1
10  could stop thinking movie watched left lasting...                    0
11              special effect film visually stunning                    1
12  expected movie story 

In [7]:
print(predictions)

[0 1 1 0 1 0 0 0 0 1 0 1 0 0 1 1 1 0 0 0 1 1 0 1 1 1 0 0 0 1 1 0 0 0 0 1 0
 0 1 1 0 0 1 1 1 0 1 0 1 1]


In [8]:
print(classification_report(y_train[:50], predictions))

              precision    recall  f1-score   support

           0       0.65      0.57      0.61        30
           1       0.46      0.55      0.50        20

    accuracy                           0.56        50
   macro avg       0.56      0.56      0.55        50
weighted avg       0.58      0.56      0.56        50

