In [1]:
import re
import pandas as pd
import nltk
import tensorflow_datasets as tfds
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

nltk.download('punkt')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
cachedStopWords = stopwords.words("english")

def load_data():
    # Load the IMDb reviews dataset
    data = tfds.load('imdb_reviews', split={'train': 'train', 'test': 'test'})
    
    # Convert the data into pandas DataFrame and decode bytes to string
    train_df = tfds.as_dataframe(data['train'])
    test_df = tfds.as_dataframe(data['test'])

    train_df['text'] = train_df['text'].apply(lambda x: x.decode('utf-8'))
    test_df['text'] = test_df['text'].apply(lambda x: x.decode('utf-8'))

    train_df['label'] = train_df['label'].replace({0: 'negative', 1: 'positive'})
    test_df['label'] = test_df['label'].replace({0: 'negative', 1: 'positive'})

    return train_df, test_df

def preprocess_data(dataframe):
    # HTML Tags removal
    dataframe['text'] = dataframe['text'].apply(lambda words: re.sub('<[^<]+?>', '', words))

    # Lower case conversion
    dataframe['text'] = dataframe['text'].apply(lambda words: words.lower())

    # Word Tokenization
    dataframe['text'] = dataframe['text'].apply(word_tokenize)

    # Punctuation removal
    dataframe['text'] = dataframe['text'].apply(lambda words: [x for x in words if not x in punctuation])

    # Number removal
    dataframe['text'] = dataframe['text'].apply(lambda words: [x for x in words if not x.isdigit()])

    # Stopword removal
    dataframe['text'] = dataframe['text'].apply(lambda words: [x for x in words if x not in cachedStopWords])

    # Lemmatization
    dataframe['text'] = dataframe['text'].apply(lambda words: [lemmatizer.lemmatize(x) for x in words])

    # Join again
    dataframe['text'] = dataframe['text'].apply(lambda words: " ".join(words))

    return dataframe

def model_training(x_train, y_train):
    vec = CountVectorizer()
    vec = vec.fit(x_train.text)
    train_x_bow = vec.transform(x_train.text)

    # Create a Logistic Regression classifier
    classifier = LogisticRegression(max_iter=100)

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10.0, 100]  # Replace this list with your 'alpha_ranges' values for 'C'
    }

    # Create GridSearchCV object
    grid_search = GridSearchCV(classifier, param_grid=param_grid, scoring='accuracy', cv=2, return_train_score=True)

    # Fit the model on the training data
    grid_search.fit(train_x_bow, y_train)

    # Get the best estimator from the grid search
    best_classifier = grid_search.best_estimator_

    return best_classifier, vec

def evaluate_model(classifier, vec, x_test, y_test):
    test_x_bow = vec.transform(x_test.text)
    predict = classifier.predict(test_x_bow)
    print("Accuracy is ", accuracy_score(y_test, predict))
    print("Report: ", classification_report(y_test, predict))

def main():
    train_df, test_df = load_data()

    le = LabelEncoder()
    y_train = le.fit_transform(train_df['label'])
    y_test = le.transform(test_df['label'])

    x_train = preprocess_data(train_df)
    x_test = preprocess_data(test_df)

    best_classifier, vec = model_training(x_train, y_train)

    evaluate_model(best_classifier, vec, x_test, y_test)

if __name__ == "__main__":
    main()


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\matth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\matth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    htt

Accuracy is  0.87472
Report:                precision    recall  f1-score   support

           0       0.87      0.88      0.88     12500
           1       0.88      0.87      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000

