**Install Required Libraries**

In [None]:
!pip3 install scikit-learn
!pip3 install ktrain
!pip3 install tensorflow-text
!pip install tf_keras

Collecting tensorflow<2.17,>=2.16.1 (from tensorflow-text)
  Using cached tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
Collecting tensorboard<2.17,>=2.16 (from tensorflow<2.17,>=2.16.1->tensorflow-text)
  Using cached tensorboard-2.16.2-py3-none-any.whl (5.5 MB)
Collecting keras>=3.0.0 (from tensorflow<2.17,>=2.16.1->tensorflow-text)
  Using cached keras-3.3.3-py3-none-any.whl (1.1 MB)
Installing collected packages: tensorboard, keras, tensorflow
  Attempting uninstall: tensorboard
    Found existing installation: tensorboard 2.15.2
    Uninstalling tensorboard-2.15.2:
      Successfully uninstalled tensorboard-2.15.2
  Attempting uninstall: keras
    Found existing installation: keras 2.15.0
    Uninstalling keras-2.15.0:
      Successfully uninstalled keras-2.15.0
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.15.1
    Uninstalling tensorflow-2.15.1:
      Successfully uninstalled tensorflow-2.15.1
[31mERRO

Collecting tensorflow<2.16,>=2.15 (from tf_keras)
  Using cached tensorflow-2.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
Collecting tensorboard<2.16,>=2.15 (from tensorflow<2.16,>=2.15->tf_keras)
  Using cached tensorboard-2.15.2-py3-none-any.whl (5.5 MB)
Collecting keras<2.16,>=2.15.0 (from tensorflow<2.16,>=2.15->tf_keras)
  Using cached keras-2.15.0-py3-none-any.whl (1.7 MB)
Installing collected packages: keras, tensorboard, tensorflow
  Attempting uninstall: keras
    Found existing installation: keras 3.3.3
    Uninstalling keras-3.3.3:
      Successfully uninstalled keras-3.3.3
  Attempting uninstall: tensorboard
    Found existing installation: tensorboard 2.16.2
    Uninstalling tensorboard-2.16.2:
      Successfully uninstalled tensorboard-2.16.2
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.16.1
    Uninstalling tensorflow-2.16.1:
      Successfully uninstalled tensorflow-2.16.1
[31mERROR: pip's dependency re

**Import the required libraries and set up NLTK**

In [None]:
import os
os.environ['TF_USE_LEGACY_KERAS'] = 'True'  # Set environment variable
import ktrain
from ktrain import text
import re
import nltk
import os
from collections import defaultdict
from pathlib import Path
import pandas as pd
import numpy as np
import tensorflow as tf
import urllib.request
import tarfile
import shutil

# Fetch the NLTK resources needed for tokenizing, stop-word filtering and lemmatizing
for nltk_package in ("punkt", "stopwords", "wordnet", "omw-1.4"):
    nltk.download(nltk_package)

# NLTK helpers used by the text-preprocessing step
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# English stop words, minus 'not': negation matters for sentiment analysis
stop_words = stopwords.words("english")
stop_words.remove("not")

# Shared lemmatizer instance
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


**Define a function to preprocess text data**

In [None]:
# Define function for text preprocessing
def clean_text(text):
    """Normalize a raw review into a single string of lemmatized tokens.

    The text is stripped of HTML tags and non-alphanumeric characters,
    lower-cased, tokenized with NLTK, filtered against the module-level
    ``stop_words`` list and lemmatized with the module-level ``lemmatizer``.

    Args:
        text: Raw review text as a string.

    Returns:
        The surviving lemmatized tokens joined by single spaces (an empty
        string when no token survives).
    """
    # Replace tags with a space instead of deleting them: words separated only
    # by markup (e.g. "good<br />movie") must not be fused into one token.
    text = re.sub(r"<.*?>", " ", text)
    # Collapse every run of non-alphanumeric characters into one space
    text = re.sub(r"[^A-Za-z0-9]+", " ", text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    # Set membership keeps the stop-word test cheap for long reviews
    stop_set = set(stop_words)
    words = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_set]
    return " ".join(words)

**Download and extract IMDb dataset if not already present**

In [None]:
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
filename = 'aclImdb_v1.tar.gz'

# Skip everything when the extracted dataset folder is already present
if not os.path.exists('aclImdb'):
    if not os.path.exists(filename):
        # Download under a temporary name and rename on success, so an
        # interrupted transfer never leaves a truncated archive that a later
        # run would mistake for a complete download.
        partial_download = filename + '.part'
        urllib.request.urlretrieve(url, partial_download)
        os.replace(partial_download, filename)
    # The context manager closes the archive even if extraction fails
    with tarfile.open(filename) as tar:
        if hasattr(tarfile, 'data_filter'):
            # Extraction filters are available: refuse unsafe members
            # (absolute paths, links escaping the destination, ...)
            tar.extractall(filter='data')
        else:
            # Older Python without extraction filters
            tar.extractall()

**Load and preprocess the training data into a DataFrame**

In [None]:
from collections import defaultdict
from pathlib import Path
import pandas as pd

# Read the positive and negative training reviews.
# Files are visited in sorted order so the row order is reproducible
# (directory iteration order is otherwise unspecified) and are decoded
# explicitly as UTF-8 instead of relying on the platform default encoding.
train_frames = {}
for sentiment, folder in (
    ("positive", "aclImdb/train/pos"),
    ("negative", "aclImdb/train/neg"),
):
    reviews = [
        path.read_text(encoding="utf-8") for path in sorted(Path(folder).iterdir())
    ]
    train_frames[sentiment] = pd.DataFrame(
        {"review": reviews, "sentiment": sentiment}
    )

train_pos_df = train_frames["positive"]
train_neg_df = train_frames["negative"]

# Concatenate positive and negative training datasets (positives first)
train_df_list = [train_pos_df, train_neg_df]
train_data_df = pd.concat(train_df_list)

# Normalize every review with the shared text-cleaning function
train_data_df["review"] = train_data_df["review"].apply(clean_text)



**Load and preprocess the test data into a DataFrame**

In [None]:
# Read the positive and negative test reviews.
# Files are visited in sorted order so the row order is reproducible
# (directory iteration order is otherwise unspecified) and are decoded
# explicitly as UTF-8 instead of relying on the platform default encoding.
test_frames = {}
for sentiment, folder in (
    ("positive", "aclImdb/test/pos"),
    ("negative", "aclImdb/test/neg"),
):
    reviews = [
        path.read_text(encoding="utf-8") for path in sorted(Path(folder).iterdir())
    ]
    test_frames[sentiment] = pd.DataFrame(
        {"review": reviews, "sentiment": sentiment}
    )

test_pos_df = test_frames["positive"]
test_neg_df = test_frames["negative"]

# Concatenate positive and negative test datasets (positives first)
test_df_list = [test_pos_df, test_neg_df]
test_data_df = pd.concat(test_df_list)

# Normalize every review with the shared text-cleaning function
test_data_df["review"] = test_data_df["review"].apply(clean_text)

# Prepare training and testing data
train_reviews = train_data_df["review"]
train_sentiments = train_data_df["sentiment"]
test_reviews = test_data_df["review"]
test_sentiments = test_data_df["sentiment"]

**Building the ALBERT model and assigning a learning instance**

In [None]:
# Define and preprocess text for classification using ALBERT model
MODEL_NAME = "albert-base-v2"

# `class_names` is the current keyword; `classes` is kept by ktrain only as a
# backwards-compatibility alias.
# NOTE(review): the labels passed below are strings, and ktrain appears to
# derive the class list from string labels itself -- check
# transformer.get_classes() to confirm the class order actually used.
transformer = text.Transformer(
    MODEL_NAME, maxlen=400, class_names=["positive", "negative"]
)

# Tokenize and encode both splits for the transformer model
train_data = transformer.preprocess_train(
    np.array(train_reviews), np.array(train_sentiments)
)
test_data = transformer.preprocess_test(
    np.array(test_reviews), np.array(test_sentiments)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessing train...
language: en
train sequence lengths:
	mean : 122
	95percentile : 316
	99percentile : 473




Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 119
	95percentile : 308
	99percentile : 470


In [None]:
# Build the ALBERT classifier, then wrap it with the preprocessed
# train/validation data in a ktrain learner (training happens in a later cell)
model = transformer.get_classifier()
learner = ktrain.get_learner(model, train_data=train_data, val_data=test_data, batch_size=6)

**Running the model and evaluating the results**

In [None]:
# One epoch with the one-cycle learning-rate policy (peak learning rate 2e-5)
learner.fit_onecycle(lr=2e-5, epochs=1)



begin training using onecycle policy with max lr of 2e-05...


<tf_keras.src.callbacks.History at 0x7a358f360dc0>

**Prepare the data for the BERT model**

In [None]:
# Locate the extracted IMDb folder relative to the current working directory
CURRENT_DIR = os.getcwd()
IMDB_DATADIR = os.path.join(CURRENT_DIR, "aclImdb")

# Load the pos/neg sub-folders of the train and test splits and preprocess
# them in BERT mode; the returned preprocessor is reused to build the model
train_split, test_split, preproc = text.texts_from_folder(
    datadir=IMDB_DATADIR,
    classes=["pos", "neg"],
    maxlen=500,
    train_test_names=["train", "test"],
    preprocess_mode="bert",
)
(x_train, y_train), (x_test, y_test) = train_split, test_split

detected encoding: utf-8
downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


**Building the BERT model and assigning a learning instance**

In [None]:
# Build the BERT text classifier from the preprocessed training data
# (the model is only constructed here; training happens in a later cell)
bert_train_data = (x_train, y_train)
bert_model = text.text_classifier(name="bert", train_data=bert_train_data, preproc=preproc)

Is Multi-Label? False
maxlen is 500




done.


In [None]:
# Wrap the BERT model with its training and validation data in a ktrain learner
bert_train_pair = (x_train, y_train)
bert_val_pair = (x_test, y_test)
learner = ktrain.get_learner(
    model=bert_model, train_data=bert_train_pair, val_data=bert_val_pair, batch_size=6
)

In [None]:
# One epoch with the one-cycle learning-rate policy (peak learning rate 2e-5)
learner.fit_onecycle(lr=2e-5, epochs=1)



begin training using onecycle policy with max lr of 2e-05...


<tf_keras.src.callbacks.History at 0x7a356b3892d0>