# Text Classification Using an Embedding Model

# Install Libraries

In [4]:
!pip install datasets transformers sentence_transformers



# Loading the dataset

In [5]:
# Bring in the dataset loader from the Hugging Face `datasets` package
from datasets import load_dataset

# Fetch the "rotten_tomatoes" dataset from the Hugging Face Hub;
# `data` is reused by the cells further down
data = load_dataset(path="rotten_tomatoes")

# Print the loaded object to see its splits, features and row counts
print(data)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})


In [6]:
# Peek at the first record of the training split to see the raw fields of one example.
data['train'][0]

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1}

In [7]:
# Inspect a record further into the training split (index 7000) for comparison with the first one.
data["train"][7000]

{'text': "it won't be long before you'll spy i spy at a video store near you .",
 'label': 0}

In [8]:
# Gather the distinct label values that occur in the training split
unique_labels = {*data["train"]["label"]}
print(unique_labels)

{0, 1}


# Use Embeddings for Text Classification

In [9]:
# Importing the SentenceTransformer class from the sentence_transformers library
from sentence_transformers import SentenceTransformer

# Loading a pre-trained model ('all-mpnet-base-v2') for encoding sentences into embeddings
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# NOTE: the progress-bar keyword of `SentenceTransformer.encode` is
# `show_progress_bar`. The previous spelling `show_progress` is not a documented
# parameter: depending on the installed version it is either rejected with a
# TypeError or silently ignored, so no encoding progress bar was displayed.

# Encoding the 'text' field of the training data into embeddings, with progress shown
train_embeddings = model.encode(data['train']['text'], show_progress_bar=True)

# Encoding the 'text' field of the test data into embeddings, with progress shown
test_embeddings = model.encode(data['test']['text'], show_progress_bar=True)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Display the training embeddings returned by model.encode (NumPy elides the middle of large arrays).
train_embeddings

array([[ 0.01492988, -0.0055476 ,  0.01199457, ...,  0.00566546,
        -0.01295628,  0.00153667],
       [ 0.03582863, -0.00235001, -0.02624873, ...,  0.00551064,
        -0.01447113, -0.02090711],
       [ 0.04090172,  0.11052182,  0.02460099, ...,  0.08475419,
         0.01615554,  0.02599418],
       ...,
       [ 0.02865292,  0.02628075,  0.00234627, ..., -0.00321243,
         0.03100069, -0.0142052 ],
       [-0.00342414,  0.04673285, -0.00241927, ..., -0.02206743,
        -0.0588572 , -0.01847415],
       [ 0.05150039,  0.04283322, -0.01324793, ..., -0.00367886,
         0.01647633,  0.00087228]], dtype=float32)

In [11]:
# Shape of the embedding matrix: (number of encoded training texts, embedding dimension).
train_embeddings.shape

(8530, 768)

# Perform Classification

In [12]:
!pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.13-py2.py3-none-any.whl.metadata (12 kB)
Downloading lazypredict-0.2.13-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.13


In [13]:
from lazypredict.Supervised import LazyClassifier

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [14]:
import numpy as np
import random

In [15]:
# Fraction of the training split used for the quick model comparison, and the
# seed that makes the random subsample identical on every Restart & Run All.
SAMPLE_FRACTION = 0.2
SAMPLE_SEED = 42

# Ensure the embeddings are NumPy arrays so they can be indexed with a list of
# positions. `np.asarray` returns the input unchanged when it is already an
# ndarray, avoiding a needless copy of the embedding matrices.
train_embeddings = np.asarray(train_embeddings)
test_embeddings = np.asarray(test_embeddings)

# Converting the 'train' labels from the dataset to a NumPy array
train_labels = np.array(data['train']['label'])

# Converting the 'test' labels from the dataset to a NumPy array
test_labels = np.array(data['test']['label'])

# Number of training rows to keep (SAMPLE_FRACTION of the training data)
sample_size = int(SAMPLE_FRACTION * len(train_embeddings))

# Draw distinct row positions without replacement from a dedicated, seeded
# generator: the same rows are selected on every run, and the global `random`
# state is left untouched.
random_indices = random.Random(SAMPLE_SEED).sample(range(len(train_embeddings)), sample_size)

# Creating a subset of the training embeddings using the selected random indices
sampled_train_embeddings = train_embeddings[random_indices]

# Creating a subset of the training labels using the same random indices,
# so every embedding row stays aligned with its label
sampled_train_labels = train_labels[random_indices]

In [16]:
# Build the LazyClassifier (imported in an earlier cell).
# 'verbose=0' suppresses detailed output, and 'ignore_warnings=True' disables warnings during fitting.
clf = LazyClassifier(
    verbose=0,
    ignore_warnings=True,
)

# Train on the sampled training embeddings/labels and score on the test
# embeddings/labels. `fit` hands back two objects; judging by the table shown in
# the next cell, `predictions` holds the per-model metric table here.
models, predictions = clf.fit(
    X_train=sampled_train_embeddings,
    X_test=test_embeddings,
    y_train=sampled_train_labels,
    y_test=test_labels,
)


 97%|█████████▋| 31/32 [01:11<00:01,  1.89s/it]

[LightGBM] [Info] Number of positive: 889, number of negative: 817
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016841 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 1706, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.521102 -> initscore=0.084458
[LightGBM] [Info] Start training from score 0.084458


100%|██████████| 32/32 [01:22<00:00,  2.57s/it]


In [17]:
# Show the table returned by LazyClassifier.fit as its second value.
predictions

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SVC,0.84,0.84,0.84,0.84,1.27
NuSVC,0.84,0.84,0.84,0.84,2.09
GaussianNB,0.83,0.83,0.83,0.83,0.07
BernoulliNB,0.83,0.83,0.83,0.83,0.12
NearestCentroid,0.83,0.83,0.83,0.83,0.14
ExtraTreesClassifier,0.82,0.82,0.82,0.82,0.78
LGBMClassifier,0.82,0.82,0.82,0.82,10.4
RandomForestClassifier,0.82,0.82,0.82,0.82,4.1
AdaBoostClassifier,0.8,0.8,0.8,0.8,15.52
BaggingClassifier,0.78,0.78,0.78,0.78,19.79
