# Text Classification Using an Embedding Model

# Install Libraries

In [None]:
!pip install datasets transformers sentence_transformers

# Loading the dataset

In [None]:
# `load_dataset` comes from the Hugging Face `datasets` library
from datasets import load_dataset

# Load the "rotten_tomatoes" dataset by name
data = load_dataset("rotten_tomatoes")

# Print a summary of what was loaded
print(data)


In [None]:
# Display the first record of the training split to see what one example looks like
data['train'][0]

In [None]:
# Display the training record at index 7000
# NOTE(review): presumably picked to show an example from the other class — confirm
data["train"][7000]

In [None]:
# Collect the distinct values that occur in the training split's 'label' column
unique_labels = set(data["train"]["label"])

# Show which label values the classifier will have to distinguish
print(unique_labels)

# Use Embeddings for Text Classification

In [None]:
# SentenceTransformer is the class used to encode sentences into embeddings
from sentence_transformers import SentenceTransformer

# Load the pre-trained 'all-mpnet-base-v2' model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode the 'text' field of the training data into embeddings.
# - The progress-bar switch of `encode` is called `show_progress_bar`
#   (`show_progress` is not one of its parameters).
# - `list(...)` hands `encode` a plain list of strings, independent of the
#   container type the dataset uses for a column.
train_embeddings = model.encode(list(data['train']['text']), show_progress_bar=True)

# Encode the 'text' field of the test data the same way
test_embeddings = model.encode(list(data['test']['text']), show_progress_bar=True)


In [None]:
# Display the training embeddings produced by `model.encode`
train_embeddings

In [None]:
# Display the shape of the training embeddings
# (presumably (number of training texts, embedding dimension) — TODO confirm)
train_embeddings.shape

# Perform Classification

In [None]:
!pip install lazypredict

In [None]:
from lazypredict.Supervised import LazyClassifier

In [None]:
import numpy as np
import random

In [None]:
# Share of the training set used to fit the candidate models, and the seed that
# makes the random subsample identical on every Restart & Run All.
SAMPLE_FRACTION = 0.2
SAMPLE_SEED = 42

# Make sure embeddings are NumPy arrays; `np.asarray` converts only when needed
# and does not copy an input that already is an ndarray.
train_embeddings = np.asarray(train_embeddings)
test_embeddings = np.asarray(test_embeddings)

# Labels of both splits as NumPy arrays so they can be indexed like the embeddings
train_labels = np.asarray(data['train']['label'])
test_labels = np.asarray(data['test']['label'])

# Every embedding needs exactly one label, otherwise the sampled pairs would be misaligned
assert len(train_embeddings) == len(train_labels), "train embeddings/labels length mismatch"
assert len(test_embeddings) == len(test_labels), "test embeddings/labels length mismatch"

# Number of training examples to keep
sample_size = int(SAMPLE_FRACTION * len(train_embeddings))

# Draw distinct indices from a dedicated, seeded generator: the draw is
# reproducible and the global `random` state is left untouched.
random_indices = random.Random(SAMPLE_SEED).sample(range(len(train_embeddings)), sample_size)

# Apply the same indices to embeddings and labels so the pairs stay aligned
sampled_train_embeddings = train_embeddings[random_indices]
sampled_train_labels = train_labels[random_indices]

In [None]:
# Build the LazyClassifier (imported in an earlier cell):
# 'verbose=0' suppresses detailed output, 'ignore_warnings=True' disables warnings during fitting
clf = LazyClassifier(verbose=0, ignore_warnings=True)

# Fit on the sampled training embeddings/labels and evaluate on the test embeddings/labels.
# The arguments are passed by keyword because `fit` expects both feature sets
# first and both label sets afterwards.
# NOTE(review): unless the classifier is created with predictions=True, lazypredict
# appears to return the score table for both values — confirm against the installed version.
models, predictions = clf.fit(
    X_train=sampled_train_embeddings,
    X_test=test_embeddings,
    y_train=sampled_train_labels,
    y_test=test_labels,
)


In [None]:
# Display the second value returned by `clf.fit`
# NOTE(review): with LazyClassifier's default settings this is presumably the
# per-model score table rather than per-sample predictions — confirm
predictions