In [None]:
!pip install datasets transformers sentence_transformers

# Loading the dataset

In [None]:
from datasets import load_dataset

# Load the "rotten_tomatoes" dataset; the splits used by the later cells
# ("train" and "test") are reached through the single name `data`.
data = load_dataset(path="rotten_tomatoes")

# Print the loaded dataset object.
print(data)


In [None]:
# Display the first record of the "train" split.
data["train"][0]

In [None]:
# Display the "train" record at index 7000.
# NOTE(review): presumably chosen to show an example of the other class than
# index 0 — confirm against the displayed output.
data["train"][7000]

# Load Embeddings Model

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained "all-mpnet-base-v2" sentence-embedding checkpoint.
# The resulting `model` is the object the later cells call `.encode` on.
model = SentenceTransformer(model_name_or_path="sentence-transformers/all-mpnet-base-v2")

# Generate Embeddings for Labels

In [None]:
# Embed one short description per class. The list order fixes the class index
# that the prediction step relies on: position 0 is the negative description,
# position 1 the positive description.
label_embeddings = model.encode(
    [
        "Negative movie review",
        "Positive movie review",
    ]
)

# Generate Embeddings for Test Data (Text)

In [None]:
# Embed the "text" column of the "test" split with the same model that
# produced the label embeddings; a progress bar is shown while encoding runs.
test_embeddings = model.encode(
    data["test"]["text"],
    show_progress_bar=True,
)

# Zero-Shot Classification

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Score every test embedding against both label embeddings.
# Rows follow the test sentences; columns follow the label embeddings
# (column 0 = "Negative", column 1 = "Positive").
sim_matrix = cosine_similarity(X=test_embeddings, Y=label_embeddings)

# For each row, the column holding the highest similarity is the predicted
# label (0 for "Negative", 1 for "Positive").
predictions = np.argmax(sim_matrix, axis=-1)

# Compute the Classification Report / Confusion Matrix

In [None]:
from sklearn.metrics import classification_report

# Ground-truth labels: the "label" column of the "test" split.
y_true = data["test"]["label"]

# Compare the predictions with the ground truth; the report lists precision,
# recall and F1-score for each class, named in label-index order.
report = classification_report(
    y_true=y_true,
    y_pred=predictions,
    target_names=[
        "Negative Movie Review",
        "Positive Movie Review",
    ],
)

# Print the report text.
print(report)
