In [4]:
from datasets import load_dataset

# Load the Rotten Tomatoes movie-review dataset; splits are accessed by name
# (e.g. data['train'], data['test']).
data = load_dataset(path='cornell-movie-review-data/rotten_tomatoes')

In [5]:
# Peek at the first two and the last example of the test split
data['test'][(0, 1, -1)]

{'text': ['lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .',
  'consistently clever and suspenseful .',
  "enigma is well-made , but it's just too dry and too placid ."],
 'label': [1, 1, 0]}

In [15]:
from sklearn.metrics import classification_report

def evaluate_performance(y_true, y_pred, target_names=('Negative Review', 'Positive Review')):
    """Print a per-class precision/recall/F1 report for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_true``.
        target_names: Display names for the classes, in label order. Defaults
            to the two sentiment classes used in this notebook, so existing
            two-argument calls behave exactly as before.

    Returns:
        None. The report is printed rather than returned, so calling this
        function as the last line of a cell does not echo the text twice.
    """
    # Materialise as a list so any sequence (tuple, list, ...) is accepted.
    performance = classification_report(y_true, y_pred, target_names=list(target_names))
    print(performance)

## Text Classification Using Embeddings

In [6]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode reviews (and, later, label descriptions)
model = SentenceTransformer(model_name_or_path='sentence-transformers/all-mpnet-base-v2')

In [7]:
# Encode the review texts of each split into embeddings (train first, then test)
train_embeddings, test_embeddings = (
    model.encode(data[split]['text'], show_progress_bar=True)
    for split in ('train', 'test')
)

Batches:   0%|          | 0/267 [00:00<?, ?it/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

In [9]:
# Sanity check: dimensions of the test embedding matrix (rows, embedding size)
test_embeddings.shape

(1066, 768)

### Apply Logistic Regression for text classification

In [11]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training embeddings and their labels
clf = LogisticRegression(random_state=42)
clf.fit(X=train_embeddings, y=data['train']['label'])

In [13]:
# Classify the held-out test embeddings with the fitted model
y_pred = clf.predict(X=test_embeddings)

In [14]:
# Report precision/recall/F1 of the logistic-regression predictions on the test split
evaluate_performance(y_true=data["test"]["label"], y_pred=y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.86      0.85       533
Positive Review       0.86      0.85      0.85       533

       accuracy                           0.85      1066
      macro avg       0.85      0.85      0.85      1066
   weighted avg       0.85      0.85      0.85      1066



## Text Classification by Averaging the Embeddings per Class

In [18]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
# Average the embeddings of all sentences in each target label.
# Grouping by the label array directly (rather than appending the labels as an
# extra column and grouping by its position) keeps this independent of the
# embedding width and avoids casting the integer labels to float.
train_labels = np.asarray(data['train']['label'])
class_means = (
    pd.DataFrame(np.asarray(train_embeddings, dtype=np.float64))
    .groupby(train_labels)
    .mean()
)
averaged_target_embeddings = class_means.values

# Find the best matching embeddings between evaluation documents and target embeddings
sim_matrix = cosine_similarity(test_embeddings, averaged_target_embeddings)
# argmax gives a row position in `class_means`; map it back to the actual label value
y_pred = class_means.index.to_numpy()[np.argmax(sim_matrix, axis=1)]

# Evaluate the model
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.85      0.84      0.84       533
Positive Review       0.84      0.85      0.84       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066



## Zero-Shot Classification

In [45]:
# Embed one natural-language description per class (position 0: negative, 1: positive)
label_embeddings = model.encode(
    sentences=[
        'A very negative movie review',
        'a very positive movie review',
    ],
)

In [46]:
# Score every test embedding against each label description and keep the closest one
sim_matrix = cosine_similarity(X=test_embeddings, Y=label_embeddings)
y_pred = sim_matrix.argmax(axis=1)

# Report how well the zero-shot predictions match the true test labels
evaluate_performance(y_true=data["test"]["label"], y_pred=y_pred)

                 precision    recall  f1-score   support

Negative Review       0.86      0.73      0.79       533
Positive Review       0.76      0.88      0.82       533

       accuracy                           0.80      1066
      macro avg       0.81      0.80      0.80      1066
   weighted avg       0.81      0.80      0.80      1066

