# Text Classification Using a Task-Specific Encoder-Only Model

---



## Aim: to classify movie reviews into positive or negative categories

# Installing Libraries

In [None]:
# Install the `datasets` and `transformers` packages that the cells below import from.
!pip install datasets transformers

# Loading the Dataset

In [None]:
# `load_dataset` fetches a named dataset through the Hugging Face `datasets` library.
from datasets import load_dataset

# Load the "rotten_tomatoes" dataset. The result is indexed by split name:
# this notebook reads data["train"] and data["test"] further down.
data = load_dataset("rotten_tomatoes")

# Print the loaded dataset object to inspect its contents and structure.
print(data)


In [None]:
# Peek at the first record of the training split.
print(data["train"][0])

In [None]:
# Peek at the training record at index 6000 for comparison with the first one.
print(data["train"][6000])

# Text Classification with Representation Model: TASK SPECIFIC

In [None]:
# 'torch' is only needed here to find out whether a CUDA GPU is present.
import torch
# 'pipeline' builds a ready-to-use inference wrapper around a pre-trained model.
from transformers import pipeline

# Model ID of the pre-trained, task-specific sentiment classifier.
model_id = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Run on the first GPU when one is available, otherwise fall back to the CPU.
# Hard-coding 'cuda:0' makes this cell fail on CPU-only runtimes.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Build the sentiment analysis pipeline.
# NOTE(review): recent `transformers` releases deprecate `return_all_scores`
# in favour of `top_k=None`. That switch also changes the output ordering
# (sorted by score), so it must be made together with any cell that reads
# the scores by position.
pipe = pipeline(
    model=model_id,          # The model identifier for the sentiment analysis task
    tokenizer=model_id,      # The tokenizer corresponding to the model
    return_all_scores=True,  # Return scores for every class, not just the top one
    device=device,           # GPU when available, CPU otherwise
)


# Let's run predictions on the test split of the dataset

In [None]:
# Keep a handle on the 'test' split; the evaluation loop further down iterates over it.
test_data = data['test']

# Take the "text" field of the first test record as a single smoke-test input.
sample_test_datapoint = test_data[0]["text"]

# Run the pipeline on that one string. The call returns a list and [0] selects
# its first element, which later cells index per class and read "score" from.
prediction = pipe(sample_test_datapoint)[0]

# Show the raw model output for the sample.
print(prediction)


In [None]:
# Display the raw text of the first test example (the same input that was scored above).
test_data[0]["text"]

In [None]:
# Importing the 'numpy' library for array manipulation and mathematical operations
import numpy as np

# Index the per-class results by label rather than by position, so the lookup
# stays correct even if the pipeline returns the classes in a different order
# (for example, sorted by score).
scores_by_label = {entry["label"].lower(): entry["score"] for entry in prediction}

# The model also emits a "neutral" class. The dataset is binary, so only the
# negative and positive scores are compared.
negative_score = scores_by_label["negative"]
positive_score = scores_by_label["positive"]

# 'np.argmax' returns 0 when the negative score is highest (ties included)
# and 1 when the positive score is highest, matching the dataset's labels.
final_prediction = int(np.argmax([negative_score, positive_score]))

# Printing the final prediction (0 for negative, 1 for positive)
print(final_prediction)


# Let's evaluate on the entire test set

In [None]:
# Importing the 'tqdm' library to display a progress bar for the loop
from tqdm import tqdm

# Final 0/1 prediction for every test example, in dataset order.
predictions = []

# Score each test example, showing a progress bar labelled "predicting...".
for test_data_point in tqdm(test_data, desc="predicting..."):
    # For a single string the pipeline returns a list; [0] is the list of
    # per-class {"label", "score"} entries for this example.
    prediction = pipe(test_data_point['text'])[0]

    # Look scores up by label instead of by position so the mapping does not
    # depend on the order in which the pipeline lists the classes.
    scores_by_label = {entry["label"].lower(): entry["score"] for entry in prediction}
    negative_score = scores_by_label["negative"]
    positive_score = scores_by_label["positive"]

    # The "neutral" class is ignored: 0 means negative wins (ties included),
    # 1 means positive wins, matching the dataset's binary labels.
    final_prediction = int(np.argmax([negative_score, positive_score]))

    predictions.append(final_prediction)


# Let's compute the Classification Report / Confusion Matrix

In [None]:
# Importing the evaluation helpers from the 'sklearn.metrics' module
from sklearn.metrics import classification_report, confusion_matrix

# Ground-truth labels of the test split, materialised as a plain list so that
# scikit-learn receives a standard sequence whatever column type the
# `datasets` library returns.
y_true = list(data['test']["label"])

# Class ids in the order matching `target_names` (0 = negative, 1 = positive).
# Passing them explicitly keeps the names bound to the right class even if
# one class never appears in the predictions.
class_ids = [0, 1]
class_names = ["Negative Movie Review", "Positive Movie Review"]

# Precision, recall and F1-score per class.
report = classification_report(y_true, predictions,
                               labels=class_ids,
                               target_names=class_names)
print(report)

# Confusion matrix: rows are true classes, columns are predicted classes,
# both in `class_ids` order.
print(confusion_matrix(y_true, predictions, labels=class_ids))
