In [1]:
!pip install transformers datasets

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding, pipeline
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import torch

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:

def load_data():
    dataset = load_dataset("amazon_polarity")
    return dataset


def preprocess_data(dataset, tokenizer):
    def tokenize_function(examples):
        return tokenizer(examples["content"], truncation=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    return tokenized_datasets


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


def train_model(tokenized_datasets, tokenizer):
    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


    train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(3000))
    test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1500))


    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="epoch",
        logging_dir="./logs",
        logging_steps=25,
        load_best_model_at_end=True,
        report_to="none"
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
    print("Evaluation Results:", eval_results)

    return model

def analyze_sentiments(texts, sentiment_pipeline):


    label_map = {0: "NEGATIVE", 1: "POSITIVE"}
    results = []

    for text in texts:
        result = sentiment_pipeline(text)[0]
        label = label_map[int(result['label'][-1])]  # Map label to positive/negative
        confidence = f"{result['score']:.3f}"
        results.append({"TEXT": text, "SENTIMENT": label, "CONFIDENCE": confidence})

    return results



In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU instead")


Using GPU: Tesla T4


In [4]:

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    dataset = load_data()


    train_texts = dataset["train"]["content"]
    train_labels = dataset["train"]["label"]
    label_map = {0: "negative", 1: "positive"}
    for i, (text, label) in enumerate(zip(train_texts[:5], train_labels[:5])):
        print(f"Review {i + 1}: {text}\nSentiment: {label_map[label]}\n")

    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    tokenized_datasets = preprocess_data(dataset, tokenizer)

    model = train_model(tokenized_datasets, tokenizer)


    sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

train-00000-of-00004.parquet:   0%|          | 0.00/260M [00:00<?, ?B/s]

train-00001-of-00004.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

train-00002-of-00004.parquet:   0%|          | 0.00/255M [00:00<?, ?B/s]

train-00003-of-00004.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/117M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3600000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/400000 [00:00<?, ? examples/s]

Review 1: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
Sentiment: positive

Review 2: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.
Sentiment: positive

Review 3: This soundtrack is my favorite music of all time, hands down. 

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/3600000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2747,0.269524,0.892667,0.925352,0.858824,0.890847
2,0.1546,0.320447,0.896,0.881101,0.920261,0.900256
3,0.0992,0.376661,0.892667,0.876559,0.918954,0.897256


Evaluation Results: {'eval_loss': 0.269524484872818, 'eval_accuracy': 0.8926666666666667, 'eval_precision': 0.9253521126760563, 'eval_recall': 0.8588235294117647, 'eval_f1': 0.8908474576271187, 'eval_runtime': 8.5803, 'eval_samples_per_second': 174.818, 'eval_steps_per_second': 10.955, 'epoch': 3.0}


In [5]:
texts = ["This product exceeded my expectations! The quality is fantastic, and the delivery was super fast."]
analyze_sentiments(texts, sentiment_pipeline)

[{'TEXT': 'This product exceeded my expectations! The quality is fantastic, and the delivery was super fast.',
  'SENTIMENT': 'POSITIVE',
  'CONFIDENCE': '0.968'}]

In [6]:
texts = ["Delivery was late, and the item was nothing like the description. Very disappointed."]
analyze_sentiments(texts, sentiment_pipeline)

[{'TEXT': 'Delivery was late, and the item was nothing like the description. Very disappointed.',
  'SENTIMENT': 'NEGATIVE',
  'CONFIDENCE': '0.970'}]

In [7]:
texts = ["The design is nice, but the functionality is average. Could use some improvements."]
analyze_sentiments(texts, sentiment_pipeline)


[{'TEXT': 'The design is nice, but the functionality is average. Could use some improvements.',
  'SENTIMENT': 'POSITIVE',
  'CONFIDENCE': '0.737'}]

In [8]:
texts = ["The product stopped working after a week. Waste of money. Do not buy this."]
analyze_sentiments(texts, sentiment_pipeline)

[{'TEXT': 'The product stopped working after a week. Waste of money. Do not buy this.',
  'SENTIMENT': 'NEGATIVE',
  'CONFIDENCE': '0.973'}]

In [9]:
texts = ["I absolutely love this gadget! It works exactly as described and has made my life so much easier."]
analyze_sentiments(texts, sentiment_pipeline)

[{'TEXT': 'I absolutely love this gadget! It works exactly as described and has made my life so much easier.',
  'SENTIMENT': 'POSITIVE',
  'CONFIDENCE': '0.974'}]

In [10]:
texts = ["Packaging was decent, and delivery was on time. Haven't tried the product yet."]
analyze_sentiments(texts, sentiment_pipeline)

[{'TEXT': "Packaging was decent, and delivery was on time. Haven't tried the product yet.",
  'SENTIMENT': 'POSITIVE',
  'CONFIDENCE': '0.737'}]

In [11]:
texts = ["Terrible experience! The item arrived damaged, and the seller refused to provide a replacement."]
analyze_sentiments(texts, sentiment_pipeline)

[{'TEXT': 'Terrible experience! The item arrived damaged, and the seller refused to provide a replacement.',
  'SENTIMENT': 'NEGATIVE',
  'CONFIDENCE': '0.966'}]

In [12]:
texts = ["The product was okay. Not great, but not bad either. You get what you pay for."]
analyze_sentiments(texts, sentiment_pipeline)

[{'TEXT': 'The product was okay. Not great, but not bad either. You get what you pay for.',
  'SENTIMENT': 'POSITIVE',
  'CONFIDENCE': '0.784'}]

In [13]:
texts = ["The quality is awful. It broke on the first use, and the material feels really cheap."]
analyze_sentiments(texts, sentiment_pipeline)

[{'TEXT': 'The quality is awful. It broke on the first use, and the material feels really cheap.',
  'SENTIMENT': 'NEGATIVE',
  'CONFIDENCE': '0.968'}]

In [14]:
texts = ["Worst customer service ever. They ignored my emails and refused to issue a refund."]
analyze_sentiments(texts, sentiment_pipeline)

[{'TEXT': 'Worst customer service ever. They ignored my emails and refused to issue a refund.',
  'SENTIMENT': 'NEGATIVE',
  'CONFIDENCE': '0.970'}]

In [16]:
texts = ["Customer service was excellent. They responded quickly and resolved my issue without any hassle."]
analyze_sentiments(texts, sentiment_pipeline)

[{'TEXT': 'Customer service was excellent. They responded quickly and resolved my issue without any hassle.',
  'SENTIMENT': 'POSITIVE',
  'CONFIDENCE': '0.927'}]

In [17]:
texts = ["Great value for the price. Highly recommend to anyone looking for a budget-friendly option."]
analyze_sentiments(texts, sentiment_pipeline)

[{'TEXT': 'Great value for the price. Highly recommend to anyone looking for a budget-friendly option.',
  'SENTIMENT': 'POSITIVE',
  'CONFIDENCE': '0.970'}]