<a href="https://colab.research.google.com/github/muhammadfadlankamal/Kelompok-30-UAS-AI/blob/main/Kelompok_30.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install required packages with specific versions
!pip install transformers==4.44.2 datasets==2.21.0 torch==2.4.1 scikit-learn==1.5.1 pandas==2.2.2 tqdm==4.66.5 fsspec==2024.6.1

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from datasets import load_dataset
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from tqdm import tqdm

# Seed PyTorch's global RNG so that steps drawing from it (new classifier-head
# initialisation, DataLoader shuffling, dropout) start from the same state on every
# Restart & Run All. Some GPU kernels may still be non-deterministic.
SEED = 42
torch.manual_seed(SEED)

# Set device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.21.0
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting torch==2.4.1
  Downloading torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting scikit-learn==1.5.1
  Downloading scikit_learn-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting tqdm==4.66.5
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fsspec==2024.6.1
  Downloading fsspec-2024.6.1-py3-none-any.whl.metadata (11 kB)
Collecting tokenizers<0.2

In [4]:
# Load dataset from Hugging Face with error handling.
# Only the first N_TRAIN_SAMPLES / N_TEST_SAMPLES rows of each split are kept so the
# notebook executes faster; raise these values for a more meaningful run.
N_TRAIN_SAMPLES = 300
N_TEST_SAMPLES = 300

try:
    dataset = load_dataset("amazon_polarity")
    train_data = dataset["train"].select(range(N_TRAIN_SAMPLES))
    test_data = dataset["test"].select(range(N_TEST_SAMPLES))
except Exception as e:
    # Surface a readable hint, then re-raise so the notebook stops here instead of
    # failing later on undefined variables.
    print(f"Error loading dataset: {e}")
    print("Please check your internet connection or try again later.")
    raise

# Load the pretrained BERT tokenizer and a sequence-classification model with a
# two-label head from the same checkpoint, then move the model to `device`.
# (The classification head is not part of the checkpoint, so it starts untrained.)
checkpoint_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Preprocess dataset
def preprocess_data(data, max_length=128):
    """Tokenise review records with the module-level ``tokenizer``.

    Args:
        data: Iterable of records, each exposing ``"content"`` (the review text)
            and ``"label"`` keys. It is iterated twice.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        A 3-tuple ``(input_ids, attention_mask, labels)`` where the first two are
        taken from the tokenizer output and ``labels`` is ``torch.tensor`` of the
        collected labels.
    """
    review_texts = []
    for record in data:
        review_texts.append(record["content"])

    sentiment_labels = []
    for record in data:
        sentiment_labels.append(record["label"])

    # Relies on the global `tokenizer` created in an earlier cell.
    encoded = tokenizer(
        review_texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    ids = encoded["input_ids"]
    mask = encoded["attention_mask"]
    return ids, mask, torch.tensor(sentiment_labels)

# Tokenise the training subset, then hold out 20% of it for validation
# (fixed random_state so the split is the same on every run).
input_ids, attention_mask, labels = preprocess_data(train_data)
split_parts = train_test_split(
    input_ids,
    attention_mask,
    labels,
    test_size=0.2,
    random_state=42,
)
(
    train_inputs,
    val_inputs,
    train_masks,
    val_masks,
    train_labels,
    val_labels,
) = split_parts

# Wrap each split as (input_ids, attention_mask, label) triples and batch them.
# Only the training loader shuffles.
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = TensorDataset(val_inputs, val_masks, val_labels)
val_loader = DataLoader(val_dataset, batch_size=32)

In [6]:
# Training setup: AdamW over all model parameters, three passes over the data.
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3

# Training loop: one optimisation step per batch; the mean batch loss is
# reported at the end of every epoch.
model.train()
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    batch_losses = []
    for batch in tqdm(train_loader):
        # Each batch is (input_ids, attention_mask, labels); move all to `device`.
        b_input_ids, b_attention_mask, b_labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model(b_input_ids, attention_mask=b_attention_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(train_loader)
    print(f"Average training loss: {avg_loss:.4f}")

Epoch 1/3


100%|██████████| 8/8 [05:30<00:00, 41.33s/it]


Average training loss: 0.6953
Epoch 2/3


100%|██████████| 8/8 [05:06<00:00, 38.25s/it]


Average training loss: 0.5860
Epoch 3/3


100%|██████████| 8/8 [05:07<00:00, 38.39s/it]

Average training loss: 0.4478





In [None]:
# Validation: collect the arg-max class and the true label for every held-out row.
model.eval()
val_predictions, val_true_labels = [], []

with torch.no_grad():  # inference only, no gradients needed
    for batch in tqdm(val_loader):
        b_input_ids, b_attention_mask, b_labels = (t.to(device) for t in batch)
        outputs = model(b_input_ids, attention_mask=b_attention_mask)
        logits = outputs.logits
        # Highest-scoring class per row, moved back to the CPU as a NumPy array.
        val_predictions.extend(logits.argmax(dim=1).cpu().numpy())
        val_true_labels.extend(b_labels.cpu().numpy())

100%|██████████| 7/7 [01:22<00:00, 11.81s/it]


In [None]:
# Calculate metrics on the validation predictions collected above.
accuracy = accuracy_score(val_true_labels, val_predictions)
prf_scores = precision_recall_fscore_support(
    val_true_labels,
    val_predictions,
    average="binary",
)
# The fourth returned element (support) is not reported.
precision, recall, f1 = prf_scores[:3]

report_lines = [
    f"Validation Accuracy: {accuracy:.4f}",
    f"Validation Precision: {precision:.4f}",
    f"Validation Recall: {recall:.4f}",
    f"Validation F1-Score: {f1:.4f}",
]
for report_line in report_lines:
    print(report_line)

Validation Accuracy: 0.9150
Validation Precision: 0.9048
Validation Recall: 0.8941
Validation F1-Score: 0.8994


In [None]:
# Simulated product table (hand-written demo rows, not loaded from a real dataset).
# Note: recommend_products_based_on_keywords does not read this frame; it uses its
# own catalog.
product_rows = [
    (1, "Smartphone X", "Electronics", 4.5),
    (2, "Laptop Y", "Electronics", 4.2),
    (3, "Headphones Z", "Audio", 4.8),
    (4, "Tablet A", "Electronics", 4.0),
]
products = pd.DataFrame(
    product_rows,
    columns=["product_id", "name", "category", "rating"],
)

In [None]:
# Demo catalog used by recommend_products_based_on_keywords.
# Built once at definition time instead of on every call.
_RECOMMENDATION_CATALOG = pd.DataFrame({
    "name": ["Smartphone X", "Laptop Y", "Headphones Z", "Tablet A", "Camera B",
             "Fast Charger C", "Bluetooth Speaker D"],
    "rating": [4.5, 4.2, 4.8, 4.0, 3.9, 4.6, 4.4],
    "category": ["smartphone", "laptop", "headphones", "tablet", "camera",
                 "charger", "speaker"],
})

# Category -> lower-case keywords (single words or space-separated phrases)
# whose presence in a review selects that category.
_CATEGORY_KEYWORDS = {
    "smartphone":  {"phone", "smartphone", "android", "iphone"},
    "laptop":      {"laptop", "notebook", "macbook"},
    "headphones":  {"headphone", "headphones", "earphone", "earbud"},
    "tablet":      {"tablet", "ipad", "galaxy tab"},
    "camera":      {"camera", "dslr", "mirrorless", "camcorder"},
    "charger":     {"charger", "charging", "powerex", "mh-c204f"},
    "speaker":     {"speaker", "bluetooth", "music", "sound"},
}


def _review_mentions(keyword, tokens, padded_text):
    """Return True if ``keyword`` occurs in the review as a whole word or whole phrase."""
    if " " in keyword:
        # Multi-word keyword: look for the phrase delimited by spaces so that
        # "galaxy tab" does not match inside e.g. "galaxy table".
        return f" {keyword} " in padded_text
    return keyword in tokens


def recommend_products_based_on_keywords(review_text: str, sentiment: int):
    """Recommend up to three catalog products based on keywords in a review.

    Args:
        review_text: Raw review text. ``None`` or ``""`` is treated as a review
            without any keywords.
        sentiment: Predicted class; ``0`` means negative, anything else is
            treated as positive.

    Returns:
        The string ``"No recommendations for negative sentiment."`` when
        ``sentiment == 0``. Otherwise a DataFrame with columns ``name`` and
        ``rating`` holding at most three rows, sorted by rating (descending):
        products from every category whose keywords appear in the review, or
        the three best-rated products overall if no keyword is found.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    if sentiment == 0:
        return "No recommendations for negative sentiment."

    # Tokenise on letters/digits (keeping inner hyphens, e.g. "mh-c204f") so that
    # trailing punctuation such as "phone!" or "music," no longer hides a keyword.
    words = re.findall(r"[a-z0-9]+(?:-[a-z0-9]+)*", (review_text or "").lower())
    tokens = set(words)
    padded_text = " " + " ".join(words) + " "

    # Detect every category with at least one keyword present in the review.
    detected_categories = [
        category
        for category, keywords in _CATEGORY_KEYWORDS.items()
        if any(_review_mentions(kw, tokens, padded_text) for kw in keywords)
    ]

    candidates = _RECOMMENDATION_CATALOG
    if detected_categories:
        candidates = candidates[candidates["category"].isin(detected_categories)]
    # With no detected category this is the fallback: the three best-rated products.
    return candidates.sort_values("rating", ascending=False)[["name", "rating"]].head(3)


In [None]:
def preprocess_data(data, tokenizer, max_length=128):
    """Tokenise review records with an explicitly supplied tokenizer.

    This definition replaces the earlier ``preprocess_data(data, max_length)``
    from the preprocessing cell; once this cell has run, callers must pass the
    tokenizer.

    Args:
        data: Iterable of records, each exposing ``"content"`` and ``"label"``
            keys. It is iterated twice.
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding=True, max_length=max_length, return_tensors="pt")`` whose
            result is indexable by ``"input_ids"`` and ``"attention_mask"``.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        A 3-tuple ``(input_ids, attention_mask, labels)``; ``labels`` is
        ``torch.tensor`` of the collected labels.
    """
    review_texts = []
    for record in data:
        review_texts.append(record["content"])

    sentiment_labels = []
    for record in data:
        sentiment_labels.append(record["label"])

    encoded = tokenizer(
        review_texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    ids = encoded["input_ids"]
    mask = encoded["attention_mask"]
    return ids, mask, torch.tensor(sentiment_labels)


In [None]:
# Example inference: classify the first five test reviews (labels are ignored here).
sample_reviews = test_data.select(range(5))
test_input_ids, test_attention_mask, _ = preprocess_data(sample_reviews, tokenizer)

model.eval()
test_input_ids = test_input_ids.to(device)
test_attention_mask = test_attention_mask.to(device)
with torch.no_grad():  # forward pass only
    outputs = model(test_input_ids, attention_mask=test_attention_mask)

# Index of the highest logit per review, as a NumPy array on the CPU.
test_predictions = outputs.logits.argmax(dim=1).cpu().numpy()

In [None]:
# Print each sampled review with its predicted sentiment and the matching
# product recommendation.
sample_texts = test_data[:5]["content"]
for i, (text, pred) in enumerate(zip(sample_texts, test_predictions)):
    if pred == 1:
        sentiment_label = "Positive"
    else:
        sentiment_label = "Negative"

    print(f"\nReview {i+1}: {text}...")
    print(f"Predicted Sentiment: {sentiment_label}")
    print("Recommendation:")
    recommendation = recommend_products_based_on_keywords(text, pred)
    print(recommendation)


Review 1: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life's hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"...
Predicted Sentiment: Positive
Recommendation:
             name  rating
2    Headphones Z     4.8
5  Fast Charger C     4.6
0    Smartphone X     4.5

Review 2: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad 