In [8]:
%pip install pandas torch transformers scikit-learn






[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import pandas as pd

# Load data
data = pd.read_csv('aqua_OnlineMedia_21-Jun-2024_20-Sep-2024_auY2Vi02Qm.csv', delimiter=';')

print(data.head())


                                         original_id  \
0  'b794bad0873930adb3fc1aac4d3155f1a3f3d2a1ca722...   
1  '8fa227aab334c2175f9b375b21adc1b41dbc0fa1a6438...   
2  '5a69d8466ca6ffb91a79fb6444aeadf8fb7dac06d5bf4...   
3  '8c22a2209c57ada189a0becfef7e62c8f4f37ddafd69a...   
4  '3fecdfab67d7cfeed43ceee0125f9e563e7c666a478c8...   

                       source_name  \
0  'mediapakuan.pikiran-rakyat.com   
1                'suaramerdeka.com   
2                           'rm.id   
3      'magetan.pikiran-rakyat.com   
4                'suaramerdeka.com   

                                               title  \
0  'Peluang Emas, Pabrik Le Minerale (Mayora Grou...   
1  'Melalui Program Persija Belajar Bola Bareng, ...   
2  'Le Minerale Dan Persija Majukan Talenta Muda ...   
3  'Fresh Graduate Merapat PT Tirta Fresindo Jaya...   
4  'Melalui Program Persija Belajar Bola Bareng, ...   

                                                 url  \
0  'https://mediapakuan.pikiran-rakyat.co

In [10]:
# Clean column names
data.columns = data.columns.str.replace('"', '').str.strip()


In [11]:
print(data.columns.tolist())


['original_id', 'source_name', 'title', 'url', 'body', 'date_published', 'language', 'date_modified', 'author_list', 'images', 'description', 'sentiment', 'emotions', 'entities', 'quotations', 'prValues', 'clipping', 'label', 'category']


In [12]:
# Keep only necessary columns
data = data[['title', 'body', 'sentiment']]

# Combine title and body as input text for the model
data['text'] = data['title'] + " " + data['body']

# Remove any rows with missing sentiment
data = data.dropna(subset=['sentiment'])

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the sentiment labels
label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])  # E.g., 0 = negative, 1 = neutral, 2 = positive

# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['text'].tolist(),
    data['sentiment'].tolist(),
    test_size=0.2,
    random_state=42
)


In [14]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Tokenize data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from torch.utils.data import DataLoader, Dataset

class OnlineMediaDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = OnlineMediaDataset(train_encodings, train_labels)
test_dataset = OnlineMediaDataset(test_encodings, test_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [16]:
from transformers import AdamW

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
model.train()
for epoch in range(3):  # 3 epochs should be a good start
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} completed")




Epoch 1 completed
Epoch 2 completed
Epoch 3 completed


In [17]:
from sklearn.metrics import accuracy_score, classification_report

model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        predictions.extend(preds.tolist())
        true_labels.extend(labels.tolist())

# Calculate metrics
print("Accuracy:", accuracy_score(true_labels, predictions))
print(classification_report(true_labels, predictions, target_names=label_encoder.classes_))


Accuracy: 0.8365384615384616
              precision    recall  f1-score   support

   'negative       0.00      0.00      0.00         1
    'neutral       0.79      0.99      0.88       125
   'positive       0.98      0.61      0.75        82

    accuracy                           0.84       208
   macro avg       0.59      0.53      0.54       208
weighted avg       0.86      0.84      0.82       208



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
model.save_pretrained("sentiment_analysis_model")
tokenizer.save_pretrained("sentiment_analysis_model")


('sentiment_analysis_model\\tokenizer_config.json',
 'sentiment_analysis_model\\special_tokens_map.json',
 'sentiment_analysis_model\\vocab.txt',
 'sentiment_analysis_model\\added_tokens.json')

In [20]:
# Testing

from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Load the trained model and tokenizer from the specified directory
model = DistilBertForSequenceClassification.from_pretrained("sentiment_analysis_model")
tokenizer = DistilBertTokenizer.from_pretrained("sentiment_analysis_model")



In [26]:
# Example text to test
# Example texts to test the model (in Indonesian)
test_texts = [
    "Produk ini luar biasa! Saya sangat menyukainya.",
    "Saya sangat kecewa dengan layanan ini.",
    "Tidak buruk, tetapi juga tidak istimewa."
]

# Tokenize the input texts
test_encodings = tokenizer(test_texts, padding=True, truncation=True, return_tensors='pt')

In [27]:
import torch

# Run inference
with torch.no_grad():
    outputs = model(**test_encodings)
    predictions = outputs.logits.argmax(dim=-1)  # Get the index of the highest score

# Map predictions to sentiment labels
label_mapping = {1: 'positive', 0: 'negative', 2: 'neutral'}  # Adjust this based on your training

predicted_labels = [label_mapping[label.item()] for label in predictions]
print(predicted_labels)  # Display predicted sentiments


['positive', 'positive', 'positive']


In [28]:
from sklearn.metrics import accuracy_score

# Assuming you have a list of true labels for the test set
true_labels = [1, 0, 2]  # Replace with your actual true labels

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions.numpy())
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.33
