In [2]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


In [3]:
df = pd.read_csv('../dataset/INA_TweetsPPKM_Raw.csv', sep='\t')
print(f"Rows : {df.shape[0]}, Columns : {df.shape[1]}")

Rows : 23644, Columns : 3


In [6]:
df['Date'] = pd.to_datetime(df['Date'])

Unnamed: 0,Date,User,Tweet
0,2022-03-31 14:32:04+00:00,pikobar_jabar,Ketahui informasi pembagian #PPKM di wilayah J...
1,2022-03-31 09:26:00+00:00,inewsdotid,Tempat Ibadah di Wilayah PPKM Level 1 Boleh Be...
2,2022-03-31 05:02:34+00:00,vdvc_talk,"Juru bicara Satgas Covid-19, Wiku Adisasmito m..."
3,2022-03-30 14:23:10+00:00,pikobar_jabar,Ketahui informasi pembagian #PPKM di wilayah J...
4,2022-03-30 11:28:57+00:00,tvOneNews,Kementerian Agama menerbitkan Surat Edaran Nom...
...,...,...,...
23639,2020-06-26 18:34:56+00:00,bananabluff,noelle loses a bet to akarsha and it somehow e...
23640,2020-06-19 15:14:07+00:00,Auqroix,they call her... weekeeshee...\n#butterflysoup...
23641,2020-06-18 23:45:26+00:00,Auqroix,"put out what you wanna see more of, amirite ga..."
23642,2020-06-15 23:17:22+00:00,Auqroix,"i don't need anybody, i'm fine here on my own\..."


In [8]:
tokenizer = AutoTokenizer.from_pretrained("afbudiman/indobert-classification")
model = AutoModelForSequenceClassification.from_pretrained(
    "afbudiman/indobert-classification"
)

In [9]:
# Iterate over each row of the dataset using the iterrows() method of the DataFrame
for index, row in df.iterrows():
    tweet = row["Tweet"]

    # Using tokenizer to encode tweets into model input
    encoded_input = tokenizer.encode_plus(
        tweet,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Selects the device used (GPU if available, otherwise uses CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Using models to perform inferences on the given input
    with torch.no_grad():
        input_ids = encoded_input["input_ids"].to(device)
        attention_mask = encoded_input["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Getting predicted labels from logits
    predicted_label = torch.argmax(logits, dim=1).item()

    # Updating the value of the column "sentiment" in the row being iterated
    df.at[index, "sentiment"] = predicted_label

In [10]:
import os
from datetime import datetime
current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
model_directory = f"../models/indobert_{current_time}"
os.makedirs(model_directory, exist_ok=True)
tokenizer.save_pretrained(model_directory)
model.save_pretrained(model_directory)

In [None]:
df

Unnamed: 0,Date,User,Tweet,sentiment
0,2022-03-31 14:32:04+00:00,pikobar_jabar,Ketahui informasi pembagian #PPKM di wilayah J...,1.0
1,2022-03-31 09:26:00+00:00,inewsdotid,Tempat Ibadah di Wilayah PPKM Level 1 Boleh Be...,1.0
2,2022-03-31 05:02:34+00:00,vdvc_talk,"Juru bicara Satgas Covid-19, Wiku Adisasmito m...",1.0
3,2022-03-30 14:23:10+00:00,pikobar_jabar,Ketahui informasi pembagian #PPKM di wilayah J...,1.0
4,2022-03-30 11:28:57+00:00,tvOneNews,Kementerian Agama menerbitkan Surat Edaran Nom...,1.0
...,...,...,...,...
23639,2020-06-26 18:34:56+00:00,bananabluff,noelle loses a bet to akarsha and it somehow e...,1.0
23640,2020-06-19 15:14:07+00:00,Auqroix,they call her... weekeeshee...\n#butterflysoup...,1.0
23641,2020-06-18 23:45:26+00:00,Auqroix,"put out what you wanna see more of, amirite ga...",1.0
23642,2020-06-15 23:17:22+00:00,Auqroix,"i don't need anybody, i'm fine here on my own\...",1.0
