In [1]:
pip install pandas torch transformers scikit-learn numpy matplotlib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import re
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt


In [3]:
def clean_tweet(text):
    """Normalize a raw tweet into lowercase plain text.

    Removes URLs, @mentions and #hashtags, then every character that is neither
    a word character nor whitespace (punctuation, emoji, symbols), then digits.
    Finally collapses each run of whitespace into a single space, lowercases
    the text and trims both ends.

    Args:
        text: Raw tweet text. Non-string values (for example ``None`` or the
            float NaN that pandas uses for a missing cell) are treated as empty.

    Returns:
        The cleaned string, or ``""`` when nothing is left or the input was
        not a string.
    """
    if not isinstance(text, str):
        return ""

    text = re.sub(r"http\S+|www\S+", "", text)  # URLs ("https..." is covered by "http\S+")
    text = re.sub(r"@\w+", "", text)  # mentions
    text = re.sub(r"#\w+", "", text)  # hashtags
    text = re.sub(r"[^\w\s]", "", text)  # punctuation, emoji and other symbols
    text = re.sub(r"\d+", "", text)  # digits
    # The removals above leave gaps behind (e.g. "uts 2 pelajaran" -> "uts  pelajaran"),
    # so squeeze whitespace runs instead of only trimming the ends.
    text = re.sub(r"\s+", " ", text)
    return text.lower().strip()

In [4]:
# --- Fungsi untuk Mendapatkan Embedding IndoBERT ---
def get_indobert_embedding(text, tokenizer, model, device):
    """Encode ``text`` and return the hidden state of its first ([CLS]) token.

    Args:
        text: Text to embed; it is passed to ``tokenizer`` unchanged.
        tokenizer: Callable that tokenizes ``text``. It is asked for PyTorch
            tensors, padded and truncated to at most 128 tokens.
        model: Callable whose output exposes ``last_hidden_state``.
        device: Device the tokenized inputs are moved to before the forward pass.

    Returns:
        A NumPy array holding position 0 of ``last_hidden_state`` for every
        sequence in the batch.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=128,
    )
    encoded = encoded.to(device)

    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    cls_vectors = hidden_states[:, 0, :]  # position 0 is the [CLS] token
    return cls_vectors.cpu().numpy()


In [5]:
# --- Fungsi untuk Klasifikasi Tweet ---
# Embeddings of reference phrases, memoized across calls. The cache remembers which
# tokenizer/model produced it and is discarded as soon as either one changes.
# NOTE(review): reuse assumes the model gives the same embedding for the same phrase
# on every call (i.e. it runs in inference mode) - confirm for the model in use.
_PHRASE_EMBEDDING_CACHE = {"tokenizer": None, "model": None, "vectors": {}}


def _embed_reference_phrases(phrases, tokenizer, model, device):
    """Return one embedding per phrase, running the model only for phrases not seen before."""
    cache = _PHRASE_EMBEDDING_CACHE
    if cache["tokenizer"] is not tokenizer or cache["model"] is not model:
        cache["tokenizer"] = tokenizer
        cache["model"] = model
        cache["vectors"] = {}

    vectors = cache["vectors"]
    embeddings = []
    for phrase in phrases:
        if phrase not in vectors:
            vectors[phrase] = get_indobert_embedding(phrase, tokenizer, model, device)
        embeddings.append(vectors[phrase])
    return embeddings


def _max_similarity(tweet_embedding, reference_embeddings):
    """Return the highest cosine similarity between the tweet and any reference, or 0.0 if there are none."""
    if not reference_embeddings:
        return 0.0
    return float(max(cosine_similarity(tweet_embedding, emb)[0][0] for emb in reference_embeddings))


def classify_tweet(tweet, stress_phrases, non_stress_phrases, tokenizer, model, device):
    """Label a tweet by its closest reference phrase.

    The tweet is cleaned with ``clean_tweet`` and embedded with
    ``get_indobert_embedding``. Its embedding is compared by cosine similarity
    with the embedding of every stress and non-stress reference phrase, and the
    group containing the single most similar phrase wins.

    Reference-phrase embeddings are memoized between calls, so classifying many
    tweets against the same phrase lists embeds each phrase only once.

    Args:
        tweet: Raw tweet text. Non-string values (e.g. NaN from a missing CSV
            cell) are treated like an empty tweet.
        stress_phrases: Reference phrases labelled as stressed.
        non_stress_phrases: Reference phrases labelled as not stressed.
        tokenizer: Tokenizer forwarded to ``get_indobert_embedding``.
        model: Model forwarded to ``get_indobert_embedding``.
        device: Device forwarded to ``get_indobert_embedding``.

    Returns:
        A ``(label, similarity)`` tuple. ``label`` is ``"stres"`` or
        ``"tidak_stres"``; ``similarity`` is the winning group's highest cosine
        similarity as a float. A tweet that is empty after cleaning yields
        ``("tidak_stres", 0.0)``. Ties go to ``"tidak_stres"``.
    """
    if not isinstance(tweet, str):
        return "tidak_stres", 0.0

    cleaned_tweet = clean_tweet(tweet)
    if not cleaned_tweet:
        return "tidak_stres", 0.0  # nothing left to compare after cleaning

    tweet_embedding = get_indobert_embedding(cleaned_tweet, tokenizer, model, device)

    max_stress_sim = _max_similarity(
        tweet_embedding,
        _embed_reference_phrases(stress_phrases, tokenizer, model, device),
    )
    max_non_stress_sim = _max_similarity(
        tweet_embedding,
        _embed_reference_phrases(non_stress_phrases, tokenizer, model, device),
    )

    # Strict comparison: on a tie the tweet is labelled as not stressed.
    if max_stress_sim > max_non_stress_sim:
        return "stres", max_stress_sim
    return "tidak_stres", max_non_stress_sim

In [6]:
# --- IndoBERT setup ---
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained tokenizer and model, then move the model to the chosen device.
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")
model = model.to(device)

In [7]:
# Load the labelled reference phrases.
df_stress = pd.read_csv("dataset/real_dataset_stress.csv")

# Split the reference phrases by label. Rows without text are dropped: a missing
# cell is read as NaN, which is not a string that can be tokenized.
stress_phrases = df_stress.loc[df_stress["kategori"] == "stres", "text"].dropna().tolist()
non_stress_phrases = df_stress.loc[df_stress["kategori"] == "tidak_stres", "text"].dropna().tolist()
df_stress.head(10)

Unnamed: 0,text,kategori
0,mau mati aja,stres
1,kenapa harus berusaha waktu kesepian,stres
2,kenapa harus ngerjain tugas waktu tekanan hidup,stres
3,bodoh banget kan jadi malu maluin,stres
4,capek banget jadi orang yang selalu tidur,stres
5,tiap malam cuma bisa memuaskan orang lain kare...,stres
6,rasanya pengen berpikir pas lagi masalah keluarga,stres
7,beneran gak kuat lagi buat menghindar,stres
8,muak jadi omongan orang,stres
9,udah nangis tapi tetap harapan palsu,stres


In [8]:
# --- Load tweets from CSV ---
file_path = "tweets-data/save_tweets.csv"
# The *_str columns are identifiers, not quantities. Reading them as strings keeps
# every digit: a numeric column with a single missing cell would become float and
# round 19-digit IDs.
df_tweets = pd.read_csv(
    file_path,
    dtype={"conversation_id_str": str, "id_str": str, "user_id_str": str},
)
df_tweets.head(10)

Unnamed: 0,conversation_id_str,created_at,favorite_count,full_text,id_str,image_url,in_reply_to_screen_name,lang,location,quote_count,reply_count,retweet_count,tweet_url,user_id_str,username
0,1929866844585726062,Tue Jun 03 11:44:45 +0000 2025,0,abis ulangan lega hehe,1929866844585726062,,,in,,0,0,0,https://x.com/undefined/status/192986684458572...,1865298251311910914,
1,1929660292440772663,Mon Jun 02 22:03:59 +0000 2025,0,woi takut bangat nih ulangannya susahhh,1929660292440772663,,,in,,0,0,0,https://x.com/undefined/status/192966029244077...,1865298251311910914,
2,1929660154892767739,Mon Jun 02 22:03:26 +0000 2025,0,baru bangun tidur bngat plss,1929660154892767739,,,in,,0,0,0,https://x.com/undefined/status/192966015489276...,1865298251311910914,
3,1929566733708538150,Mon Jun 02 15:52:13 +0000 2025,0,pasti nilai nya jelek nih takut plss,1929566733708538150,,,in,,0,0,0,https://x.com/undefined/status/192956673370853...,1865298251311910914,
4,1929565448988733945,Mon Jun 02 15:47:07 +0000 2025,0,pusing banget plss uts 2 pelajaran susahhh,1929565448988733945,,,in,,0,0,0,https://x.com/undefined/status/192956544898873...,1865298251311910914,
5,1929565294059544831,Mon Jun 02 15:46:30 +0000 2025,0,aaaaaaaa,1929565294059544831,,,und,,0,0,0,https://x.com/undefined/status/192956529405954...,1865298251311910914,
6,1929565268474302925,Mon Jun 02 15:46:24 +0000 2025,0,gabisa tidur plsss ini materi susah banget,1929565268474302925,,,in,,0,0,0,https://x.com/undefined/status/192956526847430...,1865298251311910914,
7,1929442776648749219,Mon Jun 02 07:39:39 +0000 2025,0,duh takut jelek nih hasilnya,1929442776648749219,,,in,,0,0,0,https://x.com/undefined/status/192944277664874...,1865298251311910914,
8,1929442711729226037,Mon Jun 02 07:39:24 +0000 2025,0,Pusingh banget abis ulangan,1929442711729226037,,,in,,0,0,0,https://x.com/undefined/status/192944271172922...,1865298251311910914,
9,1929309091391807717,Sun Jun 01 22:48:26 +0000 2025,0,pagi pagi udah diomelin males banget pls,1929309091391807717,,,in,,0,0,0,https://x.com/undefined/status/192930909139180...,1865298251311910914,


In [9]:
# --- Classify every tweet ---
# Collect one record per tweet; the records become a DataFrame in a later cell.
results = []
tweet_rows = df_tweets[["full_text", "created_at", "user_id_str"]].itertuples(index=False, name=None)
for tweet, created_at, user_id in tweet_rows:
    label, similarity = classify_tweet(
        tweet, stress_phrases, non_stress_phrases, tokenizer, model, device
    )
    record = {
        "text": tweet,
        "cleaned_text": clean_tweet(tweet),
        "label": label,
        "similarity_score": similarity,
        "created_at": created_at,
        "user_id": user_id,
    }
    results.append(record)

In [10]:
# Build a DataFrame from the collected records and write it to CSV without the index column.
df_results = pd.DataFrame(data=results)
df_results.to_csv(path_or_buf="tweet_stress_classification_results.csv", index=False)

In [11]:
# --- Compute the final summary ---
# Count the tweets labelled "stres" and express them as a share of all tweets.
stress_count = int((df_results["label"] == "stres").sum())
total_tweets = len(df_results)
if total_tweets > 0:
    stress_percentage = (stress_count / total_tweets) * 100
else:
    stress_percentage = 0  # no tweets at all: avoid dividing by zero

In [12]:
# Decide the overall verdict: "Stres" when at least half of the tweets are labelled stressed.
if stress_percentage >= 50:
    final_conclusion = "Stres"
else:
    final_conclusion = "Tidak Stres"

In [13]:
# --- Pie chart of stressed vs. not-stressed tweets ---
labels = ["Stres", "Tidak Stres"]
sizes = [stress_count, total_tweets - stress_count]
if total_tweets > 0:
    plt.figure(figsize=(6, 6))
    plt.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=140)
    plt.title("Proporsi Tweet Stres vs Tidak Stres")
    plt.savefig("stress_pie_chart.png")  # save before show(), which may clear the figure
    plt.show()
else:
    # With no tweets both wedge sizes are zero and there is nothing to plot.
    print("Tidak ada tweet untuk divisualisasikan.")

  plt.show()


In [14]:
# --- Show the results ---
# One print call; sep="\n" puts each summary line on its own row.
print(
    f"\nHasil Analisis Stres dari {total_tweets} Tweet:",
    f"Jumlah Tweet Stres: {stress_count} ({stress_percentage:.2f}%)",
    f"Jumlah Tweet Tidak Stres: {total_tweets - stress_count} ({100 - stress_percentage:.2f}%)",
    f"Kesimpulan Akhir: {final_conclusion}",
    sep="\n",
)


Hasil Analisis Stres dari 21 Tweet:
Jumlah Tweet Stres: 12 (57.14%)
Jumlah Tweet Tidak Stres: 9 (42.86%)
Kesimpulan Akhir: Stres


In [15]:
# NOTE(review): the records built earlier already carry a 'cleaned_text' key, so this
# column looks like a duplicate under a second name - confirm which one should stay.
df_results['clean_text'] = df_results['text'].apply(clean_tweet)

# Print the cleaned text, label and similarity score of every classified tweet.
print("\nContoh Hasil Klasifikasi (Semua):")
for row in df_results.to_dict("records"):
    print(f"Tweet: {row['clean_text']}")
    print(f"Label: {row['label']} (Similarity: {row['similarity_score']:.4f})")
    print("-" * 50)



Contoh Hasil Klasifikasi (Semua):
Tweet: abis ulangan lega hehe
Label: tidak_stres (Similarity: 0.9090)
--------------------------------------------------
Tweet: woi takut bangat nih ulangannya susahhh
Label: stres (Similarity: 0.9121)
--------------------------------------------------
Tweet: baru bangun tidur bngat plss
Label: stres (Similarity: 0.9062)
--------------------------------------------------
Tweet: pasti nilai nya jelek nih takut plss
Label: stres (Similarity: 0.9242)
--------------------------------------------------
Tweet: pusing banget plss uts  pelajaran susahhh
Label: tidak_stres (Similarity: 0.8909)
--------------------------------------------------
Tweet: aaaaaaaa
Label: stres (Similarity: 0.7853)
--------------------------------------------------
Tweet: gabisa tidur plsss ini materi susah banget
Label: stres (Similarity: 0.9343)
--------------------------------------------------
Tweet: duh takut jelek nih hasilnya
Label: stres (Similarity: 0.8312)
----------------