# Hackathon Sentiment Analysis Improved

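This notebook integrates Play Store and YouTube data with the main dataset (`CoreTax Combined Data Clean.csv`), labels sentiment with a pretrained Indonesian RoBERTa classifier, and provides detailed analysis (temporal, source-based, and content-based). It then fine-tunes IndoBERT on the RoBERTa-generated labels and evaluates the result.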

In [None]:
# Install dependencies
!pip install -q transformers torch pandas numpy scikit-learn matplotlib seaborn wordcloud networkx
!pip install -q indobenchmark-toolkit
!pip install -q Sastrawi
!pip install -q accelerate -U

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
import warnings
import requests
from io import StringIO
import os
import networkx as nx
from collections import Counter
from itertools import combinations

# Import untuk preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Import untuk IndoBERT
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from torch.utils.data import Dataset

# Import Sastrawi untuk stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Setup plotting style: seaborn "whitegrid" theme for every figure in the notebook
sns.set(style="whitegrid")
# Suppress ALL Python warnings for the rest of the session. This also hides
# deprecation warnings from pandas / seaborn / transformers, so comment it out
# when debugging unexpected behavior.
warnings.filterwarnings('ignore')

# Mount Google Drive and Configure Paths

In [None]:
# Resolve the project directories: Google Drive when running inside Colab,
# relative paths otherwise (the Colab import fails with ImportError locally).
try:
    from google.colab import drive
    print("Mounting Google Drive...")
    drive.mount('/content/drive/')
    print("✓ Google Drive mounted!")
    # Colab: everything lives under the mounted Drive folder
    DATA_PATH, DATA_ACTUAL_PATH, MODELS_PATH, OUTPUTS_PATH = (
        '/content/drive/MyDrive/Hackathon/data/',
        '/content/drive/MyDrive/Hackathon/data/processed/',
        '/content/drive/MyDrive/Hackathon/models/',
        '/content/drive/MyDrive/Hackathon/outputs/',
    )
except ImportError:
    print("⚠️ Google Colab environment not detected. Using local paths.")
    DATA_PATH, DATA_ACTUAL_PATH, MODELS_PATH, OUTPUTS_PATH = (
        '../data/',
        '../data/processed/',
        '../models/',
        '../outputs/',
    )

# Only the directories this notebook writes to are created here;
# the data directories are expected to exist already.
for _writable_dir in (MODELS_PATH, OUTPUTS_PATH):
    os.makedirs(_writable_dir, exist_ok=True)

print(f"Data path set to: {DATA_PATH}")
print(f"Models path set to: {MODELS_PATH}")
print(f"Outputs path set to: {OUTPUTS_PATH}")

# Load RoBERTa Sentiment Classifier

In [None]:
# Load the pretrained Indonesian RoBERTa sentiment classifier.
# On failure (no network, missing model, ...) `sentiment_classifier` is set to None;
# `roberta_sentiment_labeling` then labels every text 'neutral' with score 0.0,
# so check the message printed here before trusting downstream results.
print("Loading RoBERTa sentiment classifier...")
try:
    sentiment_classifier = pipeline(
        "text-classification",
        model="w11wo/indonesian-roberta-base-sentiment-classifier",
        # Use the first CUDA device when one is available, otherwise CPU (-1).
        # Labeling calls the pipeline once per row, so a GPU shortens it a lot.
        device=0 if torch.cuda.is_available() else -1,
    )
    print("✓ RoBERTa sentiment classifier loaded!")
except Exception as e:
    print(f"⚠️ Error loading RoBERTa classifier: {e}")
    sentiment_classifier = None

def roberta_sentiment_labeling(text, classifier):
    """
    Label the sentiment of ``text`` with a RoBERTa text-classification callable.

    Args:
        text: Text to classify. Anything that is not a non-blank string is
            treated as having no sentiment.
        classifier: Callable that takes a string and returns a sequence whose
            first element is a dict with 'label' and 'score' keys, or None
            when no classifier could be loaded.

    Returns:
        A ``(sentiment, score)`` tuple where sentiment is 'positive',
        'negative' or 'neutral'. ``('neutral', 0.0)`` is returned for unusable
        input, a missing classifier, or any error raised during prediction.
    """
    has_content = isinstance(text, str) and len(text.strip()) > 0
    if not has_content or classifier is None:
        return 'neutral', 0.0

    try:
        # Only the first 512 characters are sent to the classifier
        prediction = classifier(text[:512])[0]
        raw_label = prediction['label'].lower()
        confidence = prediction['score']
    except Exception:
        # Best effort: a failed prediction is reported as neutral with zero confidence
        return 'neutral', 0.0

    # Any label that mentions neither polarity is reported as neutral
    for polarity in ('positive', 'negative'):
        if polarity in raw_label:
            return polarity, confidence
    return 'neutral', confidence

print("✓ RoBERTa labeling function ready!")

# Data Loading and Integration

In [None]:
# Load Datasets
print("Loading datasets...")


def _read_project_csv(directory, filename, fallback_directory):
    """Read `filename` from `directory`; if it is not there, read it from `fallback_directory`."""
    try:
        return pd.read_csv(os.path.join(directory, filename))
    except FileNotFoundError:
        return pd.read_csv(os.path.join(fallback_directory, filename))


def _parse_dates(series):
    """Parse one source's date column into timezone-naive datetimes (UTC wall time).

    Values that cannot be parsed become NaT. `utc=True` lets timezone-aware and
    naive timestamps be handled uniformly; the timezone is then dropped so all
    sources share a single datetime64 dtype after concatenation.
    """
    return pd.to_datetime(series, errors='coerce', utc=True).dt.tz_localize(None)


df_youtube = _read_project_csv(DATA_PATH, 'Scraper Youtube CoreTax.csv', '../data/')
df_playstore = _read_project_csv(DATA_PATH, 'CoreTax Scraper M Pajak 2025.csv', '../data/')
df_combined_old = _read_project_csv(DATA_ACTUAL_PATH, 'CoreTax Combined Data Clean.csv', '../data/processed/')

print(f"YouTube Data: {len(df_youtube)} rows")
print(f"Play Store Data: {len(df_playstore)} rows")
print(f"Existing Combined Data: {len(df_combined_old)} rows")

# Standardize Columns
# Youtube: date, text, source (comments carry no star rating)
df_youtube = df_youtube[['date', 'text', 'source']].copy()
df_youtube['rating'] = np.nan

# Play Store: rating, at -> date, content -> text, source
df_playstore = df_playstore.rename(columns={'at': 'date', 'content': 'text'})
df_playstore = df_playstore[['date', 'text', 'source', 'rating']].copy()

# Combined Old: date, text, source, rating
# Ensure optional columns exist
if 'date' not in df_combined_old.columns:
    df_combined_old['date'] = np.nan
if 'rating' not in df_combined_old.columns:
    df_combined_old['rating'] = np.nan

df_combined_old = df_combined_old[['date', 'text', 'source', 'rating']].copy()

# Parse dates per source BEFORE concatenating. pandas >= 2 infers a single
# format from the first value of a column, so parsing the concatenated column
# could coerce every row written in another source's format to NaT.
# NOTE(review): assumes each source file uses one consistent timestamp format —
# confirm for 'CoreTax Combined Data Clean.csv', which is itself a merged file.
for _source_df in (df_youtube, df_playstore, df_combined_old):
    _source_df['date'] = _parse_dates(_source_df['date'])

# Combine all data
df_combined = pd.concat([df_youtube, df_playstore, df_combined_old], ignore_index=True)

# Drop rows without text (they cannot be labeled), drop repeated texts, and
# renumber the index so it is contiguous again.
df_combined = (
    df_combined
    .dropna(subset=['text'])
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

print(f"Total Combined Data after deduplication: {len(df_combined)}")
print("Source Distribution:")
print(df_combined['source'].value_counts())
df_combined.head()

# Data Preprocessing

In [None]:
def clean_text(text):
    """
    Normalize raw text for word clouds, n-grams and model input.

    Non-string input yields an empty string. Otherwise, in order: URLs
    starting with ``http`` and ``@mentions`` are removed; every character
    that is not an ASCII letter, digit or whitespace is replaced by a space
    (so punctuation and emoji are dropped as well); the text is lowercased;
    runs of whitespace are collapsed and the ends are trimmed.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs, applied in this order
    substitutions = (
        (r'http\S+', ''),           # URLs
        (r'@\w+', ''),              # mentions
        (r'[^a-zA-Z0-9\s]', ' '),   # punctuation, symbols, non-ASCII characters
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return re.sub(r'\s+', ' ', text.lower()).strip()

def _render_wordcloud(corpus, title, filename):
    """Build a word cloud from `corpus`, display it under `title`, save it as
    `filename` inside OUTPUTS_PATH, and return the WordCloud object."""
    cloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.savefig(os.path.join(OUTPUTS_PATH, filename))
    plt.show()
    return cloud


# Visualization: WordCloud BEFORE Preprocessing (raw text, missing values as empty strings)
print("Generating WordCloud BEFORE Preprocessing...")
all_text_before = ' '.join(map(str, df_combined['text'].fillna('')))
wordcloud_before = _render_wordcloud(
    all_text_before, 'WordCloud BEFORE Preprocessing', 'wordcloud_before.png'
)

# Apply Cleaning: adds the 'cleaned_text' column used by every later cell
print("Cleaning text...")
df_combined['cleaned_text'] = df_combined['text'].apply(clean_text)

# Visualization: WordCloud AFTER Preprocessing
print("Generating WordCloud AFTER Preprocessing...")
all_text_after = ' '.join(map(str, df_combined['cleaned_text'].fillna('')))
wordcloud_after = _render_wordcloud(
    all_text_after, 'WordCloud AFTER Preprocessing', 'wordcloud_after.png'
)

# Sentiment Labeling with RoBERTa

In [None]:
# Apply to dataset (This might take a while)
print("Labeling data with RoBERTa... This may take some time.")

# Without a classifier every row falls back to ('neutral', 0.0); say so loudly,
# because all later charts would otherwise look valid while being meaningless.
if sentiment_classifier is None:
    print("⚠️ RoBERTa classifier is not loaded: every row will be labeled 'neutral' with score 0.0.")

# Using cleaned_text for consistency.
# Collect plain (label, score) tuples and build the two columns once, instead
# of allocating a pandas Series per row; this also works on an empty frame.
_roberta_results = [
    roberta_sentiment_labeling(text, sentiment_classifier)
    for text in df_combined['cleaned_text']
]
_roberta_frame = pd.DataFrame(
    _roberta_results,
    columns=['roberta_sentiment', 'roberta_score'],
    index=df_combined.index,  # keep alignment with df_combined's own index
)
df_combined['roberta_sentiment'] = _roberta_frame['roberta_sentiment']
df_combined['roberta_score'] = _roberta_frame['roberta_score'].astype(float)

print("Labeling complete!")
print(df_combined['roberta_sentiment'].value_counts())
df_combined.head()

# Detailed Analysis & Visualization

In [None]:
# Each chart is drawn on an explicitly created figure/axes pair so that exactly
# one figure is shown and saved per chart.

# 1. Sentiment Distribution (Bar Chart)
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='roberta_sentiment', data=df_combined, palette='viridis', ax=ax)
ax.set_title('Overall Sentiment Distribution (RoBERTa)')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
fig.savefig(os.path.join(OUTPUTS_PATH, 'sentiment_distribution_bar.png'))
plt.show()

# 2. Sentiment Distribution (Pie Chart)
fig, ax = plt.subplots(figsize=(8, 8))
sentiment_counts = df_combined['roberta_sentiment'].value_counts()
ax.pie(
    sentiment_counts,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=sns.color_palette('viridis', 3),  # one color per possible sentiment class
)
ax.set_title('Sentiment Proportion')
fig.savefig(os.path.join(OUTPUTS_PATH, 'sentiment_distribution_pie.png'))
plt.show()

# 3. Sentiment by Source
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='source', hue='roberta_sentiment', data=df_combined, palette='viridis', ax=ax)
ax.set_title('Sentiment Distribution by Source')
ax.set_xlabel('Source')
ax.set_ylabel('Count')
ax.legend(title='Sentiment')
fig.savefig(os.path.join(OUTPUTS_PATH, 'sentiment_by_source.png'))
plt.show()

# 4. Confidence Score Distribution (Box Plot)
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='roberta_sentiment', y='roberta_score', data=df_combined, palette='viridis', ax=ax)
ax.set_title('Distribution of Confidence Scores by Sentiment')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Confidence Score')
fig.savefig(os.path.join(OUTPUTS_PATH, 'confidence_score_distribution.png'))
plt.show()

# 5. Temporal Analysis (Sentiment over Time)
# Filter out rows with missing dates, then count rows per month and sentiment
df_time = df_combined.dropna(subset=['date']).copy()
df_time['month_year'] = df_time['date'].dt.to_period('M')
sentiment_time = df_time.groupby(['month_year', 'roberta_sentiment']).size().unstack(fill_value=0)

if sentiment_time.empty:
    # DataFrame.plot raises when there is nothing numeric to draw
    print("No rows with a valid date; skipping the temporal sentiment plot.")
else:
    # Pass ax= explicitly: DataFrame.plot() opens its own new figure otherwise,
    # which would leave a blank figure behind and save the wrong one.
    fig, ax = plt.subplots(figsize=(14, 6))
    sentiment_time.plot(kind='line', marker='o', ax=ax)
    ax.set_title('Sentiment Trends Over Time (Monthly)')
    ax.set_xlabel('Month-Year')
    ax.set_ylabel('Count')
    ax.grid(True)
    fig.savefig(os.path.join(OUTPUTS_PATH, 'sentiment_trends_time.png'))
    plt.show()

In [None]:
# 6. Word Clouds by Sentiment
for sentiment in ['positive', 'negative', 'neutral']:
    subset = df_combined[df_combined['roberta_sentiment'] == sentiment]
    text = ' '.join(str(t) for t in subset['cleaned_text'].fillna(''))

    # Joining empty strings still yields a string of spaces, so test for real
    # content rather than length (an empty subset gives "" and is skipped too).
    if not text.strip():
        print(f"No text available for {sentiment} sentiment; skipping word cloud.")
        continue

    try:
        wc = WordCloud(width=800, height=400, background_color='white').generate(text)
    except ValueError:
        # WordCloud raises ValueError when no drawable word remains
        # (for example when every token is filtered out as a stopword).
        print(f"No drawable words for {sentiment} sentiment; skipping word cloud.")
        continue

    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'WordCloud - {sentiment.capitalize()} Sentiment')
    plt.savefig(os.path.join(OUTPUTS_PATH, f'wordcloud_{sentiment}.png'))
    plt.show()

# Advanced N-gram Analysis (Bigram & Trigram)

In [None]:
def plot_ngrams(text_series, n=2, top_k=20, title="N-grams", filename="ngrams.png"):
    """
    Plot the most frequent word n-grams of a text collection as a horizontal bar chart.

    Args:
        text_series: Iterable of strings to count n-grams in.
        n: Size of the n-gram (2 = bigram, 3 = trigram).
        top_k: Number of most frequent n-grams to show.
        title: Chart title.
        filename: File name (inside OUTPUTS_PATH) the chart is saved under.
    """
    vectorizer = CountVectorizer(ngram_range=(n, n), stop_words=None)
    counts_matrix = vectorizer.fit(text_series).transform(text_series)
    # 1 x vocabulary row holding the corpus-wide count of every n-gram
    totals = counts_matrix.sum(axis=0)

    ranked = [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    df_ngram = pd.DataFrame(ranked[:top_k], columns=['ngram', 'count'])

    plt.figure(figsize=(12, 6))
    sns.barplot(x='count', y='ngram', data=df_ngram, palette='viridis')
    plt.title(title)
    plt.savefig(os.path.join(OUTPUTS_PATH, filename))
    plt.show()

# Plot bigrams, then trigrams, of the text AFTER preprocessing
for _ngram_size, _ngram_title, _ngram_file in (
    (2, "Top 20 Bigrams (After Preprocessing)", "bigrams.png"),
    (3, "Top 20 Trigrams (After Preprocessing)", "trigrams.png"),
):
    plot_ngrams(df_combined['cleaned_text'], n=_ngram_size, title=_ngram_title, filename=_ngram_file)

# Co-occurrence Network Analysis

In [None]:
def plot_cooccurrence_network(text_series, top_n=30, min_count=5, seed=42):
    """
    Plot a co-occurrence network of the most frequent words.

    Two words are connected when they appear together in strictly more than
    `min_count` texts; edge width grows with the number of shared texts.

    Args:
        text_series: Iterable of whitespace-tokenizable strings. Non-string
            entries are ignored.
        top_n: Number of most frequent words eligible to become nodes.
        min_count: An edge is drawn only when the pair co-occurs in more than
            this many texts (reduces clutter).
        seed: Seed for the spring layout so the figure is reproducible.
    """
    # Tokenize once and reuse the tokens for both counting passes
    tokenized = [text.split() for text in text_series if isinstance(text, str)]

    word_counts = Counter(word for tokens in tokenized for word in tokens)
    # A set gives constant-time membership tests in the loop below
    top_words = {word for word, _ in word_counts.most_common(top_n)}

    # Count, per text, every unordered pair of distinct top words it contains
    co_occurrence = Counter()
    for tokens in tokenized:
        present = sorted(top_words.intersection(tokens))
        co_occurrence.update(combinations(present, 2))

    # Create Graph
    G = nx.Graph()
    for (w1, w2), count in co_occurrence.items():
        if count > min_count:  # Threshold to reduce clutter
            G.add_edge(w1, w2, weight=count)

    if G.number_of_edges() == 0:
        print(f"No word pair co-occurs in more than {min_count} texts; skipping network plot.")
        return

    plt.figure(figsize=(12, 12))
    pos = nx.spring_layout(G, k=0.5, seed=seed)

    # Draw nodes and edges
    nx.draw_networkx_nodes(G, pos, node_size=1000, node_color='skyblue', alpha=0.7)
    nx.draw_networkx_edges(
        G, pos,
        width=[weight / 5 for _, _, weight in G.edges(data='weight')],
        alpha=0.5,
    )
    nx.draw_networkx_labels(G, pos, font_size=10, font_family='sans-serif')

    plt.title('Word Co-occurrence Network (Top Words)')
    plt.axis('off')
    plt.savefig(os.path.join(OUTPUTS_PATH, 'cooccurrence_network.png'))
    plt.show()

print("Generating Co-occurrence Network...")
plot_cooccurrence_network(df_combined['cleaned_text'])

# Advanced TF-IDF Analysis (Per Sentiment)

In [None]:
def plot_top_tfidf_words(df, sentiment_col, text_col, sentiment_label, top_n=15):
    """
    Plot the words with the highest summed TF-IDF score for one sentiment class.

    Args:
        df: DataFrame holding the texts and their sentiment labels.
        sentiment_col: Name of the column containing the sentiment label.
        text_col: Name of the column containing the text to vectorize.
        sentiment_label: Sentiment value selecting the rows to analyze.
        top_n: Number of words to show.

    Prints a message and returns without plotting when the class has no rows
    or TF-IDF cannot be fitted on them.
    """
    subset = df[df[sentiment_col] == sentiment_label]
    if subset.empty:
        print(f"No data for sentiment: {sentiment_label}")
        return

    tfidf = TfidfVectorizer(max_features=1000, stop_words=None) # You can add Indonesian stopwords here
    try:
        tfidf_matrix = tfidf.fit_transform(subset[text_col])
    except ValueError:
        # Raised by scikit-learn e.g. when the texts produce an empty vocabulary
        print(f"Not enough data to perform TF-IDF for {sentiment_label}")
        return

    # Sum TF-IDF scores for each word directly on the sparse matrix; converting
    # the full documents x words matrix to dense Python lists is unnecessary.
    word_scores = np.asarray(tfidf_matrix.sum(axis=0)).ravel()
    top_words = (
        pd.Series(word_scores, index=tfidf.get_feature_names_out())
        .sort_values(ascending=False)
        .head(top_n)
    )

    plt.figure(figsize=(10, 6))
    sns.barplot(x=top_words.values, y=top_words.index, palette='magma')
    plt.title(f'Top {top_n} TF-IDF Words for {sentiment_label.capitalize()} Sentiment')
    plt.xlabel('TF-IDF Score Sum')
    plt.savefig(os.path.join(OUTPUTS_PATH, f'tfidf_{sentiment_label}.png'))
    plt.show()

print("Generating TF-IDF Plots per Sentiment...")
for sentiment in ['positive', 'negative', 'neutral']:
    plot_top_tfidf_words(df_combined, 'roberta_sentiment', 'cleaned_text', sentiment)

# Recommendation Analysis (Automated Insights)

In [None]:
# Extract top negative bigrams to identify pain points
negative_text = df_combined.loc[df_combined['roberta_sentiment'] == 'negative', 'cleaned_text']

if negative_text.empty:
    print("No negative sentiment data found to generate recommendations.")
else:
    # Count every bigram across the negative texts and keep the ten most frequent
    vec = CountVectorizer(ngram_range=(2, 2)).fit(negative_text)
    bag_of_words = vec.transform(negative_text)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = sorted(
        ((word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()),
        key=lambda x: x[1],
        reverse=True,
    )
    top_negative_bigrams = words_freq[:10]

    # Keyword rules, checked in order; the first rule with a keyword contained
    # in the bigram (substring match) supplies the recommendation.
    _recommendation_rules = (
        (("login", "masuk"), "Improve login stability and error messaging."),
        (("kode", "verifikasi", "otp"), "Check SMS/Email gateway latency and OTP delivery rates."),
        (("error", "gagal"), "Conduct technical audit on server stability and API endpoints."),
        (("lambat", "lemot"), "Optimize application performance and server response times."),
    )
    _default_recommendation = "Investigate specific error logs and user reports related to this term."

    # Create a recommendation table based on common issues
    recommendations = []
    for bigram, count in top_negative_bigrams:
        rec = next(
            (
                advice
                for keywords, advice in _recommendation_rules
                if any(keyword in bigram for keyword in keywords)
            ),
            _default_recommendation,
        )
        recommendations.append({'Issue (Bigram)': bigram, 'Frequency': count, 'Recommendation': rec})

    df_rec = pd.DataFrame(recommendations)

    print("Automated Recommendations based on Negative Sentiment Analysis:")
    display(df_rec.style.background_gradient(cmap='Reds', subset=['Frequency']))

    # Save recommendations
    df_rec.to_csv(os.path.join(OUTPUTS_PATH, 'recommendations.csv'), index=False)

# IndoBERT Fine-tuning & Evaluation

In [None]:
# Prepare Data for IndoBERT Fine-tuning
# We will use the 'roberta_sentiment' labels as our target for this demonstration.
# In a real scenario, you would use human-annotated labels.

print("Preparing data for IndoBERT fine-tuning...")

# Encode labels: sentiment string -> integer class id
le = LabelEncoder()
df_combined['label_encoded'] = le.fit_transform(df_combined['roberta_sentiment'])
label_map = dict(enumerate(le.classes_))
print("Label Mapping:", label_map)

# Split Data
# Stratify on the label so train and validation keep the same class
# proportions (sentiment classes are typically imbalanced). Stratification
# needs at least two rows per class and a validation set at least as large as
# the number of classes; otherwise fall back to a plain random split.
_test_size = 0.2
_class_counts = df_combined['label_encoded'].value_counts()
_can_stratify = (
    len(_class_counts) > 0
    and _class_counts.min() >= 2
    and len(df_combined) * _test_size >= len(_class_counts)
)
X_train, X_val, y_train, y_val = train_test_split(
    df_combined['cleaned_text'],
    df_combined['label_encoded'],
    test_size=_test_size,
    random_state=42,
    stratify=df_combined['label_encoded'] if _can_stratify else None,
)

# Load Tokenizer
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create Dataset Class
class SentimentDataset(Dataset):
    """
    Torch dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of texts (items are converted with ``str``).
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer (Hugging Face style) returning a mapping
            with 'input_ids' and 'attention_mask' tensors.
        max_len: Sequence length every item is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized text and label at position ``item`` as 1-D tensors."""
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly: `encode_plus` is deprecated in favor of
        # `__call__`, which accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch dimension; flatten removes it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(int(label), dtype=torch.long)
        }

# Create Datasets (positional numpy arrays, so the pandas index is irrelevant)
train_dataset = SentimentDataset(X_train.to_numpy(), y_train.to_numpy(), tokenizer)
val_dataset = SentimentDataset(X_val.to_numpy(), y_val.to_numpy(), tokenizer)

# Load Model
# Store the id <-> label mapping in the model config so the saved checkpoint
# reports sentiment names instead of generic LABEL_0 / LABEL_1 / ... names.
_id2label = {int(class_id): str(class_name) for class_id, class_name in label_map.items()}
_label2id = {class_name: class_id for class_id, class_name in _id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    id2label=_id2label,
    label2id=_label2id,
)

# Define Metrics
def compute_metrics(pred):
    """
    Compute evaluation metrics for the Trainer.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        Dict with 'accuracy' and support-weighted 'f1'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

# Training Arguments
# transformers renamed `evaluation_strategy` to `eval_strategy`. Because the
# install cell does not pin a version, pick whichever name this install defines
# (TrainingArguments is a dataclass, so its fields can be inspected).
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir=os.path.join(MODELS_PATH, 'indobert_finetuned'),
    num_train_epochs=1,              # Low epoch for demo purposes
    per_device_train_batch_size=8,   # Adjust based on GPU memory
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_steps=50,
    save_steps=100,                  # Multiple of eval_steps, as load_best_model_at_end requires
    load_best_model_at_end=True,
    report_to="none",                # Disable wandb/mlflow logging
    **{_eval_strategy_key: "steps"},
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train Model
print("Starting IndoBERT Fine-tuning...")
trainer.train()

# Evaluate
print("Evaluating Model...")
eval_result = trainer.evaluate()
print(f"Evaluation Results: {eval_result}")

# Save the final model together with its tokenizer so it can be reloaded from one folder
trainer.save_model(os.path.join(MODELS_PATH, 'indobert_finetuned_final'))
tokenizer.save_pretrained(os.path.join(MODELS_PATH, 'indobert_finetuned_final'))
print(f"Model saved to {os.path.join(MODELS_PATH, 'indobert_finetuned_final')}")

# Visualization: Confusion Matrix for IndoBERT
# NOTE: the "actual" labels are the RoBERTa-generated labels, not human annotations,
# so this measures agreement with RoBERTa rather than true accuracy.
print("Generating Confusion Matrix for IndoBERT...")
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
cm = confusion_matrix(predictions.label_ids, preds)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Confusion Matrix (IndoBERT)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig(os.path.join(OUTPUTS_PATH, 'confusion_matrix_indobert.png'))
plt.show()

# Visualization: Training History (Loss)
# Only log entries that carry a 'loss' key are training-loss records
history = trainer.state.log_history
loss_history = [x['loss'] for x in history if 'loss' in x]
steps = [x['step'] for x in history if 'loss' in x]

if loss_history:
    plt.figure(figsize=(10, 5))
    plt.plot(steps, loss_history, label='Training Loss')
    plt.title('IndoBERT Training Loss')
    plt.xlabel('Step')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(os.path.join(OUTPUTS_PATH, 'training_loss_indobert.png'))
    plt.show()

# Save Results

In [None]:
# Write the labeled dataset into the processed-data directory configured at the
# top of the notebook, so the same cell works on Colab (Drive path) and locally
# instead of always targeting the relative '../data/processed/' folder.
os.makedirs(DATA_ACTUAL_PATH, exist_ok=True)
output_path = os.path.join(DATA_ACTUAL_PATH, 'CoreTax_Combined_RoBERTa_Labeled.csv')
df_combined.to_csv(output_path, index=False)
print(f"Processed data saved to {output_path}")