In [1]:
# Import Library
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load Dataset
manual_path = "data_manual.csv"
self_path = "data_self_training.csv"

data_manual = pd.read_csv(manual_path)
data_self = pd.read_csv(self_path)

print("Manual dataset:", data_manual.shape)
print("Self-training dataset:", data_self.shape)

Manual dataset: (250, 4)
Self-training dataset: (750, 3)


In [3]:
# Deteksi otomatis kolom teks & label

# cari text column
text_candidates = data_manual.select_dtypes(include=['object']).columns.tolist()
text_col = text_candidates[0]  

# cari label manual
label_col = "label_manual"

if label_col not in data_manual.columns:
    raise ValueError("data_manual.csv harus memiliki kolom 'label_manual'")

print("Text column =", text_col)
print("Label column =", label_col)

Text column = review
Label column = label_manual


In [4]:
# Data for training
X_manual = data_manual[text_col].astype(str).values
y_manual = data_manual[label_col].astype(int).values

# Untuk evaluasi di self-training, kita perlu ground-truth dari file terpisah
true_label_candidates = [c for c in data_self.columns if "true" in c.lower() or "label" in c.lower()]
if not true_label_candidates:
    raise ValueError("Self-training dataset harus memiliki kolom label asli untuk evaluasi.")
true_label_col = true_label_candidates[0]

X_self = data_self[text_col].astype(str).values
y_self_true = data_self[true_label_col].astype(int).values

In [5]:
# Model A — Rekomendasi: TF-IDF + Logistic Regression
tfidf_config = dict(max_features=2000, ngram_range=(1,1))
model_A = make_pipeline(TfidfVectorizer(**tfidf_config),
                        LogisticRegression(solver='liblinear', max_iter=200))

model_A.fit(X_manual, y_manual)
y_pred_A = model_A.predict(X_self)

In [6]:
# Model B — LLM (fallback ML jika LLM tidak tersedia)
use_transformers = False
try:
    from transformers import pipeline
    print("Trying transformers LLM sentiment pipeline...")
    pipe = pipeline("sentiment-analysis")
    use_transformers = True

    def llm_predict(texts, batch=32):
        pred_list = []
        for i in range(0, len(texts), batch):
            batch_text = list(texts[i:i+batch])
            outs = pipe(batch_text)
            for o in outs:
                lab = o['label'].upper()
                if lab.startswith("NEG"):
                    pred_list.append(0)
                elif lab.startswith("NEU"):
                    pred_list.append(1)
                else:
                    pred_list.append(2)
        return np.array(pred_list)

    y_pred_B = llm_predict(X_self)

except:
    print("Transformers tidak tersedia. Fallback ke MultinomialNB.")
    model_B = make_pipeline(TfidfVectorizer(**tfidf_config), MultinomialNB())
    model_B.fit(X_manual, y_manual)
    y_pred_B = model_B.predict(X_self)

  from .autonotebook import tqdm as notebook_tqdm
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Trying transformers LLM sentiment pipeline...


Device set to use cpu


In [7]:
# Evaluation
acc_A = accuracy_score(y_self_true, y_pred_A)
acc_B = accuracy_score(y_self_true, y_pred_B)

print("\n=== Model A (TF-IDF + Logistic Regression) ===")
print("Akurasi:", acc_A)
print(classification_report(y_self_true, y_pred_A, target_names=['neg','neu','pos']))
print("Confusion Matrix:\n", confusion_matrix(y_self_true, y_pred_A))

print("\n=== Model B (LLM / NB Fallback) ===")
print("Transformers digunakan:", use_transformers)
print("Akurasi:", acc_B)
print(classification_report(y_self_true, y_pred_B, target_names=['neg','neu','pos']))
print("Confusion Matrix:\n", confusion_matrix(y_self_true, y_pred_B))


=== Model A (TF-IDF + Logistic Regression) ===
Akurasi: 0.7346666666666667
              precision    recall  f1-score   support

         neg       0.70      0.84      0.76       335
         neu       0.00      0.00      0.00        85
         pos       0.77      0.82      0.80       330

    accuracy                           0.73       750
   macro avg       0.49      0.55      0.52       750
weighted avg       0.65      0.73      0.69       750

Confusion Matrix:
 [[280   0  55]
 [ 60   0  25]
 [ 59   0 271]]

=== Model B (LLM / NB Fallback) ===
Transformers digunakan: True
Akurasi: 0.49466666666666664
              precision    recall  f1-score   support

         neg       0.47      0.98      0.63       335
         neu       0.00      0.00      0.00        85
         pos       0.81      0.13      0.23       330

    accuracy                           0.49       750
   macro avg       0.43      0.37      0.29       750
weighted avg       0.57      0.49      0.38       750

Co

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
# ------------------------------------------------------------
# 7. Export hasil prediksi
# ------------------------------------------------------------
out_eval = "self_eval_with_predictions.csv"
df_eval = data_self.copy()
df_eval["pred_model_A"] = y_pred_A
df_eval["pred_model_B"] = y_pred_B
df_eval.to_csv(out_eval, index=False)

print("\nSaved evaluation file:", out_eval)
print("Clean script finished.")


Saved evaluation file: self_eval_with_predictions.csv
Clean script finished.
