In [2]:
# Import Library
import pickle
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline

In [3]:
# Configuration: every value a reader may want to tune lives in this cell.
TRAINING_DATA = "data_manual.csv"           # Labelled training file (must contain LABEL_COLUMN)
NEW_DATA = "data_self_training.csv"         # Unlabelled file that will receive predicted labels
LABEL_COLUMN = "label_manual"               # Name of the label column in the training file
TEXT_COLUMN = "preprocessed_text"           # Name of the text/review column

# Candidate models. Note the 'Linear Regression' entry is actually a
# Logistic Regression classifier; the key is kept as-is because it is printed.
# NOTE(review): in the cells visible here these flags are only echoed below;
# the training cells run both models regardless of the values.
MODELS_TO_TRAIN = {
    'Linear Regression': True,
    'LLM': True,
}

MODEL_OUTPUT = "sentiment_model.pkl"
LABELED_OUTPUT = "data_labeled_result.csv"

# Echo the effective configuration so a saved notebook records what was run.
_enabled_models = [name for name, enabled in MODELS_TO_TRAIN.items() if enabled]
for _caption, _value in (
    ("Training data", TRAINING_DATA),
    ("New data", NEW_DATA),
    ("Label column", LABEL_COLUMN),
    ("Text column", TEXT_COLUMN),
    ("Models to train", _enabled_models),
    ("Model output", MODEL_OUTPUT),
    ("Labeled output", LABELED_OUTPUT),
):
    print(f"✓ {_caption}: {_value}")

✓ Training data: data_manual.csv
✓ New data: data_self_training.csv
✓ Label column: label_manual
✓ Text column: preprocessed_text
✓ Models to train: ['Linear Regression', 'LLM']
✓ Model output: sentiment_model.pkl
✓ Labeled output: data_labeled_result.csv


In [4]:
# Load the labelled training data and resolve which column holds the text.
try:
    data_train = pd.read_csv(TRAINING_DATA)
    print(f"✓ Training dataset loaded: {data_train.shape}")

    # The label column is mandatory; nothing downstream can work without it.
    if LABEL_COLUMN not in data_train.columns:
        raise ValueError(f"Kolom '{LABEL_COLUMN}' tidak ditemukan di {TRAINING_DATA}")

    # Fall back to the first string-like column when TEXT_COLUMN is absent.
    # The label column is excluded from the candidates: it is string-typed as
    # well, and picking it would train the model on the labels themselves.
    if TEXT_COLUMN not in data_train.columns:
        text_candidates = [
            col
            for col in data_train.select_dtypes(include=['object']).columns
            if col != LABEL_COLUMN
        ]
        if text_candidates:
            TEXT_COLUMN = text_candidates[0]
            print(f"⚠ Kolom text otomatis terdeteksi: '{TEXT_COLUMN}'")
        else:
            raise ValueError("Tidak dapat menemukan kolom text!")

    print(f"✓ Text column: '{TEXT_COLUMN}'")

except FileNotFoundError:
    # Explain the likely cause, then re-raise so the notebook stops here
    # instead of failing later with a confusing NameError.
    print(f"❌ File '{TRAINING_DATA}' tidak ditemukan!")
    print("Pastikan file berada di folder yang sama dengan notebook ini.")
    raise

✓ Training dataset loaded: (250, 4)
✓ Text column: 'preprocessed_text'


In [5]:
# Prepare training data: encode labels, drop unusable rows, split train/test.

# Map textual labels (Indonesian/English spellings and abbreviations) to codes.
label_mapping = {
    'negatif': 0, 'negative': 0, 'neg': 0,
    'netral': 1, 'neutral': 1, 'neu': 1,
    'positif': 2, 'positive': 2, 'pos': 2
}

# Normalise case/whitespace before mapping so 'Positif ' and 'positif' agree.
data_train[LABEL_COLUMN] = data_train[LABEL_COLUMN].astype(str).str.lower().str.strip()
data_train['label_encoded'] = data_train[LABEL_COLUMN].map(label_mapping)

# Drop rows whose label is not in the mapping (this includes missing labels,
# which become the string 'nan' after astype(str)).
invalid_mask = data_train['label_encoded'].isna()
if invalid_mask.any():
    invalid_labels = data_train.loc[invalid_mask, LABEL_COLUMN].unique()
    print(f"⚠ WARNING: Label tidak valid ditemukan: {invalid_labels}")
    # .copy() gives an independent frame so the column assignments below do not
    # write into a slice of the original.
    data_train = data_train.loc[~invalid_mask].copy()
    print(f"✓ Baris dengan label invalid dihapus. Shape baru: {data_train.shape}")

# Drop rows without text. Without this, astype(str) below would turn a missing
# value into the literal string 'nan' and the model would train on it.
missing_text_mask = data_train[TEXT_COLUMN].isna()
if missing_text_mask.any():
    print(f"⚠ WARNING: {int(missing_text_mask.sum())} baris tanpa teks dihapus")
    data_train = data_train.loc[~missing_text_mask].copy()

data_train['label_encoded'] = data_train['label_encoded'].astype(int)

print(f"✓ Label distribution:")
label_counts = data_train[LABEL_COLUMN].value_counts()
for label, count in label_counts.items():
    print(f"   - {label}: {count}")

# Features are raw strings; vectorisation happens inside the model pipeline.
X_train_full = data_train[TEXT_COLUMN].astype(str).values
y_train_full = data_train['label_encoded'].values

print(f"✓ X_train_full shape: {X_train_full.shape}")
print(f"✓ y_train_full shape: {y_train_full.shape}")

# Hold out 20% for evaluation; stratify to keep the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_full, y_train_full,
    test_size=0.2,
    random_state=42,
    stratify=y_train_full
)
print(f"\n✓ Data split:")
print(f"   - Training: {len(X_train)} samples")
print(f"   - Testing: {len(X_test)} samples")

✓ Label distribution:
   - negatif: 100
   - positif: 100
   - netral: 50
✓ X_train_full shape: (250,)
✓ y_train_full shape: (250,)

✓ Data split:
   - Training: 200 samples
   - Testing: 50 samples


In [6]:
# Training Model Linear Regression
def create_linear_model(max_features=2000, ngram_range=(1, 2)):
    """Build an unfitted TF-IDF + Logistic Regression text-classification pipeline.

    Despite the "Linear Regression" wording used elsewhere in this notebook,
    the classifier is a Logistic Regression.

    The liblinear solver is inherently binary. Relying on LogisticRegression to
    apply it to three classes is deprecated in recent scikit-learn releases, so
    the estimator is wrapped in OneVsRestClassifier. That keeps the same
    one-vs-rest scheme and still exposes predict, predict_proba and classes_.

    Args:
        max_features: Maximum TF-IDF vocabulary size.
        ngram_range: (min_n, max_n) n-gram range for the vectorizer.

    Returns:
        A scikit-learn Pipeline: TfidfVectorizer followed by the classifier.
    """
    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    binary_clf = LogisticRegression(solver='liblinear', max_iter=200, random_state=42)
    return make_pipeline(tfidf, OneVsRestClassifier(binary_clf))

# Fit the TF-IDF + Logistic Regression pipeline on the training split.
print("Training Linear Regression (Logistic Regression) Model...")
linear_model = create_linear_model()
linear_model.fit(X_train, y_train)
print("✓ Linear Regression model trained successfully")

# Cross-validation: cross_val_score clones the pipeline for every fold, so the
# fit above is not reused and does not leak into the fold scores.
cv_scores = cross_val_score(linear_model, X_train, y_train, cv=5, scoring='accuracy')
print(f"✓ Cross-validation scores: {cv_scores}")
print(f"✓ Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Evaluate on the held-out test split.
y_pred_linear = linear_model.predict(X_test)
acc_linear = accuracy_score(y_test, y_pred_linear)

# Pin the class codes explicitly. Without `labels`, the report raises when a
# class is absent from both y_test and the predictions (three target_names but
# fewer classes), and the confusion matrix silently changes shape.
class_codes = [0, 1, 2]
class_names = ['Negatif', 'Netral', 'Positif']

print(f"\n✓ Test set accuracy: {acc_linear:.4f} ({acc_linear*100:.2f}%)")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_linear,
                            labels=class_codes,
                            target_names=class_names,
                            zero_division=0))

print("\nConfusion Matrix:")
cm_linear = confusion_matrix(y_test, y_pred_linear, labels=class_codes)
print(cm_linear)
print("(Baris = True Label, Kolom = Predicted Label)")

Training Linear Regression (Logistic Regression) Model...
✓ Linear Regression model trained successfully
✓ Cross-validation scores: [0.7   0.625 0.65  0.75  0.6  ]
✓ Mean CV accuracy: 0.6650 (+/- 0.0539)

✓ Test set accuracy: 0.7800 (78.00%)

Classification Report:
              precision    recall  f1-score   support

     Negatif       0.79      0.95      0.86        20
      Netral       1.00      0.10      0.18        10
     Positif       0.76      0.95      0.84        20

    accuracy                           0.78        50
   macro avg       0.85      0.67      0.63        50
weighted avg       0.82      0.78      0.72        50


Confusion Matrix:
[[19  0  1]
 [ 4  1  5]
 [ 1  0 19]]
(Baris = True Label, Kolom = Predicted Label)


In [7]:
# Training Model LLM
def create_llm_model(model_name=None):
    """Build a sentiment predictor that maps texts to codes 0/1/2.

    Codes are 0 = negatif, 1 = netral, 2 = positif. If the `transformers`
    package can be imported, the predictor wraps a Hugging Face
    "sentiment-analysis" pipeline. Otherwise it falls back to a small
    keyword-counting heuristic over Indonesian sentiment words.

    Args:
        model_name: Optional Hugging Face checkpoint for the pipeline. With
            None the library default is used. In this notebook's recorded
            output that default is an English SST-2 DistilBERT whose confusion
            matrix shows no neutral predictions, so pass a checkpoint suited to
            the data's language and label set for meaningful results.

    Returns:
        A callable `predict(texts, ...) -> np.ndarray` of integer label codes.
        The transformers-backed predictor also accepts `batch_size` (default 32).
    """
    try:
        from transformers import pipeline
    except ImportError:
        print("⚠ Transformers tidak tersedia, menggunakan fallback model...")

        # Keyword lists for the heuristic; matching is by substring.
        neg_keywords = ['buruk', 'jelek', 'gagal', 'error', 'masalah', 'susah', 'sulit']
        pos_keywords = ['bagus', 'baik', 'mantap', 'keren', 'suka', 'puas', 'rekomendasi']
        neu_keywords = ['biasa', 'standar', 'lumayan', 'cukup', 'rata-rata']

        def fallback_llm_predict(texts):
            """Classify each text by which keyword family occurs most often."""
            predictions = []
            for text in texts:
                text_lower = str(text).lower()

                neg_count = sum(word in text_lower for word in neg_keywords)
                pos_count = sum(word in text_lower for word in pos_keywords)
                neu_count = sum(word in text_lower for word in neu_keywords)

                if neg_count > pos_count and neg_count > neu_count:
                    predictions.append(0)  # Negatif
                elif pos_count > neg_count and pos_count > neu_count:
                    predictions.append(2)  # Positif
                else:
                    # Ties, neutral keywords, or no keyword at all -> Netral
                    predictions.append(1)

            return np.array(predictions, dtype=int)

        return fallback_llm_predict

    print("✓ Transformers library available")

    # The pipeline is built lazily on the first call and then reused.
    # Rebuilding it on every call reloads the model each time.
    loaded = {}

    def llm_predict(texts, batch_size=32):
        """Predict label codes for `texts` with the pretrained pipeline."""
        if 'pipeline' not in loaded:
            if model_name is None:
                loaded['pipeline'] = pipeline("sentiment-analysis")
            else:
                loaded['pipeline'] = pipeline("sentiment-analysis", model=model_name)
        sentiment_pipeline = loaded['pipeline']

        predictions = []
        total = len(texts)

        for i in range(0, total, batch_size):
            batch_texts = [str(t) for t in texts[i:i + batch_size]]
            # truncation=True keeps over-long texts within the model's input limit.
            results = sentiment_pipeline(batch_texts, truncation=True)

            for result in results:
                label = result['label'].upper()

                # Map the pipeline's label string onto our codes (0, 1, 2).
                if 'NEG' in label or 'BAD' in label:
                    predictions.append(0)  # Negatif
                elif 'NEU' in label:
                    predictions.append(1)  # Netral
                else:
                    predictions.append(2)  # Positif

            # Report progress at the end and whenever a multiple of 100 is crossed.
            done = min(i + batch_size, total)
            if done == total or done // 100 > i // 100:
                print(f"  Progress: {done}/{total}")

        return np.array(predictions, dtype=int)

    return llm_predict

# Create the LLM predictor (or its keyword fallback when transformers is missing).
llm_predictor = create_llm_model()

# Smoke-test on a handful of samples before the full evaluation.
print("\nTesting LLM on sample data...")
sample_texts = X_test[:10]
sample_preds = llm_predictor(sample_texts)
print(f"\n✓ LLM predictions made for {len(sample_preds)} samples")

# Evaluate on the labelled test split when one exists.
if len(y_test) > 0:
    print("\nEvaluating LLM on test set...")

    # Only inference is best-effort. Reporting stays outside the try block so a
    # formatting problem cannot zero out a valid accuracy and bias the later
    # model selection.
    try:
        y_pred_llm = llm_predictor(X_test)
    except Exception as e:
        print(f"⚠ Error evaluating LLM: {str(e)}")
        y_pred_llm = None
        acc_llm = 0.0

    if y_pred_llm is not None:
        acc_llm = accuracy_score(y_test, y_pred_llm)

        # Pin the class codes: the predictor may never emit some class, and the
        # report/matrix must still cover all three.
        class_codes = [0, 1, 2]
        class_names = ['Negatif', 'Netral', 'Positif']

        print(f"✓ LLM test set accuracy: {acc_llm:.4f} ({acc_llm*100:.2f}%)")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred_llm,
                                    labels=class_codes,
                                    target_names=class_names,
                                    zero_division=0))

        print("\nConfusion Matrix:")
        cm_llm = confusion_matrix(y_test, y_pred_llm, labels=class_codes)
        print(cm_llm)
        print("(Baris = True Label, Kolom = Predicted Label)")
else:
    print("⚠ No labels available for LLM evaluation")
    acc_llm = 0.0


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


✓ Transformers library available

Testing LLM on sample data...


Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


  Progress: 10/10

✓ LLM predictions made for 10 samples

Evaluating LLM on test set...


Device set to use cpu


  Progress: 50/50
✓ LLM test set accuracy: 0.4400 (44.00%)

Classification Report:
              precision    recall  f1-score   support

     Negatif       0.44      0.95      0.60        20
      Netral       0.00      0.00      0.00        10
     Positif       0.43      0.15      0.22        20

    accuracy                           0.44        50
   macro avg       0.29      0.37      0.28        50
weighted avg       0.35      0.44      0.33        50


Confusion Matrix:
[[19  0  1]
 [ 7  0  3]
 [17  0  3]]
(Baris = True Label, Kolom = Predicted Label)


In [8]:
# Model comparison: tabulate test accuracy and pick the model to keep.
print(f"{'MODEL':25s} | {'TEST ACCURACY':15s} | {'STATUS':10s}")
print("-" * 70)

# One selection rule drives both the ✅ marker and the choice below. Ties go to
# the trainable linear model, so the table can no longer show no winner while
# a model is still selected.
linear_selected = acc_linear >= acc_llm
is_tie = acc_linear == acc_llm

print(f"{'Linear Regression':25s} | {acc_linear:.4f} ({acc_linear*100:.2f}%) | {'✅' if linear_selected else ' '}")
print(f"{'LLM':25s} | {acc_llm:.4f} ({acc_llm*100:.2f}%) | {'✅' if not linear_selected else ' '}")

# Select best model
if linear_selected:
    best_model = linear_model
    best_name = "Linear Regression"
    best_acc = acc_linear
    print(f"\n✓ Best model: Linear Regression")
    if is_tie:
        print(f"✓ Reason: Equal accuracy ({acc_linear:.4f}); trainable model preferred")
    else:
        print(f"✓ Reason: Higher accuracy ({acc_linear:.4f} vs {acc_llm:.4f})")
else:
    best_model = llm_predictor
    best_name = "LLM"
    best_acc = acc_llm
    print(f"\n✓ Best model: LLM")
    print(f"✓ Reason: Higher accuracy ({acc_llm:.4f} vs {acc_linear:.4f})")

print(f"✓ Final accuracy: {best_acc:.4f} ({best_acc*100:.2f}%)")


MODEL                     | TEST ACCURACY   | STATUS    
----------------------------------------------------------------------
Linear Regression         | 0.7800 (78.00%) | ✅
LLM                       | 0.4400 (44.00%) |  

✓ Best model: Linear Regression
✓ Reason: Higher accuracy (0.7800 vs 0.4400)
✓ Final accuracy: 0.7800 (78.00%)


In [9]:
# Produce the final model: refit the winner on all labelled data when it is trainable.

# Short tag that later cells branch on ("linear" or "llm").
model_type = "linear" if best_name == "Linear Regression" else "llm"

if model_type == "linear":
    # A fresh pipeline fitted on train + test rows, so no labelled sample is wasted.
    final_model = create_linear_model()
    final_model.fit(X_train_full, y_train_full)
    print("✓ Linear Regression retrained on full dataset")
else:
    # The LLM predictor is pretrained; reuse the selected callable as-is.
    final_model = best_model
    print("✓ LLM is pretrained, no retraining needed")

print(f"✓ Final model: {best_name}")
print(f"✓ Training samples: {len(X_train_full)}")

✓ Linear Regression retrained on full dataset
✓ Final model: Linear Regression
✓ Training samples: 250


In [10]:
# Save the model bundle (estimator + metadata) to MODEL_OUTPUT.

# Only the fitted scikit-learn pipeline can be pickled. The LLM predictor is a
# function nested inside create_llm_model(); pickle stores functions by
# qualified name and cannot serialise nested ones, so dumping it would raise.
# For the LLM case only metadata is stored.
model_is_picklable = model_type == "linear"
if not model_is_picklable:
    print("⚠ LLM predictor cannot be pickled; saving metadata only "
          "(recreate it with create_llm_model())")

model_data = {
    'model': final_model if model_is_picklable else None,
    'model_name': best_name,
    'model_type': model_type,
    'accuracy': best_acc,
    'label_mapping': {0: 'negatif', 1: 'netral', 2: 'positif'},
    'label_mapping_reverse': {'negatif': 0, 'netral': 1, 'positif': 2},
    'training_samples': len(X_train_full),
    # Plain Python ints keep the metadata independent of numpy scalar types.
    'label_distribution': {
        'negatif': int((y_train_full == 0).sum()),
        'netral': int((y_train_full == 1).sum()),
        'positif': int((y_train_full == 2).sum())
    },
    'text_column': TEXT_COLUMN,
    # Mirrors the defaults used by create_linear_model().
    'vectorizer_config': {
        'max_features': 2000,
        'ngram_range': (1, 2)
    } if model_type == "linear" else None
}

# NOTE: only unpickle this file from a trusted location; pickle.load can
# execute arbitrary code.
with open(MODEL_OUTPUT, 'wb') as f:
    pickle.dump(model_data, f)

print(f"✓ Model saved to: {MODEL_OUTPUT}")
print(f"✓ Model name: {best_name}")
print(f"✓ Model type: {model_type}")
print(f"✓ Accuracy: {best_acc:.4f} ({best_acc*100:.2f}%)")
print(f"✓ Training samples: {len(X_train_full)}")
print(f"✓ Label distribution:")
print(f"   - Negatif: {model_data['label_distribution']['negatif']}")
print(f"   - Netral: {model_data['label_distribution']['netral']}")
print(f"   - Positif: {model_data['label_distribution']['positif']}")

✓ Model saved to: sentiment_model.pkl
✓ Model name: Linear Regression
✓ Model type: linear
✓ Accuracy: 0.7800 (78.00%)
✓ Training samples: 250
✓ Label distribution:
   - Negatif: 100
   - Netral: 50
   - Positif: 100


In [11]:
# Load the unlabelled data to be predicted. A missing file is tolerated: the
# model is already saved, so later cells simply skip prediction.
try:
    data_new = pd.read_csv(NEW_DATA)
    print(f"✓ New dataset loaded: {data_new.shape}")

    # Prefer the same text column as training; otherwise fall back to the
    # first string-like column.
    if TEXT_COLUMN not in data_new.columns:
        text_candidates = data_new.select_dtypes(include=['object']).columns.tolist()
        if text_candidates:
            text_col = text_candidates[0]
            print(f"⚠ Text column auto-detected: '{text_col}'")
        else:
            raise ValueError("Tidak dapat menemukan kolom text di data baru!")
    else:
        text_col = TEXT_COLUMN

    # Rows are kept (not dropped) so predictions stay aligned with data_new.
    # Missing text becomes an empty string instead of the literal 'nan' that
    # astype(str) alone would produce.
    X_new = data_new[text_col].fillna("").astype(str).values
    print(f"✓ X_new shape: {X_new.shape}")

except FileNotFoundError:
    print(f"⚠ File '{NEW_DATA}' tidak ditemukan")
    print(f"✓ Model sudah tersimpan dan bisa digunakan nanti")
    data_new = None
    X_new = None
    text_col = None

✓ New dataset loaded: (750, 3)
✓ X_new shape: (750,)


In [12]:
# Make predictions on the new data and attach them (plus confidences) to data_new.
if data_new is not None:
    # Code -> label name; also used to build the confidence_<label> column names.
    label_decode = {0: 'negatif', 1: 'netral', 2: 'positif'}

    if model_type == "linear":
        predictions = final_model.predict(X_new)
        probabilities = final_model.predict_proba(X_new)

        data_new['predicted_label'] = [label_decode[p] for p in predictions]
        data_new['predicted_label_code'] = predictions

        # predict_proba columns follow final_model.classes_. Look each code up
        # rather than assuming positions 0/1/2, so a class missing from the
        # training data gets confidence 0.0 instead of shifting the columns or
        # raising an IndexError.
        fitted_classes = list(final_model.classes_)
        for code, name in label_decode.items():
            if code in fitted_classes:
                data_new[f'confidence_{name}'] = probabilities[:, fitted_classes.index(code)]
            else:
                data_new[f'confidence_{name}'] = 0.0
        data_new['confidence_max'] = probabilities.max(axis=1)

        print(f"✓ Linear Regression predictions made for {len(predictions)} samples")

    else:  # LLM
        predictions = np.asarray(final_model(X_new))

        data_new['predicted_label'] = [label_decode[p] for p in predictions]
        data_new['predicted_label_code'] = predictions

        # The predictor returns only label codes, so a fixed placeholder
        # confidence goes on the predicted class and 0.0 elsewhere. Assigning
        # whole columns avoids a row-by-row .loc[i] loop, which also depended
        # on data_new having a default RangeIndex.
        llm_default_confidence = 0.8
        for code, name in label_decode.items():
            data_new[f'confidence_{name}'] = np.where(
                predictions == code, llm_default_confidence, 0.0
            )
        data_new['confidence_max'] = llm_default_confidence

        print(f"✓ LLM predictions made for {len(predictions)} samples")
        print("⚠ Note: LLM predictions use default confidence scores")

✓ Linear Regression predictions made for 750 samples


In [13]:
# Save prediction results: write the labelled frame to LABELED_OUTPUT.
# Skipped when the new-data file was not found earlier (data_new is None).
has_new_data = data_new is not None
if has_new_data:
    data_new.to_csv(LABELED_OUTPUT, index=False)  # index=False: no row-number column
    print(f"✓ Predictions saved to: {LABELED_OUTPUT}")

✓ Predictions saved to: data_labeled_result.csv


In [14]:
# Prediction summary: per-label counts/percentages and confidence statistics.
if data_new is not None:
    display_names = {0: 'Negatif', 1: 'Netral', 2: 'Positif'}

    print(f"{'LABEL':10s} | {'COUNT':10s} | {'PERCENTAGE':12s}")
    print("-" * 40)

    # Tally each predicted code; sort_index() lists them in ascending code order.
    tally = pd.Series(predictions).value_counts().sort_index()
    n_predictions = len(predictions)
    for code, n_rows in tally.items():
        share = (n_rows / n_predictions) * 100
        print(f"{display_names[code]:10s} | {n_rows:10d} | {share:10.2f}%")

    if model_type != "linear":
        # LLM confidences are placeholders, so statistics would be meaningless.
        print(f"\n✓ Model type: LLM (using default confidence scores)")
    else:
        top_confidence = data_new['confidence_max']
        print(f"\n✓ Average confidence: {top_confidence.mean():.4f}")
        print(f"✓ Minimum confidence: {top_confidence.min():.4f}")
        print(f"✓ Maximum confidence: {top_confidence.max():.4f}")


LABEL      | COUNT      | PERCENTAGE  
----------------------------------------
Negatif    |        519 |      69.20%
Netral     |          8 |       1.07%
Positif    |        223 |      29.73%

✓ Average confidence: 0.5314
✓ Minimum confidence: 0.3384
✓ Maximum confidence: 0.8569


In [15]:
# Sample predictions: show the first five rows with confidence as a percentage.
if data_new is not None:
    print("First 5 predictions:")
    print("-" * 70)

    sample_cols = [text_col, 'predicted_label', 'confidence_max']

    # assign() returns a new frame, so data_new keeps its numeric confidences.
    sample_data = data_new.loc[:, sample_cols].iloc[:5].assign(
        confidence_max=lambda frame: [f"{x*100:.1f}%" for x in frame['confidence_max']]
    )

    print(sample_data.to_string(index=False))

First 5 predictions:
----------------------------------------------------------------------
                                               preprocessed_text predicted_label confidence_max
eror pace eror pace gimana baik uninstall login eror tolong baik         negatif          55.8%
                                                           rekam         negatif          37.8%
                                                          akurat         positif          42.6%
                             bantu hitung jarak bener bener mula         positif          51.8%
                                                          ganggu         negatif          44.2%


In [16]:
# # Final Summary
# print("✓ Training completed successfully!")
# print(f"✓ Best model: {best_name}")
# print(f"✓ Model type: {model_type}")
# print(f"✓ Model accuracy: {best_acc:.4f} ({best_acc*100:.2f}%)")
# print(f"✓ Training samples: {len(X_train_full)}")

# if data_new is not None:
#     print(f"✓ New data predicted: {len(predictions)} samples")
#     print(f"✓ Output files:")
#     print(f"   - Model: {MODEL_OUTPUT}")
#     print(f"   - Predictions: {LABELED_OUTPUT}")
# else:
#     print(f"✓ Model saved: {MODEL_OUTPUT}")
#     print(f"⚠ New data not found - predictions skipped")

# print("\n" + "=" * 70)
# print("USAGE INSTRUCTIONS")
# print("=" * 70)

# print(f"""
# Untuk menggunakan model di lain waktu:

# 1. Load model:
#    import pickle
#    with open('{MODEL_OUTPUT}', 'rb') as f:
#        model_data = pickle.load(f)
#        model = model_data['model']
#        model_type = model_data['model_type']

# 2. Predict new text:
#    text = ["contoh teks review"]
   
#    if model_type == "linear":
#        prediction = model.predict(text)
#        probability = model.predict_proba(text)
#    else:  # LLM
#        prediction = model(text)  # LLM is a function
       
#    # Convert to label
#    label_mapping = {{0: 'negatif', 1: 'netral', 2: 'positif'}}
#    predicted_label = label_mapping[prediction[0]]
# """)
# print("✓ SCRIPT COMPLETED SUCCESSFULLY!")