In [1]:
# Data manipulation
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [4]:
df = pd.read_csv('ensembled_mobil_listrik.csv')  # Replace with your file path
df = df.dropna()
X = df['text_cleaning']
map_sentiment = {'positif': 2, 'netral': 1, 'negatif': 0}
y = df['final_ensemble_sentiment'].map(map_sentiment)

In [14]:
df[df.final_ensemble_sentiment != df.sentimen].head(30)

Unnamed: 0,id_komentar,nama_akun,tanggal,text_cleaning,sentimen,mapped_text,sentimen_RoBERTa,sentimen_DistilBERT,sentimen_BERT,sentimen_INDOBERTweet,final_ensemble_sentiment
2,UgwqJqu6JMF4EH2CsVV4AaABAg,Fatih Al-Ayyubi,2023-08-04 10:17:57+00:00,baik kualitas kembang dulu baik kualitas motor...,positif,baik kualitas kembang dulu baik kualitas motor...,positif,netral,negatif,negatif,negatif
5,Ugx-zVY4ktd7JNUB6xV4AaABAg,Syarif Airlangga,2023-08-04 06:58:17+00:00,harga motor mahal masa harga mirip motor beat ...,positif,harga motor mahal masa harga mirip motor beat ...,negatif,netral,negatif,negatif,negatif
7,Ugyy3luBOOHQspWyBiR4AaABAg,Putut Parwoto,2023-08-04 01:04:18+00:00,proses kenal produk baru butuh waktu ganti ken...,negatif,proses kenal produk baru butuh waktu ganti ken...,netral,netral,netral,negatif,netral
12,UgzZY-ZoTvfiVoOv8Dt4AaABAg,gema,2023-08-03 04:23:25+00:00,kampung sekarang banyak banget bocil sama ce...,positif,kampung sekarang banyak banget bocil sama cewe...,netral,netral,negatif,negatif,negatif
14,UgwzritHDvD9naYvia54AaABAg,Khoirudin 22,2023-07-29 11:08:07+00:00,harga terlalu mahal,positif,harga terlalu mahal,negatif,negatif,negatif,negatif,negatif
19,UgxGrdA8dD3FNULB4Rd4AaABAg,Lukman Effendi,2023-07-28 10:23:53+00:00,bapak luhut panjaitan yth baik pakai dulu dina...,positif,bapak luhut panjaitan yth baik pakai dulu dina...,negatif,netral,netral,negatif,negatif
20,Ugw_osI9is5z9jHD9454AaABAg,Ricky Thunger,2023-07-27 09:20:51+00:00,subsidi motor jalan umum tolak system,negatif,subsidi motor jalan umum tolak sistem,netral,netral,netral,netral,netral
23,Ugym8DQyKPAzJntZCeZ4AaABAg,Deddy jagad semesta,2023-07-20 07:40:34+00:00,tambah bikin rakyat susah,netral,tambah bikin rakyat susah,negatif,negatif,negatif,negatif,negatif
25,UgxFMeOytVzzJo6iy5F4AaABAg,TopTrainers,2023-07-16 06:30:59+00:00,jual ev naik jual genset bakal naik ni,positif,jual ev naik jual genset bakal naik ni,netral,netral,netral,negatif,netral
26,UgxcvX9LMnprk35PR3d4AaABAg,parkir anovo,2023-07-13 05:17:26+00:00,contohin dulu semua menteri dpr mpr guna listri,negatif,contohkan dulu semua menteri dpr mpr guna listrik,netral,netral,netral,netral,netral


In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.1, random_state=42)
# show len
print('Train data:', len(X_train))
print('Test data:', len(X_test))

Train data: 1362
Test data: 152


# Traditional ML

In [6]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## Result

In [20]:
classes_weights = list(class_weight.compute_class_weight(class_weight='balanced',
                                                         classes=np.unique(df['final_ensemble_sentiment']),
                                                         y=df['final_ensemble_sentiment']))

In [30]:
df['final_ensemble_sentiment'].value_counts()  

final_ensemble_sentiment
negatif    1150
netral      235
positif     129
Name: count, dtype: int64

In [28]:
config = {
    'vectorizer': 'tfidf',  # Choose 'tfidf' or 'count'
    'vectorizer_params': {'max_features': 5000},  # Additional parameters for vectorizer
    'model_params': {},  # Additional parameters for models
    'random_state': 42,
}

model_dict = {
    'xgboost': xgb.XGBClassifier(objective='multi:softmax', num_class=3, **config['model_params'], class_weights=classes_weights),
    'randomforest': RandomForestClassifier(**config['model_params']),
    'svm': SVC(**config['model_params']),
    'logreg': LogisticRegression(**config['model_params']),
    'lightgbm': lgb.LGBMClassifier(**config['model_params'], class_weights=classes_weights),
}

In [32]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

vectorizer = TfidfVectorizer(**config['vectorizer_params'])

# List to store results and a dictionary to store predictions
results = []
predictions_dict = {}

# Iterate through the models and evaluate them
for model_name, model in model_dict.items():
    # Create the pipeline
    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('model', model),
    ])
    
    # Perform cross-validation for different metrics on the training set
    f1_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1_weighted')
    mean_f1_score = np.mean(f1_scores)
    
    accuracy_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    mean_accuracy_score = np.mean(accuracy_scores)
    
    precision_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='precision_weighted')
    mean_precision_score = np.mean(precision_scores)
    
    recall_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='recall_weighted')
    mean_recall_score = np.mean(recall_scores)
    
    # Fit the model on the full training set
    pipeline.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_test)
    
    # Store predictions in the dictionary
    predictions_dict[model_name] = y_pred
    
    # Compute metrics on the test set
    test_f1 = f1_score(y_test, y_pred, average='weighted')
    test_accuracy = accuracy_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred, average='weighted')
    test_recall = recall_score(y_test, y_pred, average='weighted')
    
    # Append the results (train and test metrics)
    results.append({
        'model': model_name,
        'train_f1': mean_f1_score,
        'train_accuracy': mean_accuracy_score,
        'train_precision': mean_precision_score,
        'train_recall': mean_recall_score,
        'test_f1': test_f1,
        'test_accuracy': test_accuracy,
        'test_precision': test_precision,
        'test_recall': test_recall
    })

# Print the results in a table format
print(f"{'Model':<15} {'Train F1':<10} {'Train Acc':<10} {'Train Prec':<10} {'Train Recall':<10} {'Test F1':<10} {'Test Acc':<10} {'Test Prec':<10} {'Test Recall':<10}")
for result in results:
    print(f"{result['model']:<15} {result['train_f1']:.4f}    {result['train_accuracy']:.4f}   {result['train_precision']:.4f}    {result['train_recall']:.4f}    "
          f"{result['test_f1']:.4f}    {result['test_accuracy']:.4f}   {result['test_precision']:.4f}    {result['test_recall']:.4f}")

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\anaconda3\envs\gnn\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\anaconda3\envs\gnn\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\envs\gnn\Lib\site-packages\sklearn\pipeline.py", line 476, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "c:\Users\user\anaconda3\envs\gnn\Lib\site-packages\lightgbm\sklearn.py", line 1284, in fit
    super().fit(
  File "c:\Users\user\anaconda3\envs\gnn\Lib\site-packages\lightgbm\sklearn.py", line 955, in fit
    self._Booster = train(
                    ^^^^^^
  File "c:\Users\user\anaconda3\envs\gnn\Lib\site-packages\lightgbm\engine.py", line 282, in train
    booster = Booster(params=params, train_set=train_set)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\envs\gnn\Lib\site-packages\lightgbm\basic.py", line 3637, in __init__
    train_set.construct()
  File "c:\Users\user\anaconda3\envs\gnn\Lib\site-packages\lightgbm\basic.py", line 2576, in construct
    self._lazy_init(
  File "c:\Users\user\anaconda3\envs\gnn\Lib\site-packages\lightgbm\basic.py", line 2144, in _lazy_init
    params_str = _param_dict_to_str(params)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\user\anaconda3\envs\gnn\Lib\site-packages\lightgbm\basic.py", line 537, in _param_dict_to_str
    raise TypeError(f"Unknown type of parameter:{key}, got:{type(val).__name__}")
TypeError: Unknown type of parameter:class_weights, got:dict


In [11]:
import json
# make array serializeable
# for key, value in predictions_dict.items():
#     predictions_dict[key] = value.tolist()

# Save the predictions to a JSON file
with open('predictions_ml_ensemble.json', 'w') as f:
    json.dump(predictions_dict, f)

# Save the results to a CSV file
results_df = pd.DataFrame(results)
results_df.to_csv('results_ml_ensemble.csv', index=False)

# BERT 

In [99]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from datasets import Dataset

In [103]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', num_labels=3)

def tokenize_function(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

X_train_bert, X_val_bert, y_train_bert, y_val_bert = train_test_split(X_train, y_train, test_size=0.111, random_state=42)

print('Train data:', len(X_train_bert))
print('Val data:', len(X_val_bert))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train data: 1210
Val data: 152


In [107]:
X_train_tokenized = tokenize_function(list(X_train_bert))
X_val_tokenized = tokenize_function(list(X_val_bert))

# Convert labels to lists of integers
y_train_list = y_train_bert.tolist()
y_val_list = y_val_bert.tolist()

In [108]:
train_dataset = Dataset.from_dict({
    'input_ids': [ids.tolist() for ids in X_train_tokenized['input_ids']],  # Convert tensors to lists
    'attention_mask': [mask.tolist() for mask in X_train_tokenized['attention_mask']],  # Convert tensors to lists
    'labels': y_train_list
})

eval_dataset = Dataset.from_dict({
    'input_ids': [ids.tolist() for ids in X_val_tokenized['input_ids']],  # Convert tensors to lists
    'attention_mask': [mask.tolist() for mask in X_val_tokenized['attention_mask']],  # Convert tensors to lists
    'labels': y_val_list
})

In [109]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [110]:
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # Use 'eval_strategy' in future versions
    logging_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Move the model to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Use Huggingface's Trainer API
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

  0%|          | 0/760 [00:00<?, ?it/s]

{'loss': 0.8617, 'grad_norm': 47.028038024902344, 'learning_rate': 4e-05, 'epoch': 1.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.8264140486717224, 'eval_accuracy': 0.631578947368421, 'eval_f1': 0.6106526478798622, 'eval_precision': 0.6313230994152047, 'eval_recall': 0.631578947368421, 'eval_runtime': 1.2545, 'eval_samples_per_second': 121.163, 'eval_steps_per_second': 15.145, 'epoch': 1.0}
{'loss': 0.5498, 'grad_norm': 69.61203002929688, 'learning_rate': 3e-05, 'epoch': 2.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.7540877461433411, 'eval_accuracy': 0.7368421052631579, 'eval_f1': 0.7174865823537059, 'eval_precision': 0.7186380872129767, 'eval_recall': 0.7368421052631579, 'eval_runtime': 1.2127, 'eval_samples_per_second': 125.338, 'eval_steps_per_second': 15.667, 'epoch': 2.0}
{'loss': 0.2641, 'grad_norm': 0.15904687345027924, 'learning_rate': 2e-05, 'epoch': 3.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 1.2714046239852905, 'eval_accuracy': 0.7105263157894737, 'eval_f1': 0.706668190326934, 'eval_precision': 0.7031774916013438, 'eval_recall': 0.7105263157894737, 'eval_runtime': 1.2572, 'eval_samples_per_second': 120.903, 'eval_steps_per_second': 15.113, 'epoch': 3.0}
{'loss': 0.1325, 'grad_norm': 0.01548162940889597, 'learning_rate': 1e-05, 'epoch': 4.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 1.5487794876098633, 'eval_accuracy': 0.7368421052631579, 'eval_f1': 0.7206778823064102, 'eval_precision': 0.7206697298802561, 'eval_recall': 0.7368421052631579, 'eval_runtime': 1.2231, 'eval_samples_per_second': 124.276, 'eval_steps_per_second': 15.535, 'epoch': 4.0}
{'loss': 0.0313, 'grad_norm': 0.011891782283782959, 'learning_rate': 0.0, 'epoch': 5.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 1.4958992004394531, 'eval_accuracy': 0.75, 'eval_f1': 0.7533178524894382, 'eval_precision': 0.759915080253826, 'eval_recall': 0.75, 'eval_runtime': 1.3121, 'eval_samples_per_second': 115.843, 'eval_steps_per_second': 14.48, 'epoch': 5.0}
{'train_runtime': 193.6529, 'train_samples_per_second': 31.241, 'train_steps_per_second': 3.925, 'train_loss': 0.3678650699163738, 'epoch': 5.0}


TrainOutput(global_step=760, training_loss=0.3678650699163738, metrics={'train_runtime': 193.6529, 'train_samples_per_second': 31.241, 'train_steps_per_second': 3.925, 'total_flos': 397959044313600.0, 'train_loss': 0.3678650699163738, 'epoch': 5.0})

In [111]:
trainer.state.log_history

[{'loss': 0.8617,
  'grad_norm': 47.028038024902344,
  'learning_rate': 4e-05,
  'epoch': 1.0,
  'step': 152},
 {'eval_loss': 0.8264140486717224,
  'eval_accuracy': 0.631578947368421,
  'eval_f1': 0.6106526478798622,
  'eval_precision': 0.6313230994152047,
  'eval_recall': 0.631578947368421,
  'eval_runtime': 1.2545,
  'eval_samples_per_second': 121.163,
  'eval_steps_per_second': 15.145,
  'epoch': 1.0,
  'step': 152},
 {'loss': 0.5498,
  'grad_norm': 69.61203002929688,
  'learning_rate': 3e-05,
  'epoch': 2.0,
  'step': 304},
 {'eval_loss': 0.7540877461433411,
  'eval_accuracy': 0.7368421052631579,
  'eval_f1': 0.7174865823537059,
  'eval_precision': 0.7186380872129767,
  'eval_recall': 0.7368421052631579,
  'eval_runtime': 1.2127,
  'eval_samples_per_second': 125.338,
  'eval_steps_per_second': 15.667,
  'epoch': 2.0,
  'step': 304},
 {'loss': 0.2641,
  'grad_norm': 0.15904687345027924,
  'learning_rate': 2e-05,
  'epoch': 3.0,
  'step': 456},
 {'eval_loss': 1.2714046239852905,
  'e

In [112]:
test_dataset = Dataset.from_dict({
    'input_ids': [ids.tolist() for ids in tokenize_function(list(X_test))['input_ids']],  # Convert tensors to lists
    'attention_mask': [mask.tolist() for mask in tokenize_function(list(X_test))['attention_mask']],  # Convert tensors to lists
    'labels': y_test.tolist()
})
y_test_list = y_test.tolist()

predictions = trainer.predict(test_dataset) 
y_pred = np.argmax(predictions.predictions, axis=1)
print(classification_report(y_test_list, y_pred))

  0%|          | 0/19 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.83      0.82      0.83        95
           1       0.39      0.39      0.39        18
           2       0.65      0.67      0.66        39

    accuracy                           0.73       152
   macro avg       0.62      0.63      0.62       152
weighted avg       0.73      0.73      0.73       152



# Lexicon

In [44]:
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK tokenizer if not already downloaded
nltk.download('punkt')

# Load sentiment lexicons
def load_lexicon(file_path):
    df = pd.read_csv(file_path, sep='\t', header=0)
    return dict(zip(df['word'], df['weight']))

positive_lexicon = load_lexicon('positive.tsv')
negative_lexicon = load_lexicon('negative.tsv')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [45]:
def get_sentiment_score(text, pos_lexicon, neg_lexicon):
    tokens = word_tokenize(text.lower())
    score = 0
    
    for token in tokens:
        if token in pos_lexicon:
            score += pos_lexicon[token]
        elif token in neg_lexicon:
            score += neg_lexicon[token]
    
    return score

def classify_sentiment(score):
    if score > 0:
        return 'Positive'
    elif score < 0:
        return 'Negative'
    else:
        return 'Neutral'

In [47]:
# nltk.download()
nltk.download('punkt_tab')
text = "hai merekam detail isak"
sentiment_score = get_sentiment_score(text, positive_lexicon, negative_lexicon)
sentiment = classify_sentiment(sentiment_score)

print(f"Sentiment Score: {sentiment_score}")
print(f"Sentiment: {sentiment}")


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...


Sentiment Score: 2
Sentiment: Positive


[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
