In [None]:
# Import Libraries
import json

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import BertConfig
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import TFBertForSequenceClassification

In [None]:
# Load Dataset
# Read the complaint CSV (path is relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='dataset_keluhan_bisnis_100k_balanced.csv')

In [None]:
# Preprocessing Dataset
def clean_text(text):
    """Normalize one raw text value before tokenization (simple example).

    Args:
        text: Value of a text cell. Usually a ``str``; missing values
            (``None``/``NaN``) and other scalars are tolerated.

    Returns:
        The lower-cased text. Missing values become an empty string; other
        non-string scalars are converted with ``str`` before lower-casing.
    """
    if isinstance(text, str):
        return text.lower()
    # Missing cells typically arrive from pandas as float NaN, which has no
    # .lower(); map them to an empty string instead of raising.
    if pd.isna(text):
        return ''
    return str(text).lower()

# Run clean_text over every complaint and keep the result in a new column.
data['cleaned_text'] = data['Keluhan'].map(clean_text)

In [None]:
# Encode Labels
# Fit the encoder on the category column and store the integer ids next to it.
label_encoder = LabelEncoder()
data['encoded_category'] = label_encoder.fit_transform(y=data['Category Bidang'])
# classes_ is the 1-D array of distinct labels, so its length is the class count.
num_classes = label_encoder.classes_.shape[0]

In [None]:
# Split Data
# 80/20 split of cleaned text and encoded labels, stratified on the label so both
# parts keep the same class proportions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'], data['encoded_category'],
    stratify=data['encoded_category'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Load IndoBERT Tokenizer and Model
# The classification head is sized to the number of encoded categories.
model = TFBertForSequenceClassification.from_pretrained(
    'indobenchmark/indobert-base-p1',
    num_labels=num_classes,
)
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/656M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenization Function
def bert_encode(texts, tokenizer, max_len=128):
    """Tokenize texts into fixed-length id and attention-mask matrices.

    Args:
        texts: Iterable of strings (for example a list or a pandas Series).
        tokenizer: Hugging Face tokenizer; it is called once on the whole batch.
        max_len: Every sequence is truncated or padded to this many tokens.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of int32 NumPy arrays, each of
        shape ``(number_of_texts, max_len)``.
    """
    batch = list(texts)
    if not batch:
        # Keep the 2-D contract for empty input instead of asking the
        # tokenizer to encode an empty batch.
        empty = np.zeros((0, max_len), dtype=np.int32)
        return empty, empty.copy()

    encoded = tokenizer(
        batch,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
    )
    # No return_tensors here: the tokenizer hands back plain lists, one 1-D row
    # per text. The earlier per-text 'tf' tensors each had a leading batch axis
    # of size 1, so np.array over them produced (N, 1, max_len) instead of
    # (N, max_len).
    input_ids = np.asarray(encoded['input_ids'], dtype=np.int32)
    attention_masks = np.asarray(encoded['attention_mask'], dtype=np.int32)
    return input_ids, attention_masks

In [None]:
# Tokenize Data
max_len = 128
# Encode the training split first, then the test split, with the same length.
(X_train_ids, X_train_masks), (X_test_ids, X_test_masks) = (
    bert_encode(split, tokenizer, max_len) for split in (X_train, X_test)
)


In [None]:
!pip install --upgrade tensorflow



In [None]:
from tensorflow.keras.optimizers import Adam

# Compile Model
# The model's output exposes raw logits (read as `.logits` by the prediction
# helpers), so the loss has to apply the softmax itself via from_logits=True.
# The plain 'sparse_categorical_crossentropy' string would treat the logits as
# probabilities.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
# The 'adam' identifier builds the optimizer with Keras' default learning rate
# (1e-3); set the 2e-5 fine-tuning rate that the Adam(learning_rate=2e-5)
# variant of this cell was meant to use.
model.optimizer.learning_rate = 2e-5

In [None]:
# Callbacks
# Both callbacks monitor the validation loss.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6,
)
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Callbacks - built from the EarlyStopping / ReduceLROnPlateau names imported at the top.
# NOTE(review): this cell repeats the callbacks cell directly above it with the
# same arguments; one of the two can be removed.
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_loss',
)
reduce_lr = ReduceLROnPlateau(
    min_lr=1e-6,
    patience=2,
    factor=0.5,
    monitor='val_loss',
)

In [None]:
# Tokenization Function
def bert_encode(texts, tokenizer, max_len=128):
    """Encode each text to ``max_len`` tokens and stack the results.

    NOTE(review): ``bert_encode`` is defined more than once in this notebook;
    whichever definition was executed last is the one in effect.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Tokenizer providing ``encode_plus``.
        max_len: Length every sequence is truncated or padded to.

    Returns:
        Tuple ``(input_ids, attention_masks)`` produced by ``tf.stack`` over
        the per-text rows, one row per text.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='tf',  # TensorFlow tensors with a leading batch axis
    )
    encodings = [tokenizer.encode_plus(sample, **encode_options) for sample in texts]

    # Index [0] drops the per-text batch axis before the rows are stacked.
    stacked_ids = tf.stack([enc['input_ids'][0] for enc in encodings], axis=0)
    stacked_masks = tf.stack([enc['attention_mask'][0] for enc in encodings], axis=0)
    return stacked_ids, stacked_masks

In [None]:
# Import necessary libraries
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf # Import tensorflow
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau # Correct import for EarlyStopping and ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# ... your existing code ...

# Callbacks - EarlyStopping and ReduceLROnPlateau are the tf.keras.callbacks
# classes imported at the top of this cell, so the short names are used here.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6,
)

# ... rest of your code ...

In [None]:
# Load IndoBERT tokenizer and configuration
tokenizer = BertTokenizer.from_pretrained('indolem/indobert-base-uncased')
# Size the classification head to the encoded label set. Without num_labels the
# configuration falls back to two labels, so the model could only ever score
# two categories no matter how many the label encoder knows.
config = BertConfig.from_pretrained('indolem/indobert-base-uncased', num_labels=num_classes)

# Create the TensorFlow model from the checkpoint's PyTorch weights (from_pt=True).
# The classifier layer is newly initialized by this call, so the model has to be
# compiled and trained before its predictions are meaningful.
model = TFBertForSequenceClassification.from_pretrained(
    'indolem/indobert-base-uncased',
    from_pt=True,
    config=config,
)

# ... (rest of your code remains the same) ...

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenization Function
def bert_encode(texts, tokenizer, max_len=128):
    """Turn texts into stacked token-id and attention-mask tensors.

    NOTE(review): ``bert_encode`` is defined more than once in this notebook;
    whichever definition was executed last is the one in effect.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Tokenizer providing ``encode_plus``.
        max_len: Length every sequence is truncated or padded to.

    Returns:
        Tuple ``(input_ids, attention_masks)``, each the ``tf.stack`` of one
        row per text.
    """
    def _encode_one(sample):
        # encode_plus returns tensors with a leading batch axis; [0] removes it.
        enc = tokenizer.encode_plus(
            sample,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='tf',
        )
        return enc['input_ids'][0], enc['attention_mask'][0]

    rows = [_encode_one(sample) for sample in texts]
    id_rows = [ids for ids, _ in rows]
    mask_rows = [mask for _, mask in rows]
    return tf.stack(id_rows, axis=0), tf.stack(mask_rows, axis=0)

In [None]:
def predict_bidang_and_konsultan(text, model, tokenizer, label_encoder, df, max_len=128):
    """Predict the field (bidang) for one text and look up a consultant for it.

    Args:
        text: Text to classify.
        model: Object whose ``predict`` output carries the logits either in a
            ``.logits`` attribute or as its first element.
        tokenizer: Tokenizer forwarded to ``bert_encode``.
        label_encoder: Fitted encoder used to turn the class index into a label.
        df: DataFrame with ``'Category Bidang'`` and ``'Nama Konsultan'`` columns.
        max_len: Sequence length forwarded to ``bert_encode``.

    Returns:
        Tuple ``(predicted_bidang, konsultan)``. ``konsultan`` is the first
        matching ``'Nama Konsultan'`` value, or ``None`` when ``df`` has no row
        for the predicted field.
    """
    input_ids, attention_mask = bert_encode([text], tokenizer, max_len)
    predictions = model.predict([input_ids, attention_mask], verbose=0)

    # Accept both output styles, the same way predict_top_4 does.
    if hasattr(predictions, 'logits'):
        logits = predictions.logits[0]
    else:
        logits = predictions[0]

    top_index = int(np.argmax(np.asarray(logits)))
    predicted_bidang = label_encoder.inverse_transform([top_index])[0]

    # Take the consultant name from the first row of the predicted field; an
    # empty match yields None instead of an IndexError from .iloc[0].
    matches = df.loc[df['Category Bidang'] == predicted_bidang, 'Nama Konsultan']
    konsultan = matches.iloc[0] if not matches.empty else None

    return predicted_bidang, konsultan

# Example Prediction
example_text = "Kesulitan dalam strategi pemasaran karena saya masih pemula dalam pengembangannya"
# Classify the sample text and look up a consultant in the loaded dataset.
predicted_bidang, konsultan = predict_bidang_and_konsultan(
    text=example_text,
    model=model,
    tokenizer=tokenizer,
    label_encoder=label_encoder,
    df=data,
)
print(
    f"Bidang yang direkomendasikan: {predicted_bidang}",
    f"Nama Konsultan: {konsultan}",
    sep="\n",
)

Bidang yang direkomendasikan: Industri Pengembangan Bisnis
Nama Konsultan: Ardhi Wijaya


In [None]:
def predict_top_4(text, model, tokenizer, label_encoder, max_len=128, top_k=4):
    """Predict the highest-scoring categories for one text (four by default).

    Args:
        text: Text to classify.
        model: Object whose ``predict`` output carries the logits either in a
            ``.logits`` attribute or as its first element.
        tokenizer: Tokenizer forwarded to ``bert_encode``.
        label_encoder: Fitted encoder used to turn class indices into labels.
        max_len: Sequence length forwarded to ``bert_encode``.
        top_k: Number of categories to return. Fewer are returned when the
            model has fewer outputs than ``top_k``.

    Returns:
        List of ``(category, score)`` tuples ordered from highest to lowest
        score. The scores are the model's raw logits, not probabilities.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    input_ids, attention_mask = bert_encode([text], tokenizer, max_len)

    # verbose=0 keeps the progress bar out of the output, matching
    # predict_bidang_and_konsultan.
    predictions = model.predict([input_ids, attention_mask], verbose=0)

    if hasattr(predictions, 'logits'):
        logits = predictions.logits[0]  # logits row for the single input
    else:
        logits = predictions[0]  # output is already the array of logits
    logits = np.asarray(logits)

    # Sort descending first and slice afterwards, so top_k == 0 yields an empty
    # result rather than the whole array (which a [-0:] slice would give).
    ranked_indices = np.argsort(logits)[::-1][:top_k]

    ranked_labels = label_encoder.inverse_transform(ranked_indices)
    ranked_scores = logits[ranked_indices]

    return list(zip(ranked_labels, ranked_scores))

# Example Prediction
# The sample text contains tab characters, written here as \t escapes.
example_text = "Masalah dengan efisiensi\tdan Kesulitan dalam strategi Hambatan dalam penerapan kebijakan\tKetidakefisienan pada pemasaran digital\t"
print(
    predict_top_4(
        text=example_text,
        model=model,
        tokenizer=tokenizer,
        label_encoder=label_encoder,
    )
)


[('Komunikasi Pemasaran', 0.2757853), ('Industri Pengembangan Bisnis', 0.16685967)]


In [None]:
# Save Label Encoder as JSON
# Pair each integer class id with its label, in the encoder's class order.
label_map = dict(enumerate(label_encoder.classes_))
with open('label_encoder.json', 'w') as label_file:
    label_file.write(json.dumps(label_map))
print("Label encoder saved as label_encoder.json")

Label encoder saved as label_encoder.json


In [None]:
# Save Model as TFLite
# Convert the in-memory Keras model and write the converted bytes to disk.
converter = tf.lite.TFLiteConverter.from_keras_model(model=model)
tflite_model = converter.convert()
with open('model.tflite', mode='wb') as tflite_file:
    tflite_file.write(tflite_model)
print("Model saved as model.tflite")

Model saved as model.tflite
