In [None]:
import os

import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification

# Configurations
FILE_PATH = 'datasets/datasets_beauty_product.csv'  # CSV providing the 'reviews' and 'sentiment' columns
EPOCH = 20
BATCH = 32
LEARNING_RATE = 1e-5
# albert-base-v2 only has 512 position embeddings; a larger limit lets over-long
# reviews through truncation and they then index past the embedding table.
MAX_LENGTH = 512
BASE_PRETRAINED_MODEL = 'albert-base-v2'
NUM_LABELS = 3  # Must equal the number of entries in label_mapping
MODEL_PATH = "models/my-product-data-202412221621.h5"

# Read and clean dataset
df = pd.read_csv(FILE_PATH)
valid_sentiments = ['Positive', 'Neutral', 'Negative']
# Keep only rows with a known sentiment label AND a non-missing review text;
# a NaN review would otherwise be handed to the tokenizer as a float.
df = df[df['sentiment'].isin(valid_sentiments)].dropna(subset=['reviews'])

if df.empty:
    raise ValueError("Dataset is empty after filtering. Check sentiment labels.")

# Label mapping (sentiment name -> class index used by the classifier)
label_mapping = {'Positive': 0, 'Neutral': 1, 'Negative': 2}

# Prepare data: 80/20 train/test split with a fixed seed for reproducibility
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['reviews'].astype(str).values,  # force text so non-string cells cannot break tokenization
    df['sentiment'].values,
    test_size=0.2,
    random_state=42
)

# Convert sentiment names to their integer class indices
train_labels_numeric = [label_mapping[label] for label in train_labels]
test_labels_numeric = [label_mapping[label] for label in test_labels]

# Tokenization: truncate to MAX_LENGTH and pad each split to its longest sequence
tokenizer = AlbertTokenizer.from_pretrained(BASE_PRETRAINED_MODEL)
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')

# Extract NumPy arrays
train_input_ids = train_encodings['input_ids'].numpy()
train_attention_mask = train_encodings['attention_mask'].numpy()
test_input_ids = test_encodings['input_ids'].numpy()
test_attention_mask = test_encodings['attention_mask'].numpy()

# Create tf.data.Dataset
# Model.fit does not shuffle tf.data inputs itself, so shuffle here (before
# batching) to give every epoch a different batch composition. The buffer
# covers the whole training set, and the seed keeps runs reproducible.
train_dataset = (
    tf.data.Dataset.from_tensor_slices(((train_input_ids, train_attention_mask), train_labels_numeric))
    .shuffle(buffer_size=len(train_labels_numeric), seed=42)
    .batch(BATCH)
)
# Evaluation order does not matter, so the test split is only batched
test_dataset = tf.data.Dataset.from_tensor_slices(((test_input_ids, test_attention_mask), test_labels_numeric)).batch(BATCH)

# Load and compile model
model = TFAlbertForSequenceClassification.from_pretrained(BASE_PRETRAINED_MODEL, num_labels=NUM_LABELS)
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# The classification head returns raw logits (no softmax applied), so the loss
# must be told so; the string alias 'sparse_categorical_crossentropy' assumes
# probabilities and would compute the loss on unnormalised scores.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# Fit on the training batches for the configured number of epochs
model.fit(x=train_dataset, epochs=EPOCH)

# Score the held-out split; the result is indexed as [loss, accuracy]
eval_results = model.evaluate(x=test_dataset)
for caption, value in zip(("Test loss:", "Test accuracy:"), eval_results):
    print(caption, value)

# Predict sentiments for a few hand-written sample reviews
new_texts = [
    'The serum feels very sticky on my skin.',
    'It is really works in my skin.',
    'The texture is great, but the brightening effect takes a long time to show.'
]

new_encodings = tokenizer(new_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')
predictions = model.predict([new_encodings['input_ids'], new_encodings['attention_mask']])
logits = predictions.logits
# The highest-scoring class index per sample is the predicted label
predicted_labels = tf.argmax(logits, axis=1).numpy()
# Invert label_mapping once (class index -> sentiment name) instead of
# rebuilding and scanning the key/value lists for every prediction.
id_to_label = {index: name for name, index in label_mapping.items()}
predicted_sentiments = [id_to_label[int(label)] for label in predicted_labels]
print("Predicted sentiments:", predicted_sentiments)

# Save model weights
# Create the target directory first so that a missing folder cannot make the
# save fail and discard the finished training run.
model_dir = os.path.dirname(MODEL_PATH)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)
model.save_weights(MODEL_PATH)


All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.7099828124046326
Test accuracy: 0.9047619104385376
Predicted sentiments: ['Negative', 'Positive', 'Neutral']


In [None]:
"""Inference pipeline for sentiment analysis: load the fine-tuned ALBERT weights and classify sample reviews."""

from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
import tensorflow as tf

# Inference configuration
# Sentiment name -> class index; the model is built with one output per entry
label_mapping = {'Positive': 0, 'Neutral': 1, 'Negative': 2}
# Alternative six-class mapping, kept for reference:
# label_mapping = {'Very Positive': 0, 'Very Negative': 1, 'Mixed': 2, 'Positive': 3, 'Negative': 4, 'Neutral': 5}
# albert-base-v2 only has 512 position embeddings; a larger limit lets over-long
# texts through truncation and they then index past the embedding table.
MAX_LENGTH = 512
NUM_LABELS = len(label_mapping)  # Derived so it cannot drift from the mapping above
MODEL_PATH = 'models/my-product-data-202412221621.h5'
BASE_PRETRAINED_MODEL = 'albert-base-v2'


# Load the tokenizer belonging to the base checkpoint
tokenizer = AlbertTokenizer.from_pretrained(BASE_PRETRAINED_MODEL)
# Build the architecture from the base checkpoint; at this point the classifier
# head is newly initialised (hence the "You should probably TRAIN" warning)
model = TFAlbertForSequenceClassification.from_pretrained(BASE_PRETRAINED_MODEL, num_labels=NUM_LABELS)
# Overwrite the model's weights with those stored at MODEL_PATH
# (presumably the fine-tuned weights written by the training cell)
model.load_weights(MODEL_PATH)

# Sample reviews to classify
new_texts = [
    'This brightening serum made my skin feel so oily and I see tehre is no brightening effect after weeks of use.',
    'It is an average serum. My skin feels smooth, but the brightening effect is subtle.',
    'I really love how this serum has brightened my complexion and reduced redness on my face.',
    'Unfortunately, this serum caused irritation, and I had to discontinue using it.',
    'It is okay. My skin feels hydrated, but I was expecting a more noticeable improvement.',
    'After a month of using this brightening serum, I can see a visible glow in my skin!',
    'This serum is too heavy for my skin type, and I did not notice any brightening effects.',
    'It is fine, it brightens my skin a little, but I expected faster results.',
    'This brightening serum has transformed my dull skin into a radiant glow!',
    'I did not see any difference after using this serum for several weeks, which was disappointing.'
]

new_encodings = tokenizer(new_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors='tf')

new_input_ids = new_encodings['input_ids'].numpy()
new_attention_mask = new_encodings['attention_mask'].numpy()


# Run the predictions and take the logits from the returned TFSequenceClassifierOutput
predictions = model.predict([new_input_ids, new_attention_mask]) # type: ignore
logits = predictions.logits
# The highest-scoring class index per sample is the predicted label
predicted_labels = tf.argmax(logits, axis=1).numpy()
# Invert label_mapping once (class index -> sentiment name) instead of
# rebuilding and scanning the key/value lists for every prediction.
id_to_label = {index: name for name, index in label_mapping.items()}
predicted_sentiments = [id_to_label[int(label)] for label in predicted_labels]
print(f'Predicted sentiments: {predicted_sentiments}')

All PyTorch model weights were used when initializing TFAlbertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFAlbertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted sentiments: ['Positive', 'Neutral', 'Positive', 'Negative', 'Neutral', 'Positive', 'Negative', 'Neutral', 'Positive', 'Negative']
