In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from langchain_ollama.embeddings import OllamaEmbeddings

In [10]:
df = pd.read_csv("dataset.csv")
texts = df["text"].astype(str).tolist()
labels = df["label"].astype(str).tolist()

In [13]:
embeddings_model = OllamaEmbeddings(model="bge-m3:567m")

def get_ollama_embeddings(texts):
    if not isinstance(texts, list):
        texts = [texts]
    embeddings = embeddings_model.embed_documents(texts)
    return np.array(embeddings, dtype=np.float32)

In [14]:
X = get_ollama_embeddings(texts)
encoder = LabelEncoder()
y = encoder.fit_transform(labels)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X.shape[1],)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(np.unique(y)), activation='softmax')  # multi-class
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               262400    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 6)                 774       
                                                                 
Total params: 296,070
Trainable params: 296,070
Non-trainable params: 0
_________________________________________________________________


In [17]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    batch_size=32
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [18]:
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.4f}")

Test Accuracy: 0.9538


In [19]:
model.save("befoys_text_classifier_model.h5")
np.save("befoys_label_encoder.npy", encoder.classes_)

print("Model and label encoder saved!")

Model and label encoder saved!
