# Product Text Classification with OpenAI Embeddings

Steps:

- Load and filter dataset

- Create embeddings with text-embedding-3-small.

- Split train/test (stratify by category).

- Train Random Forest classifier.

- Evaluate with accuracy & classification report.

In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
from openai import OpenAI
client = OpenAI(api_key="OPENAI_API_KEY")

### Import and filter dataset based on a minimum number of samples per class

In [3]:
df_input = pd.read_csv('train.csv')

In [12]:
n_sample = 1000
min_samples = 3
random_state = 42

# Sample reproducibly
df_sample = df_input.sample(n=n_sample, random_state=random_state)

# Drop columns
df_sample = df_sample.drop(["language", "label_quality"], axis=1)

# Keep only categories with at least `min_samples`
valid_categories = df_sample['category'].value_counts()
valid_categories = valid_categories[valid_categories >= min_samples].index
df_sample = df_sample[df_sample['category'].isin(valid_categories)]

# Renamdf_input_samplee column
df_sample = df_sample.rename(columns={'title': 'text'})

In [13]:
df_sample

Unnamed: 0,text,category
19555084,Nissan Versa,CLASSIC_CARS
831796,Llave Contacto Yamaha Fz 16 + Tapa Tanque Mpr,MOTORCYCLE_IGNITION_SWITCHES
6623530,Switch Ubiquiti Unifi 8 - Puertos Giga Admini...,NETWORK_SWITCHES
8310326,Moto G5 Libre Huella Zona Oeste,CELLPHONES
14784082,Bateria Heliar 50 Jd *** New Civic- Crv***bate...,AUTOMOTIVE_BATTERIES
...,...,...
11931008,Cable Tipo Taller Tpr 5x6 Normalizado - X 800 ...,ELECTRICAL_CABLES
6453562,"Somente Rede Creme (1,20 X 1,25)",PROTECTION_NETS
10253712,Funda 70-50,CLEANING_CLOTHS
18456754,4 Planchas De Pelo Babyliss,HAIR_STRAIGHTENERS


### Create embeddings

What to use: text or [text]? Both actually work

- input=text → returns 1 embedding.
- input=[text] → also returns 1 embedding (inside a list).

##### TODO: clean up raw data

In [8]:
def get_embedding(text, model="text-embedding-3-small"):
    # TODO: pre-processing text text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding

In [14]:
df_sample['embedding_small'] = df_sample['text'].apply(lambda x: get_embedding(x, model='text-embedding-3-small'))

In [16]:
df_sample = df_sample.reset_index(drop=True)
df_sample = df_sample.drop('text', axis = 1)

In [17]:
len(df_sample['embedding_small'][0])

1536

In [21]:
pd.set_option('display.max_colwidth', None)
df_sample.head(1)

Unnamed: 0,category,embedding_small
0,CLASSIC_CARS,"[-0.013427658006548882, -0.01984761655330658, -0.0205822940915823, 0.010505898855626583, -0.014139731414616108, -0.02068401873111725, -0.011833971366286278, 0.038836222141981125, 0.017654884606599808, -0.0209891926497221, -0.011370559222996235, 0.03198676183819771, -0.010794118978083134, 0.015823839232325554, 0.04295042157173157, -0.0027183096390217543, -0.007748030126094818, -0.003913574852049351, -0.021373486146330833, -0.009189129807054996, 0.06460648030042648, 0.04656729847192764, 0.06510379910469055, -0.034835051745176315, 0.018988607451319695, -0.014456207863986492, -0.031466834247112274, 0.03038177266716957, -0.02495645545423031, -0.03983086347579956, 0.02250375971198082, -0.029658395797014236, 0.04873742535710335, -0.021927321329712868, -0.00959037709981203, 0.01792614907026291, 0.056604135781526566, -0.042385283857584, -0.009935110807418823, -0.027126582339406013, -0.001426971284672618, 0.06817814707756042, 0.046476878225803375, -0.01251213625073433, 0.0567849799990654, 0.013970189727842808, -0.006838159170001745, 0.01291903480887413, 0.008742671459913254, 0.015462151728570461, -0.058683838695287704, -0.034563787281513214, 0.017214076593518257, 0.03885883092880249, -0.04973206669092178, 0.005589206237345934, 0.01867213100194931, -0.0002647314395289868, 0.046205613762140274, -0.01654721610248089, -0.01326941978186369, -0.06334056705236435, -0.02590588666498661, 0.11501670628786087, 0.024481739848852158, -0.009420836344361305, 0.04432935640215874, 0.0427921824157238, -0.013178997673094273, -0.02445913478732109, -0.018581708893179893, 0.01271558552980423, 0.004413721151649952, -0.0006926461937837303, 0.015066555701196194, 0.010155513882637024, -0.006318232975900173, -0.03962741419672966, 0.01618552766740322, 0.00549313286319375, -0.05018417909741402, 0.00658384757116437, -0.028505517169833183, 0.0031788963824510574, -0.06727392226457596, -0.04161669686436653, -0.019870221614837646, -0.014456207863986492, -0.004408069420605898, 0.008234048262238503, -0.029681002721190453, -0.017270591109991074, -0.06062791123986244, -0.004758454859256744, 0.02032233215868473, 0.04084811359643936, 0.0012941639870405197, -0.05497654154896736, 0.017417525872588158, -0.003582969307899475, ...]"


In [22]:
pd.reset_option('display.max_colwidth', None)

##### TODO: check embeddings len when saving file

In [25]:
# Save
df_sample.to_parquet("embeddings/df_sample_with_embeddings.parquet", index=False)

In [None]:
# Load
df_embeddings = pd.read_parquet("embeddings/df_sample_with_embeddings.parquet")

In [None]:
# Check if embeddings are equal
all_close = np.allclose(
    # vstack: takes a list (or iterable) of arrays/vectors and stacks them row by row into a 2D array.
    np.vstack(df_sample["embedding_small"]),
    np.vstack(df_embeddings["embedding_small"]),
    rtol=1e-6
)

print("Embeddings match:", all_close)

Embeddings match: True


### Split with stratification

In [39]:
X_train, X_valid, y_train, y_valid = train_test_split(df_sample['embedding_small'], df_sample['category'], 
                                                      test_size=0.3, random_state=42, stratify=df_sample['category'])

In [40]:
len(X_train), len(X_valid), len(y_train), len(y_valid)

(172, 75, 172, 75)

In [41]:
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((172,), (172,), (75,), (75,))

In [42]:
X_train.head()

217    [0.03802502155303955, 0.05629558116197586, -0....
127    [0.013571598567068577, -0.009913485497236252, ...
197    [0.04497840255498886, -0.006360092666000128, 0...
139    [0.04696916788816452, 0.024686014279723167, -0...
211    [0.03358994051814079, 0.011265762150287628, -0...
Name: embedding_small, dtype: object

scikit-learn
fit(X, y, sample_weight=None)

X_train is a serie of list, but RandomForestClassifier needs 2D array
X: {array-like, sparse matrix} of shape (`n_samples`, `n_features`)

In [45]:
# convert serie to list
import numpy as np
X_train = np.array(X_train.tolist())
X_valid  = np.array(X_valid.tolist())

X_train.shape, X_valid.shape

((172, 1536), (75, 1536))

### Model and Train: RandomForestClassifier

In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train, y_train)
pred_rfc = clf.predict(X_valid)

#### KNeighborsClassifier

Data preparation for Knn:

- Rescale Data : KNN performs better when data is on the same scale. Normalizing to [0, 1] or standardizing (for Gaussian distributions) improves accuracy.

- Reduce Dimensionality : KNN works best with fewer features. In high-dimensional data, feature selection can improve performance by reducing irrelevant variables. [link](https://medium.com/@RobuRishabh/knn-k-nearest-neighbour-5ae18ae8e274)

In [74]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.fit_transform(X_valid)

In [114]:
from sklearn.decomposition import PCA

pca = PCA(n_components=50)
X_train_reduced = pca.fit_transform(X_train)
X_valid_reduced = pca.transform(X_valid)

In [115]:
X_train.shape, X_valid.shape

((172, 1536), (75, 1536))

In [92]:
X_train_reduced.shape, X_valid_reduced.shape

((172, 70), (75, 70))

In [77]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=5)
knn_clf.fit(X_train, y_train)
pred_knn = knn_clf.predict(X_valid)

In [80]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf_scaled = KNeighborsClassifier(n_neighbors=5)
knn_clf_scaled.fit(X_train_scaled, y_train)
pred_knn_scaled = knn_clf_scaled.predict(X_valid_scaled)

In [116]:
knn_clf_reduced = KNeighborsClassifier(n_neighbors=5)
knn_clf_reduced.fit(X_train_reduced,y_train)
pred_knn_reduced = knn_clf_reduced.predict(X_valid_reduced)

In [117]:
print(f'Random Forest acc: {accuracy_score(y_valid, pred_rfc)}')
print(f'KNN acc: {accuracy_score(y_valid,pred_knn)}')
print(f'KNNScaled acc: {accuracy_score(y_valid,pred_knn_scaled)}')
print(f'KNNReduced acc {accuracy_score(y_valid, pred_knn_reduced)}')
# print(f'Ensamble acc: {accuracy_score(y_test,ensamble_pred_2)}')

Random Forest acc: 0.41333333333333333
KNN acc: 0.5066666666666667
KNNScaled acc: 0.5333333333333333
KNNReduced acc 0.5066666666666667
