# Product Text Classification with OpenAI Embeddings

Steps:

- Load and filter dataset

- Create embeddings with text-embedding-3-small.

- Split train/test (stratify by category).

- Train Random Forest classifier.

- Evaluate with accuracy & classification report.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

  from pandas.core import (


In [None]:
from openai import OpenAI
client = OpenAI(api_key="OPENAI_API_KEY")

### Import and filter dataset based on a minimum number of samples per class

In [3]:
df_input = pd.read_csv('train.csv')

In [None]:
# df_input = df_input[df_input['language'] == 'spanish']
n_sample = 1000
min_samples = 3

df_input_sample = df_input.sample(n = n_sample).drop(["language", "label_quality"], axis = 1)

unique_categories = df_input_sample['category'].value_counts()
list_filtered_categories = unique_categories[unique_categories > min_samples].index.to_list()
df_input_filtered = df_input_sample[df_input_sample["category"].isin(list_filtered_categories)]
df_input_filtered = df_input_filtered.rename(columns={'title': 'text'})

In [78]:
df_input_filtered

Unnamed: 0,text,category
11642328,Leshp Hdmi 2.0 1x2 Splitter Apoyo Completo Hd ...,COMPUTER_MONITORS
11555541,Robofish Nemo Coleccion Buscando A Dory Reyna ...,TOY_ROBOTS
9204094,Kit Cables De Bujias Chevrolet Corsa Spin Coba...,SPARK_PLUG_WIRESETS
7544631,Loro Sentido De La Mano Rc Drone Led Luces De ...,TOY_ROBOTS
14598291,Sandalias Plataforma Con Tachas Livianas,SANDALS_AND_FLIP_FLOPS
...,...,...
2122960,Cable Bujía Ferrazzi Competicion Citroen Bx 1....,SPARK_PLUG_WIRESETS
11993176,Mesa Rectangular De Marmol Y Bronce Doble Con ...,COFFEE_TABLES
12978432,Bandeja Extersa 105 Rec,SERVING_AND_HOME_TRAYS
4168756,Faro Patente Gacel G2 88/95,CAR_LIGHT_BULBS


### Create embeddings

What to use: text or [text]? Both actually work

- input=text → returns 1 embedding.
- input=[text] → also returns 1 embedding (inside a list).

##### TODO: clean up raw data

In [79]:
def get_embedding(text, model="text-embedding-3-small"):
    # TODO: pre-processing text text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding

In [80]:
df_input_filtered['embedding_small'] = df_input_filtered['text'].apply(lambda x: get_embedding(x, model='text-embedding-3-small'))

In [None]:
df_input_filtered = df_input_filtered.reset_index(drop=True)
df_input_filtered = df_input_filtered.drop('text', axis = 1)

In [82]:
len(df_input_filtered['embedding_small'][0])

1536

In [83]:
pd.set_option('display.max_colwidth', None)
df_input_filtered.iloc[0]

text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

##### TODO: check embeddings len when saving file

In [106]:
#TODO check, the len of embeddings increases
df_input_filtered.to_csv(f'embeddings/embedded_{n_sample}_products.csv', index=False)

In [86]:
df_embeddings = pd.read_csv('embeddings/embedded_10_products.csv')

In [88]:
df_input_filtered['embedding_small'].map(len), df_embeddings['embedding_small'].map(len)

(0      1536
 1      1536
 2      1536
 3      1536
 4      1536
        ... 
 102    1536
 103    1536
 104    1536
 105    1536
 106    1536
 Name: embedding_small, Length: 107, dtype: int64,
 0     34172
 1     34221
 2     34257
 3     34204
 4     34178
       ...  
 92    34231
 93    34252
 94    34191
 95    34168
 96    34237
 Name: embedding_small, Length: 97, dtype: int64)

### Split with stratification

In [90]:
X_train, X_test, y_train, y_test = train_test_split(df_input_filtered['embedding_small'], df_input_filtered['category'], 
                                                      test_size=0.3, random_state=42, stratify=df_input_filtered['category'])

In [95]:
len(X_train) , len(X_test), X_train.shape, y_train.shape

(74, 33, (74,), (74,))

In [92]:
X_train.head()

37    [0.021981798112392426, -0.04122736677527428, -...
64    [0.01887599192559719, -0.0016054322477430105, ...
47    [0.00400319742038846, 0.021482350304722786, 0....
31    [0.028445715084671974, -0.030287211760878563, ...
25    [0.050353433936834335, -0.049755536019802094, ...
Name: embedding_small, dtype: object

scikit-learn
fit(X, y, sample_weight=None)

X_train is a serie of list, but RandomForestClassifier needs 2D array
X: {array-like, sparse matrix} of shape (`n_samples`, `n_features`)

In [None]:
# convert serie to list
import numpy as np
X_train = np.array(X_train.tolist())
X_test  = np.array(X_test.tolist())

X_train.shape, X_test.shape

((74, 1536), (33, 1536))

### Model and Train: RandomForestClassifier

In [100]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

clf = RandomForestClassifier(n_estimators=200)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

In [101]:
from sklearn.metrics import classification_report

report = classification_report(y_test,preds)
print(report)

                              precision    recall  f1-score   support

          AUTOMOTIVE_EMBLEMS       1.00      1.00      1.00         1
AUTOMOTIVE_SIDE_VIEW_MIRRORS       1.00      1.00      1.00         1
            BATHROOM_FAUCETS       1.00      1.00      1.00         1
             CAR_LIGHT_BULBS       0.00      0.00      0.00         1
               COFFEE_TABLES       1.00      1.00      1.00         1
           COMPUTER_MONITORS       1.00      1.00      1.00         1
                  DEODORANTS       1.00      0.50      0.67         2
                FLOOD_LIGHTS       0.67      1.00      0.80         2
                    HANDBAGS       0.33      1.00      0.50         2
                KITCHEN_POTS       1.00      1.00      1.00         1
             LAPTOP_CHARGERS       1.00      1.00      1.00         1
                  LED_STRIPS       1.00      1.00      1.00         1
          MOTORCYCLE_HELMETS       1.00      1.00      1.00         1
   MOTORCYCLE_IGNIT

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
