# Product Text Classification with OpenAI Embeddings

Steps:

- Load the dataset and keep only categories with enough samples.

- Create embeddings with text-embedding-3-small.

- Split train/test (stratify by category).

- Train Random Forest classifier.

- Evaluate with accuracy & classification report.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
import os

from openai import OpenAI

# Read the key from the environment rather than hard-coding it. The previous
# code passed the literal text "OPENAI_API_KEY" as the credential itself.
# A missing variable fails right here with a KeyError instead of on the first
# API request.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

### Import and filter dataset based on a minimum number of samples per class

In [None]:
# Load the raw product data. Later cells read its 'language', 'label_quality',
# 'category' and 'title' columns.
df_input = pd.read_csv('train.csv')

In [None]:
# Optional language filter, left disabled:
# df_input = df_input[df_input['language'] == 'spanish']
n_sample = 1000   # number of rows to draw from the full dataset
min_samples = 3   # minimum number of rows a category needs to be kept

# Draw a reproducible random subset (seeded like the train/test split) and drop
# the columns that are not used afterwards. The sample size is capped at the
# dataset size because DataFrame.sample raises when asked for more rows than
# exist (sampling is without replacement).
df_input_sample = (
    df_input.sample(n=min(n_sample, len(df_input)), random_state=42)
    .drop(columns=["language", "label_quality"])
)

# Count rows per category and keep the categories that reach the minimum.
# `>=` makes min_samples an inclusive minimum: a category with exactly
# min_samples rows is kept.
unique_categories = df_input_sample['category'].value_counts()
list_filtered_categories = unique_categories[unique_categories >= min_samples].index.to_list()

keep_row = df_input_sample["category"].isin(list_filtered_categories)
df_input_filtered = df_input_sample[keep_row].rename(columns={'title': 'text'})

In [None]:
# Display the filtered sample for a quick visual check.
df_input_filtered

### Create embeddings

Which should be passed as `input`: `text` or `[text]`? Both work:

- input=text → returns 1 embedding.
- input=[text] → also returns 1 embedding (inside a list).

##### TODO: clean up raw data

In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    # Pre-processing that the original TODO asked for: collapse newlines into
    # spaces so the text is embedded as a single line.
    text = text.replace("\n", " ")
    # The input is sent as a one-element list, so the single result sits at
    # index 0 of the response data.
    return client.embeddings.create(input=[text], model=model).data[0].embedding

In [None]:
# Embed every product text with the small model; get_embedding is called once
# per row, so this issues one API request per product.
df_input_filtered['embedding_small'] = df_input_filtered['text'].map(
    lambda title: get_embedding(title, model='text-embedding-3-small')
)

In [None]:
# Renumber the rows from 0 (discarding the sampled index) and remove the raw
# text column, which is no longer needed once the embeddings exist.
df_input_filtered = df_input_filtered.reset_index(drop=True).drop(columns='text')

In [None]:
# Number of values in the embedding stored in the row labelled 0.
len(df_input_filtered.loc[0, 'embedding_small'])

In [None]:
# Lift the column-width limit so long cell values are printed in full, then
# show the first row.
pd.options.display.max_colwidth = None
df_input_filtered.iloc[0]

##### TODO: check embeddings len when saving file

In [None]:
from pathlib import Path

# NOTE on the "len of embeddings increases" observation: to_csv stores each
# embedding list as its text representation. When the file is read back the
# column holds strings, so len() counts characters rather than vector
# components until the column is parsed back into lists.
output_path = Path('embeddings') / f'embedded_{n_sample}_products.csv'
# Create the target folder first; to_csv raises when the directory is missing.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_input_filtered.to_csv(output_path, index=False)

In [None]:
import ast

# Reload the file written by the save cell (same n_sample-based name, instead
# of a hard-coded "10") and turn the embedding column back into Python lists.
# Without the converter each cell is the list's text form, so its len() is a
# character count rather than the embedding dimension.
df_embeddings = pd.read_csv(
    f'embeddings/embedded_{n_sample}_products.csv',
    converters={'embedding_small': ast.literal_eval},
)

In [None]:
# Per-row length of the embedding column, in memory versus reloaded from CSV.
in_memory_lengths = df_input_filtered['embedding_small'].map(len)
reloaded_lengths = df_embeddings['embedding_small'].map(len)
in_memory_lengths, reloaded_lengths

### Split with stratification

In [None]:
# Hold out 30% of the rows for testing. Stratifying on the category column
# keeps the class proportions the same in both splits; the seed makes the
# split repeatable.
features = df_input_filtered['embedding_small']
labels = df_input_filtered['category']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

In [None]:
# Sanity check: row counts of both splits, then the shapes of the training
# features and labels.
split_sizes = (len(X_train), len(X_test))
train_shapes = (X_train.shape, y_train.shape)
(*split_sizes, *train_shapes)

In [None]:
# Peek at the first training rows; at this point each entry is still the
# embedding stored per row, not a 2D array.
X_train.head()

scikit-learn
fit(X, y, sample_weight=None)

`X_train` is a pandas Series of lists, but `RandomForestClassifier.fit` needs a 2D array:
X: {array-like, sparse matrix} of shape (`n_samples`, `n_features`)

In [None]:
# Turn each Series of embedding lists into a single NumPy array, the 2D
# (n_samples, n_features) layout that scikit-learn estimators expect.
import numpy as np

X_train = np.asarray(X_train.to_list())
X_test = np.asarray(X_test.to_list())

# Confirm both arrays are 2D with matching feature counts.
X_train.shape, X_test.shape

### Model and Train: RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Seed the forest (same seed as the train/test split) so repeated runs on the
# same data give the same model and the same predictions.
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(X_train, y_train)

# Predicted category for every held-out embedding.
preds = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy: the evaluation step listed in the introduction, which was
# imported but never computed.
accuracy = accuracy_score(y_test, preds)
print(f"Accuracy: {accuracy:.4f}")

# Per-category precision, recall and F1.
report = classification_report(y_test, preds)
print(report)