In [1]:
import numpy as np
from pymongo import MongoClient
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, DataCollatorWithPadding
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

2024-06-13 12:14:48.458243: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Connect to MongoDB and load data
client = MongoClient('mongodb+srv://mkhanani69:AM123$Pro@cubicvision.1hhgged.mongodb.net/?retryWrites=true&w=majority&appName=CubicVision')
db = client['horecastore']
collection = db['products']
data = list(collection.find())
df = pd.DataFrame(data)

# Check column names and inspect the first few rows
print(df.columns)
print(df.head())

Index(['_id', 'id', 'name', 'description', 'price_per_unit', 'quantity',
       'keywords'],
      dtype='object')
                        _id        id  \
0  6659a9e10db47e69c306e52f  AM-52030   
1  6659a9e10db47e69c306e534  AM-95006   
2  6659a9e10db47e69c306e530  AM-17741   
3  6659a9e10db47e69c306e532  AM-18411   
4  6659a9e10db47e69c306e533  AM-95012   

                                                name  \
0                                      Chesse Slicer   
1                Lacor Salad Spinner Manual 25 Liter   
2              Lacor Confectionary Funnel With Stand   
3  Paderno Abs Japanese Mandoline Slicer 15 x 35 ...   
4                Lacor Salad Spinner Manual 12 Liter   

                                         description price_per_unit quantity  \
0  Material: Stainless Steel, Dimension : 2.5 H x...            120        4   
1  Manufacturer: Lacor Spain ,Item Code: 61425, M...         653.06       10   
2  Manufacturer: Lacor Spain, Item Code: 67150, M...         

In [3]:
# Ensure 'name' and 'description' columns are present
if 'name' in df.columns and 'description' in df.columns:
    # Text cleaning
    df['description'] = df['description'].str.lower().str.replace('[^\w\s]', '', regex=True)

    # Tokenization
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    df['tokens'] = df['description'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))

    # Label encoding
    label_encoder = LabelEncoder()
    df['label_encoded'] = label_encoder.fit_transform(df['name'])
else:
    print("Required columns 'name' and 'description' are not present in the DataFrame.")

# Check data distribution
print(df['name'].value_counts())

Jiwins Plastic Ingredient Bin White with Lid    3
SCHNEIDER Ridged Baking Mould                   2
Lacor Lid 40cm                                  2
Lacor S/S Grid 650x530mm                        2
Lacor S/S Grid 530x325mm                        2
                                               ..
Lacor Conical Mixing Bowl 30cm                  1
Lacor Conical Mixing Bowl 20cm                  1
Pujadas 1/2 Ball Reinforced Colander 19cm       1
Lacor Conical Strainer With Fine Mesh 10cm      1
Prestige Stainless Steel Rice Cooker 1.8L       1
Name: name, Length: 375, dtype: int64


In [4]:
# Prepare data for training
class ProductDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        return {
            'input_ids': torch.tensor(item['tokens'], dtype=torch.long),
            'label': torch.tensor(item['label_encoded'], dtype=torch.long)
        }

dataset = ProductDataset(df)
# Custom collate function
def custom_collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    labels = [item['label'] for item in batch]
    
    # Use DataCollatorWithPadding to pad input_ids
    data_collator = DataCollatorWithPadding(tokenizer)
    batch = data_collator([{'input_ids': input_id} for input_id in input_ids])
    
    # Add labels back to the batch
    batch['labels'] = torch.tensor(labels, dtype=torch.long)
    
    return batch

dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=custom_collate_fn)

In [6]:
# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(df['label_encoded']), y=df['label_encoded'])
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

# Training loop
model.train()
for epoch in range(10):  # Number of epochs
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        labels = batch['labels']
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f'Epoch: {epoch}, Loss: {loss.item()}')

# Evaluation
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in dataloader:
        input_ids = batch['input_ids']
        labels = batch['labels']
        outputs = model(input_ids)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy}')

# Detailed classification report
print(classification_report(true_labels, predictions, target_names=label_encoder.classes_))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch: 0, Loss: 6.011236667633057
Epoch: 0, Loss: 5.995007514953613
Epoch: 0, Loss: 6.052751541137695
Epoch: 0, Loss: 6.018484592437744
Epoch: 0, Loss: 5.926253795623779
Epoch: 0, Loss: 5.914660453796387
Epoch: 0, Loss: 5.952935218811035
Epoch: 0, Loss: 5.836736679077148
Epoch: 0, Loss: 6.008844375610352
Epoch: 0, Loss: 5.9241790771484375
Epoch: 0, Loss: 6.016955852508545
Epoch: 0, Loss: 5.918779373168945
Epoch: 0, Loss: 5.993082046508789
Epoch: 0, Loss: 6.021120548248291
Epoch: 0, Loss: 5.9600605964660645
Epoch: 0, Loss: 6.0740156173706055
Epoch: 0, Loss: 5.78887939453125
Epoch: 0, Loss: 5.897466659545898
Epoch: 0, Loss: 5.94154167175293
Epoch: 0, Loss: 6.076204299926758
Epoch: 0, Loss: 6.116046905517578
Epoch: 0, Loss: 5.908572196960449
Epoch: 0, Loss: 5.912426471710205
Epoch: 0, Loss: 5.909814834594727
Epoch: 0, Loss: 6.196063995361328
Epoch: 1, Loss: 5.99150276184082
Epoch: 1, Loss: 5.89846658706665
Epoch: 1, Loss: 6.008439064025879
Epoch: 1, Loss: 6.048862934112549
Epoch: 1, Loss: