<a href="https://colab.research.google.com/github/montatriki/ICCME2024/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
ls

[0m[01;34mdrive[0m/  faiss_index.index  model.pt  products.json  [01;34msample_data[0m/


In [None]:
# Cell 1: Install Dependencies
!pip install -q transformers faiss-cpu faiss-gpu torch
!pip install -q sentence-transformers nltk unidecode

In [None]:
import torch
import json
import numpy as np
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import faiss
from tqdm import tqdm
import re
from typing import List, Dict
import time

In [None]:
# Cell 3: Configuration
class Config:
    """Settings shared by the training loop and the search engine."""

    # --- encoder / tokenizer ---
    # Hugging Face checkpoint name used for both the model and its tokenizer.
    MODEL_NAME = 'dbmdz/bert-base-turkish-cased'
    # Maximum number of tokens per text (used for padding and truncation).
    MAX_LENGTH = 128
    # Width of the projected embedding and of the FAISS index.
    EMBEDDING_DIM = 768

    # --- optimisation ---
    BATCH_SIZE = 32  # Reduced batch size for stability
    EPOCHS = 5
    LEARNING_RATE = 2e-5

    # --- runtime ---
    # Prefix of the checkpoint files written during training.
    SAVE_PATH = '/content/drive/MyDrive/product_search_model'
    # Use the GPU when one is visible, otherwise fall back to the CPU.
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


config = Config()
print(f"Using device: {config.DEVICE}")


Using device: cuda


In [None]:
# Cell 4: Triplet Dataset Class
class ProductDataset(Dataset):
    """Yields (anchor, positive, negative) text triplets, one per product.

    For product ``idx`` the positive is a random *other* product of the same
    category and the negative is a random product of a different category.
    When no such product exists (single-item category, or a catalogue with a
    single category) the anchor itself is used as the fallback.

    Args:
        products: sequence of dicts; this class reads the keys ``'id'``,
            ``'brand'``, ``'master_data_name'`` and ``'product_category'``.
        tokenizer: callable invoked as ``tokenizer(list_of_3_texts,
            padding='max_length', truncation=True, max_length=...,
            return_tensors='pt')``; its result must expose ``'input_ids'``
            and ``'attention_mask'``.
        max_length: padding / truncation length passed to the tokenizer.
    """

    def __init__(self, products, tokenizer, max_length=128):
        self.products = products
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.category_to_products = self._group_by_category()

    def _group_by_category(self):
        """Return a dict mapping each category to the list of its products."""
        category_dict = {}
        for product in self.products:
            category_dict.setdefault(product['product_category'], []).append(product)
        return category_dict

    def _normalize_text(self, text):
        """Lower-case, replace everything but letters/digits/whitespace by a
        space, and collapse runs of whitespace.

        NOTE(review): ``str.lower()`` is not Turkish-aware ('I' becomes 'i',
        and 'İ' becomes 'i' plus a combining dot that the regex then turns
        into a space). Left unchanged so that it stays identical to
        ``ProductSearchEngine._normalize_text`` -- confirm whether
        locale-aware casing is wanted and change both together.
        """
        text = text.lower().strip()
        text = re.sub(r'[^a-zğüşıöçA-ZĞÜŞİÖÇ0-9\s]', ' ', text)
        return ' '.join(text.split())

    @staticmethod
    def _pick(items):
        """Return one uniformly random element of the non-empty list ``items``.

        Draws an index from torch's global RNG instead of calling
        ``np.random.choice`` on the list itself: no per-call conversion of a
        list of dicts into an array, and torch's generator is the one the
        DataLoader documents as seeded per worker.
        """
        return items[int(torch.randint(len(items), (1,)).item())]

    def __len__(self):
        return len(self.products)

    def __getitem__(self, idx):
        """Build the triplet for product ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (tokenizer
            output for the three texts in the order anchor, positive,
            negative) and ``'product_id'`` (the anchor's ``'id'``).
        """
        product = self.products[idx]
        category = product['product_category']

        # Positive: another product from the same category, if there is one.
        same_category_products = [
            p for p in self.category_to_products[category] if p['id'] != product['id']
        ]
        if same_category_products:
            positive_product = self._pick(same_category_products)
        else:
            positive_product = product

        # Negative: a product from any other category.
        other_categories = [c for c in self.category_to_products if c != category]
        if other_categories:
            neg_category = self._pick(other_categories)
            negative_product = self._pick(self.category_to_products[neg_category])
        else:
            # Single-category catalogue: previously neg_category was left
            # unbound here and building negative_text raised NameError.
            neg_category = category
            negative_product = product

        # Create texts
        anchor_text = f"{product['brand']} {product['master_data_name']} {category}"
        positive_text = f"{positive_product['brand']} {positive_product['master_data_name']} {category}"
        negative_text = f"{negative_product['brand']} {negative_product['master_data_name']} {neg_category}"

        # Tokenize the three texts together so they share one padded tensor.
        encoding = self.tokenizer(
            [self._normalize_text(anchor_text),
             self._normalize_text(positive_text),
             self._normalize_text(negative_text)],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'product_id': product['id']
        }

In [None]:
# Cell 5: Model Definition
class ProductSearchModel(nn.Module):
    """Pretrained encoder plus a linear projection that returns unit-length
    embeddings.

    Args:
        bert_model_name: name or path passed to ``AutoModel.from_pretrained``.
        embedding_dim: output width of the projection layer.
    """

    def __init__(self, bert_model_name: str, embedding_dim: int = 768):
        super().__init__()
        self.bert = AutoModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # Take the projection's input width from the encoder instead of
        # assuming 768, so checkpoints with another hidden size also work.
        # 768 remains the fallback if the config does not expose hidden_size.
        hidden_size = getattr(self.bert.config, 'hidden_size', 768)
        self.product_embedding = nn.Linear(hidden_size, embedding_dim)

    def forward(self, input_ids, attention_mask):
        """Encode a batch and return L2-normalised embeddings.

        The representation is the encoder's last hidden state at sequence
        position 0, passed through dropout and the projection layer, then
        normalised along dim 1.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        embeddings = self.product_embedding(pooled_output)
        return torch.nn.functional.normalize(embeddings, p=2, dim=1)


In [None]:
# Cell 6: Training Function
def train_model(model, dataset, config):
    """Fine-tune ``model`` on triplets from ``dataset`` with a triplet-margin loss.

    Args:
        model: module called as ``model(input_ids, attention_mask)`` and
            returning one embedding per input row.
        dataset: dataset whose items hold ``'input_ids'`` and
            ``'attention_mask'`` with the three texts of a triplet stacked in
            the order anchor, positive, negative.
        config: object providing ``DEVICE``, ``BATCH_SIZE``, ``EPOCHS``,
            ``LEARNING_RATE`` and ``SAVE_PATH``.

    Returns:
        list[float]: the average loss of each epoch. (Earlier versions
        returned ``None``; callers that ignore the result are unaffected.)

    Side effects:
        Trains ``model`` in place and writes
        ``f'{config.SAVE_PATH}_epoch_{n}.pt'`` after every fifth epoch and
        after the last epoch. The parent directory of ``SAVE_PATH`` must
        already exist.
    """
    model = model.to(config.DEVICE)
    model.train()

    dataloader = DataLoader(
        dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=2,
        # Pinned host memory only helps host-to-GPU copies; requesting it on a
        # CPU-only run has no benefit.
        pin_memory=torch.device(config.DEVICE).type == 'cuda'
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=config.LEARNING_RATE)
    criterion = nn.TripletMarginLoss(margin=0.5)

    epoch_losses = []
    for epoch in range(config.EPOCHS):
        total_loss = 0
        progress_bar = tqdm(dataloader, desc=f'Epoch {epoch + 1}/{config.EPOCHS}')

        for batch in progress_bar:
            optimizer.zero_grad()

            # Move batch to device
            input_ids = batch['input_ids'].to(config.DEVICE)
            attention_mask = batch['attention_mask'].to(config.DEVICE)

            # Flatten the per-sample triplets into rows so that the encoder
            # sees anchor, positive, negative, anchor, positive, negative, ...
            all_embeddings = model(input_ids.view(-1, input_ids.size(-1)),
                                   attention_mask.view(-1, attention_mask.size(-1)))

            # Regroup the rows three at a time and split by role.
            triplets = all_embeddings.reshape(-1, 3, all_embeddings.size(-1))
            anchor, positive, negative = triplets.unbind(dim=1)

            # Calculate loss
            loss = criterion(anchor, positive, negative)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            progress_bar.set_postfix({'loss': batch_loss})

        # max(..., 1) avoids a ZeroDivisionError when the loader has no batches.
        avg_loss = total_loss / max(len(dataloader), 1)
        epoch_losses.append(avg_loss)
        print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')

        # Save checkpoint every 5 epochs and always after the last epoch, so
        # the final weights are kept even when EPOCHS is not a multiple of 5.
        is_last_epoch = (epoch + 1) == config.EPOCHS
        if (epoch + 1) % 5 == 0 or is_last_epoch:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
            }, f'{config.SAVE_PATH}_epoch_{epoch+1}.pt')

    return epoch_losses


In [None]:
# Cell 7: Initialize and Train
# Read the product catalogue (a JSON document) from the working directory.
print("Loading data...")
with open('products.json', 'r', encoding='utf-8') as products_file:
    products = json.loads(products_file.read())

# Build the encoder, its tokenizer and the triplet dataset from the shared config.
print("Initializing model and tokenizer...")
model = ProductSearchModel(
    bert_model_name=config.MODEL_NAME,
    embedding_dim=config.EMBEDDING_DIM,
)
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
dataset = ProductDataset(
    products=products,
    tokenizer=tokenizer,
    max_length=config.MAX_LENGTH,
)

# Fine-tune; train_model moves the model to config.DEVICE itself.
print("Starting training...")
train_model(model=model, dataset=dataset, config=config)

Loading data...
Initializing model and tokenizer...
Starting training...


Epoch 1/5: 100%|██████████| 237/237 [06:46<00:00,  1.72s/it, loss=0.0254]


Epoch 1, Average Loss: 0.0511


Epoch 2/5: 100%|██████████| 237/237 [06:57<00:00,  1.76s/it, loss=0]


Epoch 2, Average Loss: 0.0073


Epoch 3/5: 100%|██████████| 237/237 [06:56<00:00,  1.76s/it, loss=0]


Epoch 3, Average Loss: 0.0035


Epoch 4/5: 100%|██████████| 237/237 [06:55<00:00,  1.76s/it, loss=0]


Epoch 4, Average Loss: 0.0031


Epoch 5/5: 100%|██████████| 237/237 [06:55<00:00,  1.75s/it, loss=0]


Epoch 5, Average Loss: 0.0027


In [None]:
# Cell 8: Fixed Search Engine Implementation
class ProductSearchEngine:
    """Text search over a product catalogue using a FAISS inner-product index.

    Each product is embedded from the text "brand master_data_name
    product_category"; a query is embedded the same way and matched by inner
    product (which is cosine similarity when the model returns unit-length
    vectors, as ``ProductSearchModel`` does).

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        tokenizer: tokenizer matching ``model``.
        config: object providing ``DEVICE``, ``EMBEDDING_DIM`` and
            ``MAX_LENGTH``.
        products: sequence of dicts with the keys ``'id'``, ``'brand'``,
            ``'master_data_name'`` and ``'product_category'``. May be empty.
        batch_size: number of products embedded per forward pass while
            building the index (default 64, the previously hard-coded value).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """

    def __init__(self, model, tokenizer, config, products, batch_size: int = 64):
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        self.model = model.to(config.DEVICE)
        self.model.eval()
        self.tokenizer = tokenizer
        self.config = config
        self.batch_size = batch_size

        # Initialize FAISS
        self.dimension = config.EMBEDDING_DIM
        self.index = faiss.IndexFlatIP(self.dimension)

        # Row i of the index corresponds to products[i].
        self.products_cache = dict(enumerate(products))
        self._build_index(products)

    def _normalize_text(self, text: str) -> str:
        """Lower-case, replace everything but letters/digits/whitespace by a
        space, and collapse whitespace (same rules as ``ProductDataset``)."""
        text = text.lower().strip()
        text = re.sub(r'[^a-zğüşıöçA-ZĞÜŞİÖÇ0-9\s]', ' ', text)
        return ' '.join(text.split())

    def _embed(self, texts):
        """Tokenize ``texts`` (a string or a list of strings) and return the
        model's embeddings as a C-contiguous float32 array, one row per text."""
        with torch.no_grad():
            encoded = self.tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=self.config.MAX_LENGTH,
                return_tensors='pt'
            )

            # Only input_ids and attention_mask are passed to the model.
            input_ids = encoded['input_ids'].to(self.config.DEVICE)
            attention_mask = encoded['attention_mask'].to(self.config.DEVICE)
            vectors = self.model(input_ids, attention_mask).cpu().numpy()

        # FAISS expects contiguous float32 input.
        return np.ascontiguousarray(vectors, dtype=np.float32)

    def _build_index(self, products):
        """Embed every product in batches and add the vectors to the index."""
        print("Building search index...")
        batch_size = getattr(self, 'batch_size', 64)
        embeddings = []

        for start in tqdm(range(0, len(products), batch_size)):
            batch = products[start:start + batch_size]
            texts = [
                self._normalize_text(
                    f"{p['brand']} {p['master_data_name']} {p['product_category']}"
                )
                for p in batch
            ]
            embeddings.append(self._embed(texts))

        # np.vstack raises on an empty list, so an empty catalogue simply
        # leaves the index empty.
        if embeddings:
            self.index.add(np.vstack(embeddings))
        print(f"Index built with {len(products)} products")

    def search(self, query: str, top_k: int = 5):
        """Return the ``top_k`` products most similar to ``query``.

        Returns:
            dict with ``'results'`` (list of dicts holding ``id``,
            ``master_data_name``, ``brand``, ``category`` and
            ``similarity_score``, best match first) and ``'metadata'``
            (``query_time_ms`` and the normalised ``processed_query``).
            Fewer than ``top_k`` results are returned when the index holds
            fewer products; ``top_k <= 0`` yields no results.
        """
        start_time = time.time()

        # Preprocess query
        query = self._normalize_text(query)

        results = []
        if top_k > 0:
            query_embedding = self._embed(query)
            distances, indices = self.index.search(query_embedding, top_k)

            for score, idx in zip(distances[0], indices[0]):
                # -1 marks an empty slot (index smaller than top_k).
                if idx == -1:
                    continue
                product = self.products_cache[int(idx)]
                results.append({
                    'id': product['id'],
                    'master_data_name': product['master_data_name'],
                    'brand': product['brand'],
                    'category': product['product_category'],
                    'similarity_score': float(score)
                })

        return {
            'results': results,
            'metadata': {
                'query_time_ms': round((time.time() - start_time) * 1000, 2),
                'processed_query': query
            }
        }

In [None]:
# Cell 9: Initialize and Test Search Engine
# Build the index over the full catalogue with the freshly trained model.
print("Initializing search engine...")
search_engine = ProductSearchEngine(model, tokenizer, config, products)

# Test queries
test_queries = [
    "iphone telefon arıyorum",
    "kablosuz kulaklik bluetooth",
    "samsung tv 4k",
    "tablet için şarj aleti"
]

print("\nTesting search functionality:")
for query in test_queries:
    print(f"\nQuery: {query}")
    results = search_engine.search(query)
    # Show the hits: previously they were computed and then discarded, so the
    # cell printed only the queries themselves.
    for result in results['results']:
        print(f"- {result['brand']} {result['master_data_name']} (Score: {result['similarity_score']:.3f})")

Initializing search engine...
Building search index...


100%|██████████| 119/119 [00:18<00:00,  6.38it/s]


Index built with 7579 products

Testing search functionality:

Query: iphone telefon arıyorum

Query: kablosuz kulaklik bluetooth

Query: samsung tv 4k

Query: tablet için şarj aleti


In [None]:
# Cell 11: Save Components
def save_all_components(model, search_engine, config,
                        save_dir='/content/drive/MyDrive/product_search'):
    """Write the model weights and the FAISS index under timestamped names.

    NOTE(review): this notebook called ``save_all_components`` without ever
    defining it, so the cell raised NameError on a fresh kernel. This
    definition is a reconstruction: directory, file-name pattern and printed
    messages follow the cell's recorded output, and the checkpoint layout
    mirrors the other save cells in this notebook. Confirm against the
    original helper if it is still available.

    Args:
        model: trained module whose ``state_dict()`` is saved.
        search_engine: object exposing the FAISS index as ``.index``.
        config: object providing ``MODEL_NAME``, ``EMBEDDING_DIM`` and
            ``MAX_LENGTH``.
        save_dir: target directory; created if missing.

    Returns:
        tuple[str, str]: ``(model_path, index_path)``.
    """
    # Local imports: neither module is part of the notebook's import cell.
    import os
    from datetime import datetime

    os.makedirs(save_dir, exist_ok=True)
    stamp = datetime.now().strftime('%Y%m%d_%H%M')
    model_path = os.path.join(save_dir, f'model_{stamp}.pt')
    index_path = os.path.join(save_dir, f'faiss_index_{stamp}.index')

    torch.save({
        'model_state_dict': model.state_dict(),
        'config': {
            'model_name': config.MODEL_NAME,
            'embedding_dim': config.EMBEDDING_DIM,
            'max_length': config.MAX_LENGTH
        }
    }, model_path)
    faiss.write_index(search_engine.index, index_path)

    print(f"Saved model to: {model_path}")
    print(f"Saved index to: {index_path}")
    return model_path, index_path


model_path, index_path = save_all_components(model, search_engine, config)

Saved model to: /content/drive/MyDrive/product_search/model_20241222_1457.pt
Saved index to: /content/drive/MyDrive/product_search/faiss_index_20241222_1457.index


In [None]:
# Cell 12: Test a few more complex queries
# Longer, more conversational queries to probe the engine.
complex_queries = [
    "evime yeni bir televizyon sistemi arıyorum ses kalitesi iyi olsun",
    "telefon sarj aleti type c",
    "gaming mouse rgb ışıklı",
    "bluetooth hoparlör taşınabilir",
]

print("\nTesting complex queries:")
for query in complex_queries:
    print(f"\nQuery: {query}")
    results = search_engine.search(query)
    # One line per hit: brand, product name and similarity to three decimals.
    for result in results['results']:
        label = f"{result['brand']} {result['master_data_name']}"
        score = result['similarity_score']
        print(f"- {label} (Score: {score:.3f})")



Testing complex queries:

Query: evime yeni bir televizyon sistemi arıyorum ses kalitesi iyi olsun
- VESTEL 50UA9631 50 inç 126 Ekran Uydu Alıcılı Smart 4K UHD Android LED TV Siyah Gri (Score: 0.729)
- VESTEL 55UA9631 55 inç 139 Ekran Dahili Uydu Alıcılı 4K Android TV (Score: 0.724)
- VESTEL 58UA9631 58 inç 146 Ekran Dahili Uydu Alıcılı 4K Android TV (Score: 0.722)
- GRUNDIG 75 GHU 8500 A 75 inç 189 Ekran Uydu Alıcılı Google Smart 4K Ultra HD LED TV Antrasit (Score: 0.718)
- VESTEL 70UA9630 70 inç 177 Ekran Uydu Alıcılı Smart 4K UHD Android TV Siyah (Score: 0.717)

Query: telefon sarj aleti type c
- APPLE USB-C VGA Çoklu Bağlantı Noktası Adaptörü MJ1L2ZM/A (Score: 0.844)
- HAMA HM.135727 Adaptör USB-C Fiş VGA Soket  Full HD (Score: 0.838)
- SANDISK iXpand Flash Drive 128GB Type A + Lightning Taşınabilir USB Bellek Gri (Score: 0.838)
- SANDISK Ultra Shift USB 3.0 64GB USB Bellek (Score: 0.835)
- SANDISK Extreme Go 256GB 3.2 USB Bellek (Score: 0.834)

Query: gaming mouse rgb ışıklı
- PH

In [None]:
# In Colab: save the trained weights to the working directory.
# The stored config now includes max_length as well, so this file has the same
# layout as the checkpoint written to Google Drive further down and both can
# be read by the same loading code.
torch.save({
    'model_state_dict': model.state_dict(),
    'config': {
        'model_name': config.MODEL_NAME,
        'embedding_dim': config.EMBEDDING_DIM,
        'max_length': config.MAX_LENGTH
    }
}, 'model.pt')

In [None]:
# Save FAISS index
# Writes the search engine's index to the working directory, next to model.pt.
import faiss
# Note: this rebinds index_path, which the "Cell 11" save cell also assigned.
index_path = 'faiss_index.index'
faiss.write_index(search_engine.index, index_path)
print(f"FAISS index saved to: {index_path}")

FAISS index saved to: faiss_index.index


In [None]:
# Copy every artefact needed for inference into one Google Drive folder.
import os
import shutil
import faiss

# Create directory in Google Drive
save_dir = '/content/drive/MyDrive/monta'
os.makedirs(save_dir, exist_ok=True)

# Model weights together with the settings needed to rebuild the model.
model_path = os.path.join(save_dir, 'model.pt')
checkpoint = {
    'model_state_dict': model.state_dict(),
    'config': {
        'model_name': config.MODEL_NAME,
        'embedding_dim': config.EMBEDDING_DIM,
        'max_length': config.MAX_LENGTH,
    },
}
torch.save(checkpoint, model_path)

# FAISS index as built by the search engine.
index_path = os.path.join(save_dir, 'faiss_index.index')
faiss.write_index(search_engine.index, index_path)

# The catalogue itself, so that index rows can be mapped back to products.
json_path = os.path.join(save_dir, 'products.json')
shutil.copy('products.json', json_path)

# Report what is now in the folder, with sizes in MB.
print("\nSaved files in:", save_dir)
bytes_per_mb = 1024 * 1024
for filename in os.listdir(save_dir):
    file_path = os.path.join(save_dir, filename)
    size_mb = os.path.getsize(file_path) / bytes_per_mb
    print(f"- {filename}: {size_mb:.2f} MB")


Saved files in: /content/drive/MyDrive/monta
- model.pt: 424.29 MB
- faiss_index.index: 22.20 MB
- products.json: 5.68 MB
