In [4]:
from google.colab import drive

# Mount Google Drive at /content/drive so the dataset files stored in Drive
# can be opened through ordinary file paths in the cells below.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd

# Load the Watches data dump from the mounted Drive into a DataFrame.
# Update the path to your dataset file in Google Drive.
# NOTE(review): the folder here is spelled 'NXT Hackathon Data ' (with spaces,
# including a trailing one), while a later cell reads from 'NXT_Hackathon_Data/'
# — confirm which folder actually exists in Drive.
data_link = '/content/drive/MyDrive/NXT Hackathon Data /Watches Data Dump.csv'
data = pd.read_csv(data_link)

In [6]:
import torch
# Print the PyTorch version installed in this runtime.
print(torch.__version__)


2.5.1+cu121


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from torchvision import models, transforms
from PIL import Image
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize
from sklearn.metrics import silhouette_score
import os
import json
import requests
from io import BytesIO
from google.colab import drive

# Attach Google Drive to the runtime
drive.mount('/content/drive')

# Input folder (CSV dumps) and output folder (ontologies + clustered data);
# the output folder is created if it does not exist yet.
data_folder = "/content/drive/MyDrive/NXT_Hackathon_Data/"
output_folder = "/content/drive/MyDrive/NXT_Hackathon_Output/"
os.makedirs(output_folder, exist_ok=True)

# CSV files to process, named relative to data_folder (processed in this order)
data_files = [
    "Bathroom Vanities Data Dump.csv",
    "Data Dump Kurtis.csv",
    "Dresses Data Dump.csv",
    "Earrings Data Dump.csv",
    "Jeans Data Dump.csv",
    "Saree Data_dump.csv",
    "shirts_data_dump.csv",
    "Sneakers Data Dump.csv",
    "Tshirts Data Dump.csv",
    "Watches Data Dump.csv",
]

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# --------------------------
# STEP 1: Load Pre-trained Models
# --------------------------
print("Loading pre-trained models...")

# Load pre-trained BERT model and tokenizer; the model is moved to `device`
# and switched to inference mode.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)
bert_model.eval()

# Load pre-trained ResNet-50. `pretrained=True` is deprecated in torchvision;
# IMAGENET1K_V1 is the weight set that flag used to select, so the embeddings
# stay the same as before.
resnet_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
# Drop the last child module (the classification layer) so the network returns
# pooled feature maps, then move the truncated model to `device` and put the
# whole wrapper (not just its children) into inference mode.
resnet_model = torch.nn.Sequential(*(list(resnet_model.children())[:-1])).to(device)
resnet_model.eval()

# Image preprocessing transformations: resize to 224x224, convert to a tensor,
# and normalize each channel with the given mean / std.
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# --------------------------
# STEP 2: Helper Functions
# --------------------------
def get_bert_embedding(text):
    """Return the BERT first-token ([CLS]) vector for ``text`` as a NumPy array.

    The text is tokenized (truncated to at most 128 tokens), passed through
    ``bert_model`` without gradient tracking, and the last-layer hidden state
    at position 0 is moved to the CPU and returned.
    """
    encoded = bert_tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    ).to(device)

    with torch.no_grad():
        last_hidden = bert_model(**encoded).last_hidden_state

    # Position 0 along the sequence axis; squeeze(0) drops the batch axis.
    first_token = last_hidden[:, 0, :]
    return first_token.squeeze(0).cpu().numpy()

def get_image_embedding(image_url):
    """Generate a ResNet embedding for the image at ``image_url``.

    Args:
        image_url: URL of the image to download. Empty, blank or non-string
            values (e.g. missing cells filled with "") are treated as
            "no image" and skipped without a network request.

    Returns:
        A 1-D NumPy array of length 2048. A zero vector is returned when there
        is no URL or when downloading / decoding / embedding the image fails
        (the failure is printed, not raised, so one bad row cannot abort a run).
    """
    # Missing image: nothing to download, fall back to the zero vector directly.
    if not isinstance(image_url, str) or not image_url.strip():
        return np.zeros(2048)

    try:
        response = requests.get(image_url, timeout=5)
        # Surface HTTP errors (404, 500, ...) explicitly instead of handing an
        # error page to PIL and getting a misleading "cannot identify image".
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert('RGB')
        image = image_transform(image).unsqueeze(0).to(device)
        with torch.no_grad():
            embedding = resnet_model(image).squeeze().cpu().numpy()
        return embedding.flatten()
    except Exception as e:
        # Deliberate best-effort: report the problem and keep processing.
        print(f"Error processing image {image_url}: {e}")
        return np.zeros(2048)

def find_optimal_clusters(features, max_clusters=10):
    """Determine the optimal number of clusters using the Silhouette Score.

    Average-linkage agglomerative clustering with cosine distance is fitted
    for every candidate cluster count, and the count with the highest cosine
    silhouette score wins.

    Args:
        features: Array-like of shape (n_samples, n_features).
        max_clusters: Largest cluster count to try (inclusive).

    Returns:
        The best-scoring number of clusters. If no candidate can be evaluated
        (fewer than 3 samples, or ``max_clusters`` below 2), the minimum of 2
        is returned.
    """
    # The silhouette score is only defined for 2 <= n_clusters <= n_samples - 1,
    # so cap the search range instead of crashing on small datasets.
    upper_bound = min(max_clusters, len(features) - 1)

    best_score = -1
    best_num_clusters = 2  # Minimum clusters to test
    for n_clusters in range(2, upper_bound + 1):
        # `metric=` replaces the `affinity=` keyword, which was deprecated in
        # scikit-learn 1.2 and removed in 1.4.
        clustering_model = AgglomerativeClustering(n_clusters=n_clusters, metric='cosine', linkage='average')
        cluster_labels = clustering_model.fit_predict(features)
        score = silhouette_score(features, cluster_labels, metric='cosine')
        if score > best_score:
            best_score = score
            best_num_clusters = n_clusters
    return best_num_clusters

# --------------------------
# STEP 3: Process Each File
# --------------------------
all_data = []  # Clustered DataFrames, one per CSV, concatenated at the end

for file in data_files:
    print(f"Processing {file}...")

    # Load dataset
    file_path = os.path.join(data_folder, file)
    data = pd.read_csv(file_path)

    # Fill missing values so every row can still be embedded
    data['description'] = data['description'].fillna("")
    data['feature_image'] = data['feature_image'].fillna("")

    # Extract text features (one BERT vector per description, L2-normalized)
    print("Extracting textual features...")
    text_features = np.array([get_bert_embedding(desc) for desc in data['description']])
    text_features = normalize(text_features)

    # Extract image features (one ResNet vector per image URL, L2-normalized)
    print("Extracting image features...")
    image_features = np.array([get_image_embedding(img_url) for img_url in data['feature_image']])
    image_features = normalize(image_features)

    # Combine text and image features side by side into one vector per row
    print("Combining features...")
    combined_features = np.hstack((text_features, image_features))

    # Determine the optimal number of clusters
    print("Determining the optimal number of clusters...")
    optimal_clusters = find_optimal_clusters(combined_features)
    print(f"Optimal number of clusters for {file}: {optimal_clusters}")

    # Perform clustering with the optimal number of clusters.
    # `metric=` replaces the `affinity=` keyword, which was deprecated in
    # scikit-learn 1.2 and removed in 1.4.
    print("Clustering features...")
    clustering_model = AgglomerativeClustering(n_clusters=optimal_clusters, metric='cosine', linkage='average')
    cluster_labels = clustering_model.fit_predict(combined_features)

    # Add cluster labels to the dataset
    data['cluster'] = cluster_labels

    # Append to all_data
    all_data.append(data)

    # Create an ontology for the file: per cluster, its size plus the distinct
    # categories / brands and up to 10 product names (when those columns exist).
    # Missing values are dropped so the JSON never contains a bare NaN token,
    # which strict JSON parsers reject.
    print("Creating ontology...")
    ontology = {}
    for cluster_id in range(optimal_clusters):
        cluster_data = data[data['cluster'] == cluster_id]
        ontology[f"Cluster {cluster_id}"] = {
            "size": len(cluster_data),
            "categories": cluster_data['category_name'].dropna().unique().tolist() if 'category_name' in cluster_data else [],
            "brands": cluster_data['brand'].dropna().unique().tolist() if 'brand' in cluster_data else [],
            "sample_products": cluster_data['product_name'].dropna().tolist()[:10] if 'product_name' in cluster_data else []
        }

    # Save ontology as JSON, named after the CSV file without its extension
    ontology_file = os.path.join(output_folder, f"ontology_{os.path.splitext(file)[0]}.json")
    with open(ontology_file, "w") as json_file:
        json.dump(ontology, json_file, indent=4)

    print(f"Ontology for {file} saved to: {ontology_file}")

# Combine all data and save to a single file
final_data = pd.concat(all_data, ignore_index=True)
final_output_file = os.path.join(output_folder, "combined_clustered_data.csv")
final_data.to_csv(final_output_file, index=False)

print(f"Combined clustered data saved to: {final_output_file}")


AttributeError: partially initialized module 'torch' has no attribute 'fx' (most likely due to a circular import)