In [66]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import glob
import os

# Load Dataset
# df = pd.read_parquet('dataset/dataset_batch_40.parquet')
# print(f"Total samples: {len(df)}")
# print(df.head())

# path where your parquet files are stored
data_path = "datasets/"   # e.g. ./datasets/ or /Users/you/projects/sneakers/datasets/
files = glob.glob(os.path.join(data_path, "*.parquet"))

print(f"🗂️ Found {len(files)} parquet files")

# load all parquet files and concatenate into one big DataFrame
dfs = [pd.read_parquet(f) for f in files]
df = pd.concat(dfs, ignore_index=True)

print(f"✅ Combined {len(files)} files — total samples: {len(df)}")
print("Available columns:", list(df.columns))



🗂️ Found 92 parquet files
✅ Combined 92 files — total samples: 91081
Available columns: ['image', 'brand', 'model']


In [67]:
# load all parquet files and concatenate into one big DataFrame
dfs = [pd.read_parquet(f) for f in files]
df = pd.concat(dfs, ignore_index=True)

print(f"✅ Combined {len(files)} files — total samples: {len(df)}")
print("Available columns:", list(df.columns))

✅ Combined 92 files — total samples: 91081
Available columns: ['image', 'brand', 'model']


In [68]:
# check how many unique brands exist
print("\nTop 10 brands:")
print(df['brand'].value_counts().head(10))

# check a few rows
print(df.head(3))


Top 10 brands:
brand
Nike        29999
Adidas      20700
Jordan       7800
adidas       5600
Asics        4000
Vans         3685
New          3500
Converse     3101
Reebok       3100
Puma         2800
Name: count, dtype: int64
                                               image brand     model
0  b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...  Puma  RBD Game
1  b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...  Puma  RBD Game
2  b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00...  Puma  RBD Game


In [69]:
# drop rows with missing image bytes or brand labels
df = df.dropna(subset=['image', 'brand'])

# remove brands with too few samples (less than 5)
df = df.groupby('brand').filter(lambda x: len(x) > 5)

print(f"🧹 Cleaned dataset — remaining rows: {len(df)}, brands: {df['brand'].nunique()}")

🧹 Cleaned dataset — remaining rows: 91081, brands: 40


In [70]:
# Save the combined dataset for future use
output_path = "all_sneakers_combined.parquet"
df.to_parquet(output_path, index=False)
print(f"💾 Saved combined dataset as: {output_path}")

💾 Saved combined dataset as: all_sneakers_combined.parquet


In [71]:
# Reload to verify
df_check = pd.read_parquet("all_sneakers_combined.parquet")
print(f"Reloaded dataset: {len(df_check)} rows, {df_check['brand'].nunique()} brands")

Reloaded dataset: 91081 rows, 40 brands


In [72]:
# Split into train and test sets

df = pd.read_parquet("all_sneakers_combined.parquet")
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['brand'], random_state=42)
print(f"Train: {len(train_df)}, Validation: {len(val_df)}")
print(f"Brand Distribution:\n{df['brand'].value_counts()}")
print(f"Model Distribution:\n{df['model'].value_counts()}")

Train: 72864, Validation: 18217
Brand Distribution:
brand
Nike           29999
Adidas         20700
Jordan          7800
adidas          5600
Asics           4000
Vans            3685
New             3500
Converse        3101
Reebok          3100
Puma            2800
Saucony         1100
Timberland       700
ON               600
Off-White        500
Autry            500
Clarks           400
Balenciaga       300
Veja             300
Salomon          200
Diadora          200
Mizuno           100
Lacoste          100
Alexander        100
alexander        100
Le               100
Karhu            100
Dr.              100
Hoka             100
Ewing            100
Camper           100
Suicoke          100
Crocs            100
Moon             100
Keen             100
Onitsuka         100
Lanvin           100
BAPE             100
Amiri            100
Birkenstock      100
KangaROOS         96
Name: count, dtype: int64
Model Distribution:
model
100                                 2281
Waffle On

In [73]:
# Dataset Class(JPEG Bytes → Tensor)
from PIL import Image
import io
import torch
from torch.utils.data import Dataset
import numpy as np

class SneakerArrayDataset(Dataset):
    def __init__(self, df, label_col="brand"):
        self.images = df['image'].values
        self.labels = df[label_col].astype('category').cat.codes
        self.label2name = dict(enumerate(df[label_col].astype('category').cat.categories))
    
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, idx):
        # Convert binary bytes → Image
        img_bytes = self.images[idx]
        
        # Some Parquet exports store bytes as strings, ensure conversion
        if isinstance(img_bytes, str):
            img_bytes = bytes(img_bytes, 'utf-8')
        
        image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        
        # Resize + Normalize
        image = image.resize((224, 224))
        img_tensor = torch.tensor(np.array(image), dtype=torch.float32).permute(2, 0, 1) / 255.0
        
        label = torch.tensor(self.labels.iloc[idx], dtype=torch.long)
        return img_tensor, label

In [74]:
# Data Loaders
import torch.nn as nn
from torch.utils.data import DataLoader

train_dataset = SneakerArrayDataset(train_df)
val_dataset = SneakerArrayDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

print(f"Number of classes: {len(set(train_dataset.labels))}")

Number of classes: 40


In [75]:
# Test one batch
sample_img, sample_label = train_dataset[0]
print(sample_img.shape, sample_label)

torch.Size([3, 224, 224]) tensor(0)


In [76]:
# Load Pretrained Model (ResNet50)
# -----------------------------
# Manual download + load
# -----------------------------
import os
import torch
from torchvision import models

# path to the downloaded weights
weights_path = os.path.expanduser("~/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth")

# create model without attempting to download
model = models.resnet50(pretrained=False)
state = torch.load(weights_path, map_location="cpu")
model.load_state_dict(state)

# move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print(f"✅ Model loaded from local weights: {weights_path}")



✅ Model loaded from local weights: /Users/paabonsu/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


In [77]:
# Loss, Optimzer, and Training Loop
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
epochs = 10

for epochs in range(epochs):
    model.train()
    total_loss = 0.0

    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epochs+1}/{epochs}] | Loss: {avg_loss:.4f}")

Epoch [1/0] | Loss: 0.6801
Epoch [2/1] | Loss: 0.3357
Epoch [3/2] | Loss: 0.2395
Epoch [4/3] | Loss: 0.1822


KeyboardInterrupt: 

In [78]:
torch.save(model.state_dict(), "checkpoint_manual_stop.pth")
print("💾 Partial model saved safely.")

💾 Partial model saved safely.


In [25]:
# Evaluation on Validation Set
model.eval()
correct, total = 0, 0

with torch.no_grad():
    for imgs, labels in val_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)
        _, preds = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (preds == labels).sum().item()

accuracy = 100 * correct / total
print(f"Validation Accuracy: {accuracy:.2f}%")

Validation Accuracy: 100.00%


In [33]:
# Test Single Prediction
model.eval()
test_img, _ = val_dataset[0]
test_img = test_img.unsqueeze(0).to(device)

with torch.no_grad():
    output = model(test_img)
    _, pred = torch.max(output, 1)

predicted_label = train_dataset.label2name[pred.item()]
print(f"Predicted Sneaker: {predicted_label}")

Predicted Sneaker: Adidas


In [None]:
import matplotlib.pyplot as plt

model.eval()
for i in range(5):
    img_tensor, label = val_dataset[i]
    img = img_tensor.permute(1, 2, 0).cpu().numpy()

    with torch.no_grad():
        output = model(img_tensor.unsqueeze(0).to(device))
        _, pred = torch.max(output, 1)
        predicted_label = train_dataset.label2name[pred.item()]
        actual_label = train_dataset.label2name[label.item()]

    plt.imshow(img)
    plt.title(f"Predicted: {predicted_label} | Actual: {actual_label}")
    plt.axis('off')
    plt.show()

In [None]:
# Save the trained model
torch.save({
    'model_state_dict': model.state_dict(),
    'label_map': train_dataset.label2name
}, "sneaker_model.pth")

print("💾 Model and label map saved as sneaker_model.pth")