In [1]:
import pandas as pd
import pickle
from tqdm import tqdm
import sys
import os
import numpy as np

# --- Configuration ---
# No sample-size cap in this notebook: every row of the train and test CSVs
# read further down is processed.
# IMAGE_DIR is handed to download_images() and is also where the embedding
# loops look for '<sample_id>.jpg' files.
IMAGE_DIR = '../data/images/'

# --- Make sure everything is loaded correctly ---
# Put ../src on the import path before importing from `utils`
# (presumably utils.py lives in ../src -- verify against the repo layout).
sys.path.append('../src')
from utils import download_images

# The environment variable is set before `import tensorflow` on purpose; keep
# this order. NOTE(review): '-1' is intended to leave TensorFlow with no
# visible GPU so the run is CPU-only -- confirm on the target machine.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # Use CPU
import tensorflow as tf
from tensorflow.keras.applications.resnet50 import ResNet50

# Build the feature extractor from a weights file on local disk (a path is
# passed as `weights`). include_top=False together with pooling='avg'
# requests the pooled feature output instead of class predictions.
print("Loading the ResNet50 model from local file...")
weights_path = '../data/models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
model = ResNet50(weights=weights_path, include_top=False, pooling='avg')
print("✅ ResNet50 model loaded.")

# --- Re-using your reliable image processing function ---
def get_image_embedding(image_path, model):
    """Compute the feature vector of a single image file with ``model``.

    The image is loaded and resized to 224x224, converted to an array, given
    a leading batch axis, run through the ResNet50 ``preprocess_input`` step
    and finally passed to ``model.predict``.

    Args:
        image_path: Path of the image file to load.
        model: Object exposing ``predict(batch, verbose=0)``; callers in this
            notebook pass the ResNet50 instance.

    Returns:
        The first (and only) row of the prediction, or ``None`` when any step
        fails. Failures are best-effort: they are logged as warnings rather
        than raised, so one bad file cannot abort a long batch run.
    """
    import logging  # local import: keeps the block self-contained

    try:
        img = tf.keras.preprocessing.image.load_img(image_path, target_size=(224, 224))
        img_array = tf.keras.preprocessing.image.img_to_array(img)
        # The model consumes batches, so wrap the single image in a batch of one.
        img_batch = np.expand_dims(img_array, axis=0)
        img_preprocessed = tf.keras.applications.resnet50.preprocess_input(img_batch)
        embedding = model.predict(img_preprocessed, verbose=0)
        return embedding[0]
    except Exception as exc:
        # Previously swallowed silently; record which file failed and why so
        # that corrupt downloads can be told apart from genuine bugs.
        logging.getLogger(__name__).warning(
            "Skipping image %s: %s: %s", image_path, type(exc).__name__, exc
        )
        return None

# ===================================================================
#                  SHARED EMBED-AND-SAVE ROUTINE
# ===================================================================
def _embed_and_save(sample_ids, output_path, progress_desc, split_name):
    """Embed every downloaded image of one split and pickle the result.

    For each id the file ``IMAGE_DIR/<sample_id>.jpg`` is looked up; ids with
    no file on disk, or whose image cannot be embedded
    (``get_image_embedding`` returned ``None``), are left out of the result
    and counted so the omission is visible in the cell output.

    Args:
        sample_ids: Iterable of sample ids to process.
        output_path: Destination of the pickle file (overwritten).
        progress_desc: Label shown on the tqdm progress bar.
        split_name: Short split label used in the summary messages.

    Returns:
        dict mapping sample_id -> embedding for every successfully processed
        image.
    """
    embeddings = {}
    n_missing = 0  # no image file on disk for the id
    n_failed = 0   # file present but embedding could not be computed
    for sample_id in tqdm(sample_ids, desc=progress_desc):
        image_path = os.path.join(IMAGE_DIR, f'{sample_id}.jpg')
        if not os.path.exists(image_path):
            n_missing += 1
            continue
        embedding = get_image_embedding(image_path, model)
        if embedding is None:
            n_failed += 1
            continue
        embeddings[sample_id] = embedding

    with open(output_path, 'wb') as f:
        pickle.dump(embeddings, f)
    print(f"✅ Saved {len(embeddings)} {split_name} embeddings.")
    if n_missing or n_failed:
        # Downstream code that joins on sample_id needs to know about gaps.
        print(f"⚠️ Skipped {n_missing} missing and {n_failed} unreadable {split_name} images.")
    return embeddings


# ===================================================================
#                       PROCESS ALL TRAINING IMAGES
# ===================================================================
print("--- Processing ALL Training Images ---")
train_df = pd.read_csv('../data/train.csv')
train_ids = train_df['sample_id'].tolist()
train_links = train_df['image_link'].tolist()
# download_images receives (link, id) pairs plus the target directory.
train_images_to_download = list(zip(train_links, train_ids))

print("Downloading all training images...")
download_images(train_images_to_download, IMAGE_DIR)

output_path_train = '../data/train_image_embeddings_FULL.pkl'
train_embeddings = _embed_and_save(
    train_ids, output_path_train, "Processing Train Images", "train"
)

# ===================================================================
#                         PROCESS ALL TEST IMAGES
# ===================================================================
print("\n--- Processing ALL Test Images ---")
test_df = pd.read_csv('../data/test.csv')
test_ids = test_df['sample_id'].tolist()
test_links = test_df['image_link'].tolist()
test_images_to_download = list(zip(test_links, test_ids))

print("Downloading all test images...")
download_images(test_images_to_download, IMAGE_DIR)

output_path_test = '../data/test_image_embeddings_FULL.pkl'
test_embeddings = _embed_and_save(
    test_ids, output_path_test, "Processing Test Images", "test"
)



Loading the ResNet50 model from local file...
✅ ResNet50 model loaded.
--- Processing ALL Training Images ---
Downloading all training images...
Starting sequential download of 75000 images...


 52%|████████████████████████████████████████████████████████████▊                                                       | 39282/75000 [06:38<06:01, 98.68it/s]


KeyboardInterrupt: 