In [1]:
import os
import zipfile
import pandas as pd
import numpy as np
from PIL import Image

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout

from tensorflow.keras.preprocessing.image import ImageDataGenerator

from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted successfully at /content/drive.")
except Exception as e:
    print(f"Could not mount Google Drive: {e}")
    print("Proceeding without Google Drive mount. You will not be able to save files permanently.")


GDRIVE_SAVE_DIR = '/content/drive/MyDrive/my_fashion_cnn_from_scratch_outputs'
os.makedirs(GDRIVE_SAVE_DIR, exist_ok=True)
print(f"\nGoogle Drive output directory checked/created: {GDRIVE_SAVE_DIR}")

Mounted at /content/drive
Google Drive mounted successfully at /content/drive.

Google Drive output directory checked/created: /content/drive/MyDrive/my_fashion_cnn_from_scratch_outputs


In [2]:
!pip install kaggle
print("Kaggle library installed.")

Kaggle library installed.


In [3]:
import os

uploaded_kaggle_json_path = '/content/kaggle.json'

print(f"\nChecking for kaggle.json at {uploaded_kaggle_json_path}...")
if os.path.exists(uploaded_kaggle_json_path):
    print(f"Found kaggle.json at {uploaded_kaggle_json_path}.")

    kaggle_dir = os.path.join(os.path.expanduser("~"), ".kaggle")
    os.makedirs(kaggle_dir, exist_ok=True)

    destination_path = os.path.join(kaggle_dir, "kaggle.json")
    !cp "{uploaded_kaggle_json_path}" "{destination_path}"

    !chmod 600 "{destination_path}"

    print("Kaggle API key setup complete using manually uploaded file.")
else:
    print(f"Error: kaggle.json NOT found at {uploaded_kaggle_json_path}.")
    print("Please perform the manual upload step described in the comments above BEFORE running this cell.")
    print("Ensure the file is named exactly 'kaggle.json' and uploaded to the main /content/ directory.")


Checking for kaggle.json at /content/kaggle.json...
Found kaggle.json at /content/kaggle.json.
Kaggle API key setup complete using manually uploaded file.


In [4]:
import os

dataset_name = 'paramaggarwal/fashion-product-images-small'
data_dir = '/content/fashion_data'

os.makedirs(data_dir, exist_ok=True)

print(f"\nDownloading dataset '{dataset_name}' to '{data_dir}'...")
!kaggle datasets download -d {dataset_name} -p {data_dir} --unzip
print("Download and extraction process initiated.")


csv_path = os.path.join(data_dir, 'styles.csv')
images_dir = os.path.join(data_dir, 'images')

print(f"\nExpected CSV file path: {csv_path}")
print(f"Expected images directory: {images_dir}")
print("\nReminder: Data in /content/ is temporary and will be deleted when the Colab session ends.")


Downloading dataset 'paramaggarwal/fashion-product-images-small' to '/content/fashion_data'...
Dataset URL: https://www.kaggle.com/datasets/paramaggarwal/fashion-product-images-small
License(s): MIT
Download and extraction process initiated.

Expected CSV file path: /content/fashion_data/styles.csv
Expected images directory: /content/fashion_data/images

Reminder: Data in /content/ is temporary and will be deleted when the Colab session ends.


In [5]:
import os
if 'data_dir' in locals() and os.path.exists(data_dir) and \
   'csv_path' in locals() and 'images_dir' in locals():

    print(f"\nListing contents of download directory: {data_dir}")
    !ls -l "{data_dir}"

    print(f"\nChecking for existence of CSV file at: {csv_path}")
    if os.path.exists(csv_path):
        print(f"Success: {csv_path} found!")
    else:
        print(f"Error: {csv_path} NOT found at the expected path.")
        print("Possible issue: Download/unzip failed in Cell 4 or file unzipped to a subfolder.")
        print("Action: Check the output of the 'ls' command above. If the CSV is missing or in a subfolder, re-run Cell 4 and check its output for errors.")


    print(f"\nChecking for existence of images directory at: {images_dir}")
    if os.path.exists(images_dir):
        print(f"Success: {images_dir} found!")
    else:
        print(f"Error: {images_dir} NOT found at the expected path.")
        print("Action: If this folder is missing, re-run Cell 4 and check its output for errors.")

else:
    print("Error: Download directory, csv_path, or images_dir not properly defined or accessible. Please check Cells 3 and 4.")


Listing contents of download directory: /content/fashion_data
total 5364
drwxr-xr-x 2 root root 1155072 May  9 12:46 images
drwxr-xr-x 3 root root    4096 May  9 12:46 myntradataset
-rw-r--r-- 1 root root 4332000 May  9 12:46 styles.csv

Checking for existence of CSV file at: /content/fashion_data/styles.csv
Success: /content/fashion_data/styles.csv found!

Checking for existence of images directory at: /content/fashion_data/images
Success: /content/fashion_data/images found!


In [6]:
import pandas as pd

if 'csv_path' in locals() and os.path.exists(csv_path):
    try:
        df = pd.read_csv(csv_path, on_bad_lines='skip')
        print("CSV file loaded successfully (bad lines skipped).")

        print("\nFirst 5 rows of the dataframe:")
        print(df.head())

        print("\nDataframe Info:")
        df.info()

        print("\nExploring 'articleType' column:")
        num_article_types = df['articleType'].nunique()
        print(f"Number of unique article types: {num_article_types}")

        print("\nTop 20 most frequent article types:")
        print(df['articleType'].value_counts().head(20))

    except Exception as e:
        print(f"An error occurred while loading or exploring CSV: {e}")
        df = None
else:
    print(f"Error: CSV file not found at {csv_path} or csv_path not defined. Please check Cell 5.")
    df = None

CSV file loaded successfully (bad lines skipped).

First 5 rows of the dataframe:
      id gender masterCategory subCategory  articleType baseColour  season  \
0  15970    Men        Apparel     Topwear       Shirts  Navy Blue    Fall   
1  39386    Men        Apparel  Bottomwear        Jeans       Blue  Summer   
2  59263  Women    Accessories     Watches      Watches     Silver  Winter   
3  21379    Men        Apparel  Bottomwear  Track Pants      Black    Fall   
4  53759    Men        Apparel     Topwear      Tshirts       Grey  Summer   

     year   usage                             productDisplayName  
0  2011.0  Casual               Turtle Check Men Navy Blue Shirt  
1  2012.0  Casual             Peter England Men Party Blue Jeans  
2  2016.0  Casual                       Titan Women Silver Watch  
3  2011.0  Casual  Manchester United Men Solid Black Track Pants  
4  2012.0  Casual                          Puma Men Grey T-shirt  

Dataframe Info:
<class 'pandas.core.frame.Data

In [7]:
from sklearn.preprocessing import LabelEncoder
import os

if 'df' in locals() and df is not None and 'images_dir' in locals() and os.path.exists(images_dir):
    top_n = 20

    top_article_types = df['articleType'].value_counts().nlargest(top_n).index.tolist()
    print(f"\nFocusing on the top {top_n} article types.")

    df_filtered = df[df['articleType'].isin(top_article_types)].reset_index(drop=True)
    print(f"Dataframe size after filtering: {len(df_filtered)}")

    df_filtered['image_path'] = df_filtered['id'].apply(lambda x: os.path.join(images_dir, f"{x}.jpg"))

    df_processed = df_filtered[df_filtered['image_path'].apply(os.path.exists)].reset_index(drop=True)
    print(f"Dataframe size after checking image paths: {len(df_processed)}")

    le = LabelEncoder()
    df_processed['articleType_encoded'] = le.fit_transform(df_processed['articleType'])

    label_map = dict(zip(le.transform(le.classes_), le.classes_))
    print("Label encoding map created.")

    num_classes = len(label_map)
    print(f"Final number of classes for the model: {num_classes}")

else:
    print("Cannot process data. Dependencies (df, images_dir) missing or invalid. Check Cells 5 and 6.")
    df_processed = None
    label_map = None
    num_classes = 0


Focusing on the top 20 article types.
Dataframe size after filtering: 33146
Dataframe size after checking image paths: 33142
Label encoding map created.
Final number of classes for the model: 20


In [8]:
from sklearn.model_selection import train_test_split

if 'df_processed' in locals() and df_processed is not None and len(df_processed) > 0 and 'num_classes' in locals() and num_classes > 0:
    print("\nSplitting data into training and validation sets...")
    train_df, val_df = train_test_split(
        df_processed,
        test_size=0.2,
        random_state=42,
        stratify=df_processed['articleType_encoded']
    )

    print(f"Training data size: {len(train_df)}")
    print(f"Validation data size: {len(val_df)}")
else:
    print("Cannot split data as df_processed is not available or is empty. Check Cell 7.")
    train_df, val_df = None, None


Splitting data into training and validation sets...
Training data size: 26513
Validation data size: 6629


In [9]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

IMG_HEIGHT = 224
IMG_WIDTH = 224
BATCH_SIZE = 32

if 'train_df' in locals() and train_df is not None and 'val_df' in locals() and val_df is not None and 'num_classes' in locals() and num_classes > 0:

    print("\nSetting up Data Generators...")
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=15, width_shift_range=0.1, height_shift_range=0.1,
        shear_range=0.1, zoom_range=0.1, horizontal_flip=True, fill_mode='nearest'
    )

    val_datagen = ImageDataGenerator(rescale=1./255)

    train_generator = train_datagen.flow_from_dataframe(
        dataframe=train_df,
        x_col='image_path',
        y_col='articleType',
        target_size=(IMG_HEIGHT, IMG_WIDTH),
        batch_size=BATCH_SIZE,
        class_mode='categorical',
        shuffle=True
    )

    val_generator = val_datagen.flow_from_dataframe(
        dataframe=val_df,
        x_col='image_path',
        y_col='articleType',
        target_size=(IMG_HEIGHT, IMG_WIDTH),
        batch_size=BATCH_SIZE,
        class_mode='categorical',
        shuffle=False
    )

    generator_class_indices = train_generator.class_indices
    generator_label_map = {v: k for k, v in generator_class_indices.items()}

    print("\nData generators created successfully.")

    steps_per_epoch = train_generator.n // train_generator.batch_size
    validation_steps = val_generator.n // val_generator.batch_size
    print(f"\nSteps per epoch: {steps_per_epoch}")
    print(f"Validation steps: {validation_steps}")

else:
    print("Cannot create generators. Training/validation data or number of classes not available. Check Cells 8.")
    train_generator, val_generator = None, None
    generator_label_map = None
    steps_per_epoch = 0
    validation_steps = 0


Setting up Data Generators...
Found 26513 validated image filenames belonging to 20 classes.
Found 6629 validated image filenames belonging to 20 classes.

Data generators created successfully.

Steps per epoch: 828
Validation steps: 207


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout

if 'num_classes' in locals() and num_classes > 0 and 'IMG_HEIGHT' in locals():
    print("\nBuilding custom CNN model from scratch...")

    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)),
        MaxPooling2D((2, 2)),

        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),

        Conv2D(128, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),

        Flatten(),

        Dense(128, activation='relu'),
        Dropout(0.5),

        Dense(num_classes, activation='softmax')
    ])

    model.summary()
    print("Custom CNN model built successfully.")
else:
    print("Cannot build model. Number of classes or image dimensions not defined/valid. Check Cells 8 and 9.")
    model = None


Building custom CNN model from scratch...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Custom CNN model built successfully.


In [11]:
if 'model' in locals() and model is not None:
    print("\nCompiling model...")
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    print("Model compiled successfully.")
else:
    print("Cannot compile model as it was not built. Check Cell 10.")


Compiling model...
Model compiled successfully.


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt
import os

if 'GDRIVE_SAVE_DIR' in locals() and os.path.exists(GDRIVE_SAVE_DIR):
    checkpoint_filename = 'cnn_from_scratch_best_weights_{epoch:02d}_{val_accuracy:.4f}.weights.h5'
    checkpoint_filepath_drive = os.path.join(GDRIVE_SAVE_DIR, checkpoint_filename)

    model_checkpoint_callback_drive = ModelCheckpoint(
        filepath=checkpoint_filepath_drive,
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        verbose=1
    )

    early_stopping_callback = EarlyStopping(
        monitor='val_accuracy',
        patience=10,
        mode='max',
        restore_best_weights=True
    )

    epochs = 100

    if 'model' in locals() and model is not None and 'train_generator' in locals() and train_generator is not None and 'val_generator' in locals() and val_generator is not None and steps_per_epoch > 0 and validation_steps >= 0:
        print(f"\nStarting training from scratch for a maximum of {epochs} epochs...")
        print("Note: Training from scratch takes much longer and may require more epochs.")

        history = model.fit(
            train_generator,
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=val_generator,
            validation_steps=validation_steps,
            callbacks=[model_checkpoint_callback_drive, early_stopping_callback]
        )

        print("\nTraining finished.")

        if history:
            acc = history.history['accuracy']
            val_acc = history.history['val_accuracy']
            loss = history.history['loss']
            val_loss = history.history['val_loss']
            epochs_trained = len(history.history['loss'])
            epochs_range = range(epochs_trained)

            plt.figure(figsize=(12, 4))
            plt.subplot(1, 2, 1)
            plt.plot(epochs_range, acc, label='Training Accuracy')
            plt.plot(epochs_range, val_acc, label='Validation Accuracy')
            plt.legend(loc='lower right')
            plt.title('Training and Validation Accuracy (From Scratch)')
            plt.xlabel("Epochs")
            plt.ylabel("Accuracy")

            plt.subplot(1, 2, 2)
            plt.plot(epochs_range, loss, label='Training Loss')
            plt.plot(epochs_range, val_loss, label='Validation Loss')
            plt.legend(loc='upper right')
            plt.title('Training and Validation Loss (From Scratch)')
            plt.xlabel("Epochs")
            plt.ylabel("Loss")

            plt.show()
        else:
            print("Training history not recorded.")

    else:
        print("Cannot start training. Model, generators, or step counts are not available. Check previous cells (9, 10, 11).")
        history = None


else:
    print("Google Drive not mounted or save directory not accessible. Cannot save checkpoints.")
    print("Training will proceed WITHOUT saving checkpoints to Drive.")
    if 'model' in locals() and model is not None and 'train_generator' in locals() and train_generator is not None and 'val_generator' in locals() and val_generator is not None and steps_per_epoch > 0 and validation_steps >= 0:
         print(f"\nStarting training from scratch for a maximum of {epochs} epochs (no Drive checkpoints)...")
         history = model.fit(
             train_generator,
             steps_per_epoch=steps_per_epoch,
             epochs=epochs,
             validation_data=val_generator,
             validation_steps=validation_steps
         )
         print("\nTraining finished (no Drive checkpoints saved).")
         if history:
             acc = history.history['accuracy']
             val_acc = history.history['val_accuracy']
             loss = history.history['loss']
             val_loss = history.history['val_loss']
             epochs_trained = len(history.history['loss'])
             epochs_range = range(epochs_trained)
             plt.figure(figsize=(12, 4))
             plt.subplot(1, 2, 1)
             plt.plot(epochs_range, acc, label='Training Accuracy')
             plt.plot(epochs_range, val_acc, label='Validation Accuracy')
             plt.legend(loc='lower right')
             plt.title('Training and Validation Accuracy (From Scratch)')
             plt.xlabel("Epochs")
             plt.ylabel("Accuracy")
             plt.subplot(1, 2, 2)
             plt.plot(epochs_range, loss, label='Training Loss')
             plt.plot(epochs_range, val_loss, label='Validation Loss')
             plt.legend(loc='upper right')
             plt.title('Training and Validation Loss (From Scratch)')
             plt.xlabel("Epochs")
             plt.ylabel("Loss")
             plt.show()
         else:
              print("Training history not recorded.")
    else:
         print("Cannot start training (even without checkpoints). Dependencies missing.")
         history = None


Starting training from scratch for a maximum of 100 epochs...
Note: Training from scratch takes much longer and may require more epochs.


  self._warn_if_super_not_called()


Epoch 1/100
[1m828/828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 458ms/step - accuracy: 0.3715 - loss: 2.0931
Epoch 1: val_accuracy improved from -inf to 0.75030, saving model to /content/drive/MyDrive/my_fashion_cnn_from_scratch_outputs/cnn_from_scratch_best_weights_01_0.7503.weights.h5
[1m828/828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m398s[0m 471ms/step - accuracy: 0.3717 - loss: 2.0925 - val_accuracy: 0.7503 - val_loss: 0.7345
Epoch 2/100
[1m  1/828[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m51s[0m 63ms/step - accuracy: 0.6875 - loss: 0.8661




Epoch 2: val_accuracy did not improve from 0.75030
[1m828/828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 24ms/step - accuracy: 0.6875 - loss: 0.8661 - val_accuracy: 0.7429 - val_loss: 0.7344
Epoch 3/100
[1m828/828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 418ms/step - accuracy: 0.6210 - loss: 1.0955
Epoch 3: val_accuracy improved from 0.75030 to 0.77582, saving model to /content/drive/MyDrive/my_fashion_cnn_from_scratch_outputs/cnn_from_scratch_best_weights_03_0.7758.weights.h5
[1m828/828[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 501ms/step - accuracy: 0.6210 - loss: 1.0955 - val_accuracy: 0.7758 - val_loss: 0.6206
Epoch 4/100
[1m  1/828[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m49s[0m 60ms/step - accuracy: 0.5938 - loss: 1.2316
Epoch 4: val_accuracy improved from 0.77582 to 0.77899, saving model to /content/drive/MyDrive/my_fashion_cnn_from_scratch_outputs/cnn_from_scratch_best_weights_04_0.7790.weights.h5
[1m828/828[0m [32m━━━━━━━━━━━━━━━━━━