# Clothing Category Classifier
(Predicting subCategory or articleType)


### Importing required libraries

In [None]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from datetime import datetime
from datasets import load_dataset

## Data loading

### Load & Explore the Data

In [None]:
# Fetch the fashion dataset via the `datasets` library and show its splits
dataset = load_dataset("nreimers/fashion-dataset")
print(dataset)

# Materialise the "train" split as a pandas DataFrame for the rest of the notebook
train_split = dataset["train"]
df = train_split.to_pandas()

In [None]:
# Number of missing values per column in the freshly loaded data
# (left as the cell's last expression so the notebook displays it)
df.isna().sum()

## Data Preprocessing

In [None]:
# Article types kept for this classifier; rows with any other articleType are dropped
relevant_clothing = [
    "Sweaters", "Jackets", "Mufflers", "Scarves", "Gloves", "Rain Jacket",
    "Rain Trousers", "Boots", "Hats", "Trousers", "Tshirts", "Jeans", "Shirts",
    "Track Pants", "Shorts", "Socks", "Dresses", "Skirts"
]

# Keep only the relevant rows. The explicit .copy() makes `df` an independent
# frame, so the column assignments below modify it directly instead of writing
# to a slice of the loaded data (which raises SettingWithCopyWarning).
df = df[df["articleType"].isin(relevant_clothing)].copy()

# Fallback season per article type, applied only where `season` is missing
season_mapping = {
    "Sweaters": "Winter", "Jackets": "Winter", "Mufflers": "Winter", "Scarves": "Winter",
    "Gloves": "Winter", "Rain Jacket": "Fall", "Rain Trousers": "Fall", "Boots": "Winter",
    "Hats": "Summer", "Trousers": "Fall", "Tshirts": "Summer", "Jeans": "Fall",
    "Shirts": "Summer", "Track Pants": "Winter", "Shorts": "Summer", "Socks": "Winter",
    "Dresses": "Spring", "Skirts": "Spring"
}
df["season"] = df["season"].fillna(df["articleType"].map(season_mapping))

# Missing usage and colour become an explicit "Unknown" category.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df[col] is a chained assignment that acts on a temporary Series and does
# not update `df` when pandas copy-on-write is active.
df["usage"] = df["usage"].fillna("Unknown")
df["baseColour"] = df["baseColour"].fillna("Unknown")

# Missing year falls back to the most frequent year in the filtered data
df["year"] = df["year"].fillna(df["year"].mode()[0])

# Persist the cleaned, filtered data
df.to_csv("filtered_fashion_dataset.csv", index=False)

In [None]:
# Re-count missing values per column to confirm the imputation above took effect
missing_per_column = df.isnull().sum()
print("Missing values in each column:")
print(missing_per_column)

Let's visualize the data to get a better understanding of it.

In [None]:
def _show_countplot(column, title, figsize, rotate_ticks=False):
    """Draw and display a seaborn count plot of ``df[column]``.

    ``rotate_ticks`` turns the x tick labels by 90 degrees, which is used
    for the columns plotted on the larger figures below.
    """
    plt.figure(figsize=figsize)
    sns.countplot(data=df, x=column)
    plt.title(title)
    if rotate_ticks:
        plt.xticks(rotation=90)
    plt.show()


# Seasons, article types, genders and colours, in that order
_show_countplot('season', 'Distribution of Seasons', (10, 6))
_show_countplot('articleType', 'Category Coverage', (12, 8), rotate_ticks=True)
_show_countplot('gender', 'Gender Representation', (10, 6))
_show_countplot('baseColour', 'Color Distribution', (12, 8), rotate_ticks=True)

# Usage: frequency table as text first, then the same column as a plot
print("Unique values in 'usage' column:")
print(df['usage'].value_counts())
_show_countplot('usage', 'Usage Column Distribution', (10, 6))

Let's verify that the preprocessing worked as intended.

In [None]:
# Explore the cleaned dataset
print(df.head())  # First few rows
# DataFrame.info() writes its summary (dtypes, non-null counts) to stdout itself
# and returns None, so it is called directly; wrapping it in print() would add
# a stray "None" line to the output.
df.info()
print(df.isnull().sum())  # Remaining missing values per column
print(df['subCategory'].value_counts())  # Class distribution of the target

### Drop Irrelevant Columns

In [None]:
# Discard the columns that are not used as model features
columns_to_drop = ['id', 'productDisplayName']
df = df.drop(columns_to_drop, axis=1)

### Encode Categorical Variables

In [None]:
# One-hot encode gender and masterCategory; drop_first removes the first
# level of each so it acts as the implicit baseline
df = pd.get_dummies(df, columns=['gender', 'masterCategory'], drop_first=True)

# One LabelEncoder per remaining categorical column. Each encoder keeps its own
# name so its fitted classes_ stay available after this cell.
le_article = LabelEncoder()
le_subCategory = LabelEncoder()
le_season = LabelEncoder()
le_usage = LabelEncoder()
le_baseColour = LabelEncoder()

# Replace each column by its integer codes, in the same order as the encoders above
for column_name, encoder in (
    ('articleType', le_article),
    ('subCategory', le_subCategory),
    ('season', le_season),
    ('usage', le_usage),
    ('baseColour', le_baseColour),
):
    df[column_name] = encoder.fit_transform(df[column_name])



In [None]:
# Peek at the integer codes produced by the label encoders
encoded_columns = ['articleType', 'subCategory', 'season', 'usage', 'baseColour']
print(df[encoded_columns].head())

Let's convert `year` into an age (the number of years between `year` and the current year), which is a more natural numeric feature for the model.

In [None]:
# Calendar year at the moment the notebook is executed
current_year = datetime.now().year

# Overwrite `year` with the age in years: current year minus the item's year value
item_year = df['year']
df['year'] = current_year - item_year

### Handle Imbalanced Data
Some subcategories have very few instances, so subcategories with fewer than 50 samples are removed.

In [None]:
# Minimum number of rows a subcategory needs in order to be kept
min_count = 50

# Count every subcategory once and keep those at or above the threshold
subcategory_counts = df['subCategory'].value_counts()
valid_subcategories = subcategory_counts[subcategory_counts >= min_count].index

# Drop the rows that belong to under-represented subcategories
keep_rows = df['subCategory'].isin(valid_subcategories)
df = df[keep_rows]

# Class distribution after filtering
print(df['subCategory'].value_counts())

## Splitting Data for Training

In [None]:
# Target is subCategory; every other remaining column is used as a feature.
# NOTE(review): articleType and the masterCategory dummies stay in X. If they
# determine subCategory, they leak the target into the features; confirm this
# is intended.
y = df['subCategory']
X = df.drop(columns=['subCategory'])

# 80/20 split, stratified on the target so both parts keep the class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Per-class row counts, training part first and test part second
for split_target in (y_train, y_test):
    print(split_target.value_counts())

## Model training

In [None]:
# Baseline: standardise the features, then fit a logistic-regression classifier
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=500)),
]
model = Pipeline(steps=pipeline_steps)

# Fit the scaler and the classifier on the training split
model.fit(X_train, y_train)

## Evaluation

In [None]:
# Predictions of the fitted pipeline on the held-out split
y_pred = model.predict(X_test)

# Overall accuracy first, then per-class precision / recall / F1
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", baseline_accuracy)
baseline_report = classification_report(y_test, y_pred)
print("Classification Report:\n", baseline_report)

## Next Steps

The results obtained so far are promising, but the analysis can be taken further. In the following parts of this notebook, we will dig deeper by performing:

1. **Model Comparison**: Compare different machine learning models to find the best performing one.
2. **Hyperparameter Tuning**: Optimize the hyperparameters of the selected model to improve performance.
3. **Cross-Validation Analysis**: Perform cross-validation to ensure the model's robustness and generalizability.
4. **Feature Importance Visualization**: Visualize the importance of different features in the model.
5. **Confusion Matrix and Classification Metrics**: Generate a confusion matrix and other classification metrics to evaluate the model's performance.

Let's dive into each of these steps in detail.

In [None]:
# Model Comparison - Training and evaluating multiple models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

# Candidate estimators, keyed by the name used in printouts and plots.
# NOTE(review): unlike the baseline pipeline, these estimators are fit on the
# features as-is (no StandardScaler step); confirm that is intended for the
# scale-sensitive ones (Logistic Regression, SVM, KNN).
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# Test accuracy and wall-clock fit time per model name
results = {}
train_times = {}

# Train and evaluate each candidate.
# The loop uses its own names (`candidate`, `candidate_pred`) so that it does
# not overwrite the baseline pipeline `model` and its predictions `y_pred`
# created in the earlier cells.
for name, candidate in models.items():
    print(f"Training {name}...")
    # perf_counter is a monotonic clock meant for measuring elapsed time
    start_time = time.perf_counter()
    candidate.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    candidate_pred = candidate.predict(X_test)
    accuracy = accuracy_score(y_test, candidate_pred)

    results[name] = accuracy
    train_times[name] = train_time

    print(f"{name} - Accuracy: {accuracy:.4f}, Training time: {train_time:.2f} seconds")
    print(f"Classification Report:\n{classification_report(y_test, candidate_pred)}\n")

# Collect the results in one table for plotting
plt.figure(figsize=(10, 6))
models_df = pd.DataFrame({
    'Model': list(results.keys()),
    'Accuracy': list(results.values()),
    'Training Time (s)': list(train_times.values())
})

# Left panel: accuracy per model; the y-axis starts just below the worst score
plt.subplot(1, 2, 1)
sns.barplot(x='Model', y='Accuracy', data=models_df)
plt.title('Model Accuracy Comparison')
plt.xticks(rotation=45)
plt.ylim([models_df.Accuracy.min() - 0.05, 1.0])

# Right panel: training time per model
plt.subplot(1, 2, 2)
sns.barplot(x='Model', y='Training Time (s)', data=models_df)
plt.title('Model Training Time Comparison')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Select the model with the highest test accuracy (first one wins on ties)
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
print(f"Best performing model: {best_model_name} with accuracy of {results[best_model_name]:.4f}")

In [None]:
# Hyperparameter Tuning for the Best Model
from sklearn.model_selection import GridSearchCV

print(f"Performing hyperparameter tuning for {best_model_name}...")

# Search space per model name; only the entry of the selected model is used
param_grids = {
    'Logistic Regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2'],
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'gamma': ['scale', 'auto', 0.1, 1],
        'kernel': ['rbf', 'linear'],
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    },
}
search_space = param_grids[best_model_name]

# Exhaustive 5-fold grid search scored by accuracy, using all available cores
grid_search = GridSearchCV(
    best_model,
    search_space,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and its mean cross-validated accuracy
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Score the refitted best estimator on the held-out test split
tuned_model = grid_search.best_estimator_
y_pred_tuned = tuned_model.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred_tuned)
improvement = tuned_accuracy - results[best_model_name]

print(f"Tuned model accuracy on test set: {tuned_accuracy:.4f}")
print(f"Improvement over base model: {improvement:.4f}")

In [None]:
# Cross-Validation Analysis
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold

# Number of folds; also used for the plot's x axis and title
k_folds = 5

# Stratified folds keep the class proportions of y_train in every fold. This
# matches the stratified train/test split and the stratified folds that
# GridSearchCV uses for classifiers when given an integer cv, so the scores
# are comparable and less sensitive to the class imbalance.
kf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Accuracy of the tuned model on each fold of the training data
print("Performing k-fold cross-validation...")
cv_scores = cross_val_score(tuned_model, X_train, y_train, cv=kf, scoring='accuracy')

# One bar per fold, with the mean accuracy drawn as a horizontal line
fold_numbers = range(1, k_folds + 1)
plt.figure(figsize=(8, 6))
plt.bar(fold_numbers, cv_scores, color='skyblue')
plt.axhline(y=cv_scores.mean(), color='red', linestyle='-', label=f'Mean: {cv_scores.mean():.4f}')
plt.xlabel('Fold')
plt.ylabel('Accuracy')
# Title follows k_folds instead of hard-coding "5"
plt.title(f'{k_folds}-Fold Cross-Validation Results')
plt.xticks(fold_numbers)
plt.ylim([cv_scores.min() - 0.05, 1.0])
plt.legend()
plt.show()

print(f"Cross-validation accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

In [None]:
# Feature Importance Analysis
plt.figure(figsize=(12, 8))

# Feature names are needed by every branch below, so resolve them once:
# real column names for a DataFrame, generic placeholders otherwise
if isinstance(X_train, pd.DataFrame):
    features = X_train.columns
else:
    features = [f'Feature {i}' for i in range(X_train.shape[1])]

# For tree-based models (Random Forest): impurity-based importances, largest first
if 'Random Forest' in best_model_name:
    importances = tuned_model.feature_importances_
    indices = np.argsort(importances)[::-1]

    plt.title('Feature Importance (Random Forest)')
    plt.bar(range(len(indices)), importances[indices], align='center')
    plt.xticks(range(len(indices)), [features[i] for i in indices], rotation=90)
    plt.tight_layout()

# For linear models (Logistic Regression): coefficient magnitudes
elif 'Logistic Regression' in best_model_name:
    # coef_ holds one row of coefficients per class for a multi-class model
    # (a single row for a binary one). Averaging the absolute values over the
    # rows gives one magnitude per feature that reflects every class, instead
    # of looking only at the first class.
    coef = np.abs(tuned_model.coef_).mean(axis=0)

    # Ascending sort, so the largest magnitudes end up at the top of the barh plot
    coef_sorted_idx = np.argsort(coef)
    # Show at most 20 features; capped so the plot also works with fewer features
    top_features = min(20, coef.shape[0])
    top_idx = coef_sorted_idx[-top_features:]

    plt.title('Feature Importance (Logistic Regression)')
    plt.barh(range(top_features), coef[top_idx])
    plt.yticks(range(top_features), [features[i] for i in top_idx])
    plt.xlabel('Coefficient magnitude')
    plt.tight_layout()

# For other models, use permutation importance measured on the test split
else:
    from sklearn.inspection import permutation_importance

    perm_importance = permutation_importance(tuned_model, X_test, y_test, n_repeats=10, random_state=42)
    sorted_idx = perm_importance.importances_mean.argsort()[::-1]

    plt.title('Feature Importance (Permutation Importance)')
    plt.bar(range(len(sorted_idx)), perm_importance.importances_mean[sorted_idx])
    plt.xticks(range(len(sorted_idx)), [features[i] for i in sorted_idx], rotation=90)
    plt.tight_layout()

plt.show()

In [None]:
# Detailed Evaluation Metrics and Confusion Matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, average_precision_score

# Fix the class order once and pass it to both the matrix and the display, so
# rows/columns and tick labels are guaranteed to line up even if a class is
# missing from y_test and from the predictions.
class_labels = np.unique(y)
cm = confusion_matrix(y_test, y_pred_tuned, labels=class_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

# Draw on an explicitly created Axes: without `ax`, disp.plot() opens its own
# figure, which would leave a separately created plt.figure() empty and ignore
# the requested size.
fig, ax = plt.subplots(figsize=(10, 8))
disp.plot(ax=ax, cmap='Blues', values_format='d', xticks_rotation=90)
ax.set_title('Confusion Matrix')
fig.tight_layout()
plt.show()

# For multi-class problems, evaluate precision, recall and F1 for each class
print("Classification Report:")
print(classification_report(y_test, y_pred_tuned))

# With many classes the matrix is hard to read, so also list the ten most
# frequent (true, predicted) confusions
if len(class_labels) > 10:
    print("\nTop Misclassified Classes:")
    # Plain boolean array, usable to index both the Series and the ndarray
    error_mask = (y_test != y_pred_tuned).to_numpy()
    misclassified = pd.DataFrame({
        'True': y_test[error_mask],
        'Predicted': y_pred_tuned[error_mask]
    })
    misclass_counts = misclassified.groupby(['True', 'Predicted']).size().reset_index(name='Count')
    misclass_counts = misclass_counts.sort_values('Count', ascending=False).head(10)
    print(misclass_counts)