# Clothing Category Classifier
(Predicting subCategory or articleType)


### Importing required libraries

In [None]:
import pandas as pd  
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from datetime import datetime
from datasets import load_dataset

## Data loading

### Load & Explore the Data

In [None]:
# Fetch the fashion dataset from the Hugging Face Hub and show its splits
dataset = load_dataset("nreimers/fashion-dataset")
print(dataset)

# From here on, work with the "train" split as a pandas DataFrame
train_split = dataset["train"]
df = train_split.to_pandas()

In [None]:
# Number of missing values in each column of the raw frame
missing_per_column = df.isnull().sum()
missing_per_column

## Data Preprocessing

In [None]:
# Article types kept for the classifier; rows with any other articleType are dropped.
relevant_clothing = [
    "Sweaters", "Jackets", "Mufflers", "Scarves", "Gloves", "Rain Jacket",
    "Rain Trousers", "Boots", "Hats", "Trousers", "Tshirts", "Jeans", "Shirts",
    "Track Pants", "Shorts", "Socks", "Dresses", "Skirts"
]

# Keep only the relevant rows. The explicit .copy() makes `df` an independent
# frame, so the column assignments below write to it directly instead of to a
# slice of the original (which triggers SettingWithCopyWarning).
df = df[df["articleType"].isin(relevant_clothing)].copy()

# Default season per article type, used only where `season` is missing.
season_mapping = {
    "Sweaters": "Winter", "Jackets": "Winter", "Mufflers": "Winter", "Scarves": "Winter",
    "Gloves": "Winter", "Rain Jacket": "Fall", "Rain Trousers": "Fall", "Boots": "Winter",
    "Hats": "Summer", "Trousers": "Fall", "Tshirts": "Summer", "Jeans": "Fall",
    "Shirts": "Summer", "Track Pants": "Winter", "Shorts": "Summer", "Socks": "Winter",
    "Dresses": "Spring", "Skirts": "Spring"
}
df["season"] = df["season"].fillna(df["articleType"].map(season_mapping))

# Missing 'usage' and 'baseColour' values become an explicit "Unknown" category.
# Assign the result back rather than calling fillna(inplace=True) on the
# selected column: that chained in-place form is deprecated and does not
# update `df` when pandas Copy-on-Write is enabled.
df["usage"] = df["usage"].fillna("Unknown")
df["baseColour"] = df["baseColour"].fillna("Unknown")

# Fill missing year with the most common year. mode() returns an empty Series
# when there is no non-null year at all, so only fill when a mode exists.
year_modes = df["year"].mode()
if not year_modes.empty:
    df["year"] = df["year"].fillna(year_modes.iloc[0])

# Save the cleaned dataset
df.to_csv("filtered_fashion_dataset.csv", index=False)

In [None]:
# Confirm that no missing values remain after the cleaning step
print("Missing values in each column:", df.isnull().sum(), sep="\n")

Visualize the data to understand it better.

In [None]:
def _show_countplot(frame, column, title, figsize, rotate_labels=False):
    """Draw and display a count plot of `column` from `frame`.

    When `rotate_labels` is true the x tick labels are rotated by 90 degrees.
    """
    plt.figure(figsize=figsize)
    sns.countplot(data=frame, x=column)
    plt.title(title)
    if rotate_labels:
        plt.xticks(rotation=90)
    plt.show()


# One count plot per categorical column of interest
_show_countplot(df, 'season', 'Distribution of Seasons', (10, 6))
_show_countplot(df, 'articleType', 'Category Coverage', (12, 8), rotate_labels=True)
_show_countplot(df, 'gender', 'Gender Representation', (10, 6))
_show_countplot(df, 'baseColour', 'Color Distribution', (12, 8), rotate_labels=True)

# Textual breakdown of the 'usage' column, followed by its count plot
print("Unique values in 'usage' column:")
print(df['usage'].value_counts())
_show_countplot(df, 'usage', 'Usage Column Distribution', (10, 6))

Let's inspect the cleaned dataset to make sure the preprocessing worked as intended.

In [None]:
# Sanity-check the cleaned dataset
print(df.head())  # First few rows
# Column dtypes and non-null counts. info() prints its report itself and
# returns None, so wrapping it in print() would emit a stray "None" line.
df.info()
print(df.isnull().sum())  # Remaining missing values per column
print(df['subCategory'].value_counts())  # Class distribution of the target

### Drop Irrelevant Columns

In [None]:
# Remove the identifier and display-name columns from the frame
irrelevant_columns = ['id', 'productDisplayName']
df = df.drop(columns=irrelevant_columns)

### Encode Categorical Variables

In [None]:
# One-hot encode gender and masterCategory, dropping the first level of each
df = pd.get_dummies(df, columns=['gender', 'masterCategory'], drop_first=True)

# A separate LabelEncoder per remaining categorical column, each kept under
# its own name so the integer codes can be mapped back to labels later
le_article = LabelEncoder()
le_subCategory = LabelEncoder()
le_season = LabelEncoder()
le_usage = LabelEncoder()
le_baseColour = LabelEncoder()

label_encoders = {
    'articleType': le_article,
    'subCategory': le_subCategory,
    'season': le_season,
    'usage': le_usage,
    'baseColour': le_baseColour,
}

# Replace each column's labels with the integer codes from its encoder
for column_name, encoder in label_encoders.items():
    df[column_name] = encoder.fit_transform(df[column_name])


In [None]:
# Preview the integer codes now stored in the label-encoded columns
encoded_columns = ['articleType', 'subCategory', 'season', 'usage', 'baseColour']
print(df[encoded_columns].head())

Let's convert `year` into an age (current year minus `year`) for better performance.

In [None]:
# Year at the time the notebook is run
current_year = datetime.now().year

# Replace the 'year' column with the number of years between it and now
years_elapsed = current_year - df['year']
df['year'] = years_elapsed

### Handle Imbalanced Data
Some subcategories have very few instances. I decided to remove them.

In [None]:
# Subcategories need at least this many rows to be kept
min_count = 50

# Count rows per subcategory once, then keep the labels meeting the threshold
subcategory_counts = df['subCategory'].value_counts()
valid_subcategories = subcategory_counts[subcategory_counts >= min_count].index

# Drop every row whose subcategory is too rare
is_frequent = df['subCategory'].isin(valid_subcategories)
df = df[is_frequent]

# Show the class distribution after filtering
print(df['subCategory'].value_counts())

## Splitting Data for Training

In [None]:
# Target is subCategory; every other column is used as a feature
y = df['subCategory']
X = df.drop(columns=['subCategory'])

# 80/20 train/test split, stratified on the target with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Class counts in the training set, then in the test set
for split_labels in (y_train, y_test):
    print(split_labels.value_counts())

## Model training

In [None]:
# Create a pipeline for scaling & training
model = Pipeline([
    ('scaler', StandardScaler()),  # Normalize numerical features
    ('classifier', LogisticRegression(max_iter=500))  # Train model
])

# Train the model
model.fit(X_train, y_train)

## Evaluation

In [None]:
# Predict on test set
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))