# Cleaning

In [7]:
!pip install spacy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [8]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


**Merge datasets**

In [9]:
import os
import pandas as pd
from glob import glob

def merge_all_csvs(base_path, save_path):
    all_dfs = []

    for gender in ["Women", "Men"]:
        folder_path = os.path.join(base_path, gender)
        csv_files = glob(os.path.join(folder_path, "*.csv")) + glob(os.path.join(folder_path, "*.CSV"))

        for file in csv_files:
            df = pd.read_csv(file)
            # Standardize columns
            df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]
            df.columns = ["product_images" if c in ["product_image", "product_images"] else c for c in df.columns]

            # Add gender and category
            df["gender"] = gender.lower()
            df["product_category"] = os.path.splitext(os.path.basename(file))[0]

            all_dfs.append(df)

    # Concatenate all and save
    if all_dfs:
        clothes = pd.concat(all_dfs, ignore_index=True, sort=False)
        clothes.to_csv(save_path, index=False)
        print(f"✅ Merged all datasets into '{save_path}' with {clothes.shape[0]} rows and {clothes.shape[1]} columns.")
        return clothes
    else:
        print("⚠️ No CSV files found.")
        return None

base_path = "/home/kloor/code/pawarsp/what-to-wear-today/raw_data"

# Save merged CSV to the same folder
save_path = os.path.join(base_path, "clothes.csv")

# Merge and save
clothes = merge_all_csvs(base_path, save_path)


✅ Merged all datasets into '/home/kloor/code/pawarsp/what-to-wear-today/raw_data/clothes.csv' with 4185 rows and 8 columns.


In [10]:
clothes = clothes.reset_index().rename(columns={'index': 'product_id'})
clothes.head()

Unnamed: 0,product_id,unnamed:_0,product_name,link,product_images,price,details,gender,product_category
0,0,0,3-IN-1 BAR REFILL,https://www.zara.com/in/en/3-in-1-bar-refill-p...,[],"₹ 1,290.00","Creamy texture multifunction makeup bar, avail...",women,BEAUTY
1,1,2,LIP OIL,https://www.zara.com/in/en/lip-oil-p24130314.html,[],₹ 790.00,This creamy lip oil glides like silk to achiev...,women,BEAUTY
2,2,4,ULTIMATTE MATTE LIQUID LIPSTICK,https://www.zara.com/in/en/ultimatte-matte-liq...,[],₹ 790.00,"Slick with it. Our favourite lipstick, now in ...",women,BEAUTY
3,3,10,FACE BRUSH #3,https://www.zara.com/in/en/face-brush--3-p2414...,[{'https://static.zara.net/photos///2022/I/2/1...,₹ 950.00,Small face brush for the application of powder...,women,BEAUTY
4,4,11,FACE BRUSH #2,https://www.zara.com/in/en/face-brush--2-p2414...,[],"₹ 1,290.00",Designed to apply powder and cream products. P...,women,BEAUTY


**Data cleaning**

In [11]:
import pandas as pd
import re
import os
import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


# Basic cleaning: lowercase, remove punctuation, strip
def basic_clean(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

clothes["product_name"] = clothes["product_name"].apply(basic_clean)
clothes["details"] = clothes["details"].apply(basic_clean)

# remove unwanted words & numbers
def remove_words_numbers(text):
    # Remove 'ZW', 'Zara' (case-insensitive) and all numbers
    text = re.sub(r'\b(zw|zara)\b|\d+', '', text, flags=re.IGNORECASE)
    # Remove extra whitespace again
    text = re.sub(r'\s+', ' ', text).strip()
    return text

clothes = clothes.dropna(subset=["product_name", "details"])
clothes["product_name"] = clothes["product_name"].apply(remove_words_numbers)
clothes["details"] = clothes["details"].apply(remove_words_numbers)

# Tokenization,lemmatization,stopword removal
def preprocess_spacy(text):
    if not text:
        return ""
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return " ".join(tokens)

clothes["product_name_clean"] = clothes["product_name"].apply(preprocess_spacy)
clothes["details_clean"] = clothes["details"].apply(preprocess_spacy)


clothes.drop(columns=["product_name", "details","link", "price", "unnamed:_0" ], inplace=True)
clothes.rename(columns={"product_name_clean": "product_name", "details_clean": "details"}, inplace=True)


clothes.head()



Unnamed: 0,product_id,product_images,gender,product_category,product_name,details
0,0,[],women,BEAUTY,bar refill,creamy texture multifunction makeup bar availa...
1,1,[],women,BEAUTY,lip oil,creamy lip oil glide like silk achieve uniform...
2,2,[],women,BEAUTY,ultimatte matte liquid lipstick,slick favourite lipstick liquid version covera...
3,3,[{'https://static.zara.net/photos///2022/I/2/1...,women,BEAUTY,face brush,small face brush application powder cream liqu...
4,4,[],women,BEAUTY,face brush,design apply powder cream product perfect blus...


# Feature engineering

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [13]:
required_cols = ["product_name", "details", "product_category"]
for col in required_cols:
    if col not in clothes.columns:
        raise ValueError(f"Missing required column: '{col}'")

clothes["product_name"] = clothes["product_name"].astype(str)
clothes["details"] = clothes["details"].astype(str)
clothes["combined_text"] = clothes["product_name"].fillna('') + " " + clothes["details"].fillna('')

# Defining ambiguous categories to reassign their product items into relevant clear categories already existing in our dataset
ambiguous_cats = ["WORKWEARNEW", "BASICS", "LOUNGEWEARNEW",
                  "SPECIAL PRICES", "LINEN", "ZARA ATHLETICZ", "ZARA ORIGINS",  "KNITWEAR", "CO-ORD SETS", 'PERFUMESNEW', 'ACCESSORIES_JEWELLERY', 'BEAUTY', 'BAGS_BACKPACKS']



In [14]:

# Training a model to reassign product items into the relevant categories

train_df = clothes[~clothes["product_category"].isin(ambiguous_cats)].copy()
predict_df = clothes[clothes["product_category"].isin(ambiguous_cats)].copy()

train_df = train_df.dropna(subset=["product_category"])

counts = train_df["product_category"].value_counts()
valid_cats = counts[counts >= 2].index
train_df = train_df[train_df["product_category"].isin(valid_cats)]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    train_df["combined_text"], train_df["product_category"],
    test_size=0.2, random_state=42, stratify=train_df["product_category"]
)

model = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=20000, ngram_range=(1, 2))),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

# Train model
model.fit(X_train, y_train)

# evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Predict categories for ambiguous rows into final_category
if not predict_df.empty:
    clothes.loc[predict_df.index, "final_category"] = model.predict(predict_df["combined_text"])

clothes["final_category"].fillna(clothes["product_category"], inplace=True)



Accuracy: 0.8998384491114702
                      precision    recall  f1-score   support

         ACCESSORIES       0.96      0.80      0.87        30
                BAGS       1.00      1.00      1.00        22
             BLAZERS       0.91      0.56      0.69        18
   DRESSES_JUMPSUITS       1.00      0.97      0.98       129
 HOODIES_SWEATSHIRTS       0.50      0.83      0.62         6
             JACKETS       0.97      0.83      0.90        36
               JEANS       1.00      1.00      1.00         6
          OVERSHIRTS       0.40      0.80      0.53         5
            PERFUMES       1.00      1.00      1.00         9
         POLO SHIRTS       0.88      0.78      0.82         9
              SHIRTS       0.96      0.90      0.93        77
               SHOES       0.99      0.99      0.99        90
              SHORTS       0.94      1.00      0.97        17
       SHORTS_SKORTS       0.71      0.91      0.80        11
              SKIRTS       1.00      0.6

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clothes["final_category"].fillna(clothes["product_category"], inplace=True)


In [15]:
clothes.loc[clothes["product_category"].isin(ambiguous_cats), "product_category"] = clothes["final_category"]

clothes.drop(columns=["final_category"], inplace=True)


In [16]:
clothes.head()


Unnamed: 0,product_id,product_images,gender,product_category,product_name,details,combined_text
0,0,[],women,ACCESSORIES,bar refill,creamy texture multifunction makeup bar availa...,bar refill creamy texture multifunction makeup...
1,1,[],women,ACCESSORIES,lip oil,creamy lip oil glide like silk achieve uniform...,lip oil creamy lip oil glide like silk achieve...
2,2,[],women,ACCESSORIES,ultimatte matte liquid lipstick,slick favourite lipstick liquid version covera...,ultimatte matte liquid lipstick slick favourit...
3,3,[{'https://static.zara.net/photos///2022/I/2/1...,women,SHOES,face brush,small face brush application powder cream liqu...,face brush small face brush application powder...
4,4,[],women,SHOES,face brush,design apply powder cream product perfect blus...,face brush design apply powder cream product p...


In [17]:
clothes["product_category"].unique()

array(['ACCESSORIES', 'SHOES', 'TOPS_BODYSUITS', 'JACKETS', 'SKIRTS',
       'SHIRTS', 'JEANS', 'SWEATERS_CARDIGANS', 'TROUSERS',
       'WAISTCOATS_GILETS', 'BLAZERS', 'SHORTS_SKORTS', 'BAGS',
       'DRESSES_JUMPSUITS', 'T-SHIRTS_SWEATSHIRTS', 'T-SHIRTS',
       'HOODIES_SWEATSHIRTS', 'PERFUMES', 'SHORTS', 'OVERSHIRTS',
       'POLO SHIRTS', 'SUITS', 'SWIMWEAR'], dtype=object)

In [18]:

print(clothes.shape)


(4185, 7)


In [19]:
import pandas as pd
import re

tops = [
    "T-SHIRTS", "SHIRTS", "BLAZERS", "JACKETS", "OVERSHIRTS",
    "T-SHIRTS_SWEATSHIRTS", "HOODIES_SWEATSHIRTS", "POLO SHIRTS",
    "SWEATERS_CARDIGANS", "WAISTCOATS_GILETS", "KNITWEAR", "TOPS_BODYSUITS"
]
bottoms = ["TROUSERS", "JEANS", "SHORTS", "SKIRTS", "SHORTS_SKORTS"]
shoes = ["SHOES"]
accessories = ["ACCESSORIES"]

def assign_category_type(cat):
    if cat in tops:
        return "Top"
    elif cat in bottoms:
        return "Bottom"
    elif cat in shoes:
        return "Shoes"
    elif cat in accessories:
        return "Accessories"
    else:
        return "Other"

clothes["category_type"] = clothes["product_category"].apply(assign_category_type)

# --- Mappings ---
material_mapping = [
    'neoprene', 'denim', 'cashmere', 'linenviscose', 'silk', 'cotton',
    'jute', 'woolcotton', 'alpaca', 'ramiecotton', 'polyamide', 'satiny',
    'knit', 'linen', 'gabardinestyle', 'nylon', 'juteline', 'corduroycotton',
    'faux', 'leathercotton', 'cottonlinen', 'suede', 'rubberise', 'viscosecotton',
    'linencotton', 'vinylcotton', 'modal', 'liocell', 'semisheer', 'chiffon',
    'ramie', 'woolcanvas', 'gabardinetype', 'satin', 'lyocell', 'polyurethane',
    'polyester', 'denimcotton', 'organza', 'silkcotton', 'acetate', 'polyesterorganza',
    'gabardine', 'neoprenecotton', 'leather', 'viscose', 'patent', 'velvet',
    'wool', 'poplin', 'woolwool', 'linen','liocellcotton'
]

weather_mapping = ["breathability", "frontbreathableantiodour",
   "absorbent", "absorption","water", "waterrepellent","wind", "windbreaker","thermal", "thermos", "thermoseale", "insulation", "insulate",
    "lightweight", "lightness", "lightly",
    "anorak", "bomber", "hoodie", "raincoat", "trench coat", "puffer", "puffy",
    "sweater", "sweatshirt", "cardigan", "knit", "pullover"
]

coverage_mapping = [
    "short", "shortsstyle", "sleeveless", "sleevelessfitte", "mini", "crop", "cropped", "midi", "long", "longline", "longleg"
]

product_subtype_mapping = [
    'backpack', 'bag', 'band', 'bandana', 'beanie', 'belt', 'bracelet', 'brief', 'cap', 'bottle', 'bow', 'hat', 'box', 'earring', 'earphone', 'necklace', 'pendant', 'ring', 'tote', 'towel', 'trunk',
    'blazer', 'belt', 'cape', 'lapel', 'lapelless', 'waistcoat', 'vest', 'tuxedo',
    'dress', 'jumpsuit', 'blouse', 'camisole', 'bodysuit', 'bustier', 'halterneck', 'kimono', 'playsuit', 'top',
    'hoodie', 'sweatshirt',
    'jacket', 'coat', 'bomber', 'biker', 'blazer', 'gilet', 'hood', 'hooded', 'trench coat', 'anorak', 'puffer', 'overshirt', 'kimono',
    'jean', 'denim',
    'overshirt', 'shirt',
    'polo', 'shirt',
    'shirt', 'blouse', 'oxford', 'chambray',
    'shoe', 'boot', 'loafer', 'sandal', 'trainer', 'mule', 'clog', 'espadrille', 'ballerina',
    'shorts', 'bermuda', 'chino',
    'short', 'skort', 'bermuda',
    'skirt', 'skort', 'pencil',
    'suit', 'blazer', 'trouser', 'waistcoat',
    'sweater', 'cardigan', 'tank', 'pullover',
    'swim', 'swimsuit', 'trunk','bikini',
    'tshirt', 't-shirt', 'tank',
    'tshirt', 'sweatshirt', 'hoodie',
    'top', 'bodysuit', 'camisole', 'bralette',
    'trouser', 'jean', 'denim',
    'waistcoat', 'gilet', 'vest', 'tuxedo'
]

#  Combine text columns
clothes_features = clothes.copy()
clothes_features["combined_text"] = clothes_features["product_name"].fillna("") + " " + clothes_features["details"].fillna("")

# Multi-match extraction function
def get_all_matches(text, mapping):
    text = str(text).lower()
    found = set()
    for keywords in mapping.values():
        for k in keywords:
            if re.search(rf"\b{k}\b", text):
                found.add(k)
    return " ".join(sorted(found)) if found else "other"

# Extract features
clothes_features["material"] = clothes_features["combined_text"].apply(lambda x: get_all_matches(x, {m:[m] for m in material_mapping}))
clothes_features["product_subtype"] = clothes_features["combined_text"].apply(
    lambda x: get_all_matches(x, {m: [m] for m in product_subtype_mapping})
)
clothes_features["weather_feature"] = clothes_features["combined_text"].apply(
    lambda x: get_all_matches(x, {m: [m] for m in weather_mapping})
)
clothes_features["coverage_feature"] = clothes_features["combined_text"].apply(
    lambda x: get_all_matches(x, {m: [m] for m in coverage_mapping})
)

# Combine the feature columns into a single column called 'combined_features'
clothes_features["combined_features"] = (
    clothes_features["material"].fillna("other") + " " +
    clothes_features["product_subtype"].fillna("other") + " " +
    clothes_features["weather_feature"].fillna("other") + " " +
    clothes_features["coverage_feature"].fillna("other")
)

clothes_features["combined_features"] = clothes_features["combined_features"] \
    .str.replace(r'\bother\b', '', regex=True) \
    .str.replace(r'\s+', ' ', regex=True) \
    .str.strip()

clothes_features.drop(columns=["combined_text"], inplace=True)

clothes_features.head()



Unnamed: 0,product_id,product_images,gender,product_category,product_name,details,category_type,material,product_subtype,weather_feature,coverage_feature,combined_features
0,0,[],women,ACCESSORIES,bar refill,creamy texture multifunction makeup bar availa...,Accessories,other,other,other,other,
1,1,[],women,ACCESSORIES,lip oil,creamy lip oil glide like silk achieve uniform...,Accessories,silk,other,other,other,silk
2,2,[],women,ACCESSORIES,ultimatte matte liquid lipstick,slick favourite lipstick liquid version covera...,Accessories,other,other,other,other,
3,3,[{'https://static.zara.net/photos///2022/I/2/1...,women,SHOES,face brush,small face brush application powder cream liqu...,Shoes,other,other,other,other,
4,4,[],women,SHOES,face brush,design apply powder cream product perfect blus...,Shoes,other,other,other,other,


In [21]:
drop_categories = ['PERFUMESNEW', 'ACCESSORIES_JEWELLERY', 'BEAUTY', 'BAGS', 'PERFUMES', 'BAGS_BACKPACKS']
clothes_features = clothes_features[~clothes_features["product_category"].str.upper().isin(drop_categories)].reset_index(drop=True)


In [22]:
counts = clothes_features['category_type'].value_counts()
print(counts)

category_type
Top            1665
Other           754
Bottom          604
Shoes           493
Accessories     316
Name: count, dtype: int64


In [23]:
# Inspect particular categories

for cat in clothes_features["product_category"].unique():
    var_name = cat.replace(" ", "_") + "_df"
    globals()[var_name] = clothes_features[clothes["product_category"] == cat].copy()


SHIRTS_df.head()

  globals()[var_name] = clothes_features[clothes["product_category"] == cat].copy()
  globals()[var_name] = clothes_features[clothes["product_category"] == cat].copy()
  globals()[var_name] = clothes_features[clothes["product_category"] == cat].copy()
  globals()[var_name] = clothes_features[clothes["product_category"] == cat].copy()
  globals()[var_name] = clothes_features[clothes["product_category"] == cat].copy()
  globals()[var_name] = clothes_features[clothes["product_category"] == cat].copy()
  globals()[var_name] = clothes_features[clothes["product_category"] == cat].copy()
  globals()[var_name] = clothes_features[clothes["product_category"] == cat].copy()
  globals()[var_name] = clothes_features[clothes["product_category"] == cat].copy()
  globals()[var_name] = clothes_features[clothes["product_category"] == cat].copy()
  globals()[var_name] = clothes_features[clothes["product_category"] == cat].copy()
  globals()[var_name] = clothes_features[clothes["product_category"] == cat]

Unnamed: 0,product_id,product_images,gender,product_category,product_name,details,category_type,material,product_subtype,weather_feature,coverage_feature,combined_features
77,77,[],women,SHIRTS,poplin shirt,collar vneck shirt long sleeve buttonup,Top,poplin,shirt,other,long,poplin shirt long
78,78,[],women,SHIRTS,poplin shirt,shirt johnny collar long sleeve buttonup,Top,poplin,shirt,other,long,poplin shirt long
79,79,[],women,SHIRTS,cotton blend oxford shirt,shirt cotton blend johnny collar long sleeve a...,Top,cotton,oxford shirt,other,long,cotton oxford shirt long
80,80,[],women,SHIRTS,oxford shirt,shirt cotton blend johnny collar long sleeve b...,Top,cotton,oxford shirt,other,long,cotton oxford shirt long
81,81,[],women,SHIRTS,stripe oversize shirt,loosefitte shirt feature johnny collar long sl...,Top,other,shirt,other,long,shirt long


In [24]:
clothes_df = clothes_features
clothes_df.to_csv("clothes_explicit.csv", index=False)
