**Merge datasets**

In [32]:
import os
import pandas as pd
from glob import glob

def merge_all_csvs(base_path, save_path):
    all_dfs = []

    for gender in ["Women", "Men"]:
        folder_path = os.path.join(base_path, gender)
        csv_files = glob(os.path.join(folder_path, "*.csv")) + glob(os.path.join(folder_path, "*.CSV"))

        for file in csv_files:
            df = pd.read_csv(file)
            # Standardize columns
            df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]
            df.columns = ["product_images" if c in ["product_image", "product_images"] else c for c in df.columns]

            # Add gender and category
            df["gender"] = gender.lower()
            df["product_category"] = os.path.splitext(os.path.basename(file))[0]

            all_dfs.append(df)

    # Concatenate all and save
    if all_dfs:
        clothes = pd.concat(all_dfs, ignore_index=True, sort=False)
        clothes.to_csv(save_path, index=False)
        print(f"✅ Merged all datasets into '{save_path}' with {clothes.shape[0]} rows and {clothes.shape[1]} columns.")
        return clothes
    else:
        print("⚠️ No CSV files found.")
        return None

base_path = "/home/kloor/code/pawarsp/what-to-wear-today/raw_data"

# Save merged CSV to the same folder
save_path = os.path.join(base_path, "clothes.csv")

# Merge and save
clothes = merge_all_csvs(base_path, save_path)


✅ Merged all datasets into '/home/kloor/code/pawarsp/what-to-wear-today/raw_data/clothes.csv' with 4185 rows and 8 columns.


In [33]:
clothes.head()

Unnamed: 0,unnamed:_0,product_name,link,product_images,price,details,gender,product_category
0,0,3-IN-1 BAR REFILL,https://www.zara.com/in/en/3-in-1-bar-refill-p...,[],"₹ 1,290.00","Creamy texture multifunction makeup bar, avail...",women,BEAUTY
1,2,LIP OIL,https://www.zara.com/in/en/lip-oil-p24130314.html,[],₹ 790.00,This creamy lip oil glides like silk to achiev...,women,BEAUTY
2,4,ULTIMATTE MATTE LIQUID LIPSTICK,https://www.zara.com/in/en/ultimatte-matte-liq...,[],₹ 790.00,"Slick with it. Our favourite lipstick, now in ...",women,BEAUTY
3,10,FACE BRUSH #3,https://www.zara.com/in/en/face-brush--3-p2414...,[{'https://static.zara.net/photos///2022/I/2/1...,₹ 950.00,Small face brush for the application of powder...,women,BEAUTY
4,11,FACE BRUSH #2,https://www.zara.com/in/en/face-brush--2-p2414...,[],"₹ 1,290.00",Designed to apply powder and cream products. P...,women,BEAUTY


**Data cleaning**

In [34]:
clothes = clothes.dropna(subset=["product_name", "details"])

In [35]:
!pip install spacy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [36]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [37]:
import pandas as pd
import re
import os
import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

clothes_path = "/home/kloor/code/pawarsp/what-to-wear-today/raw_data/clothes.csv"
clothes = pd.read_csv(clothes_path)

# Basic cleaning: lowercase, remove punctuation, strip
def basic_clean(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

clothes["product_name"] = clothes["product_name"].apply(basic_clean)
clothes["details"] = clothes["details"].apply(basic_clean)

# remove unwanted words & numbers
def remove_words_numbers(text):
    # Remove 'ZW', 'Zara' (case-insensitive) and all numbers
    text = re.sub(r'\b(zw|zara)\b|\d+', '', text, flags=re.IGNORECASE)
    # Remove extra whitespace again
    text = re.sub(r'\s+', ' ', text).strip()
    return text

clothes["product_name"] = clothes["product_name"].apply(remove_words_numbers)
clothes["details"] = clothes["details"].apply(remove_words_numbers)

# Tokenization,lemmatization,stopword removal
def preprocess_spacy(text):
    if not text:
        return ""
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return " ".join(tokens)

clothes["product_name_clean"] = clothes["product_name"].apply(preprocess_spacy)
clothes["details_clean"] = clothes["details"].apply(preprocess_spacy)

save_path = os.path.join(os.path.dirname(clothes_path), "clothes.csv")
clothes.to_csv(save_path, index=False)
print(f"Preprocessing complete and saved to '{save_path}'")


Preprocessing complete and saved to '/home/kloor/code/pawarsp/what-to-wear-today/raw_data/clothes.csv'


In [38]:
clothes.head()

Unnamed: 0,unnamed:_0,product_name,link,product_images,price,details,gender,product_category,product_name_clean,details_clean
0,0,in bar refill,https://www.zara.com/in/en/3-in-1-bar-refill-p...,[],"₹ 1,290.00",creamy texture multifunction makeup bar availa...,women,BEAUTY,bar refill,creamy texture multifunction makeup bar availa...
1,2,lip oil,https://www.zara.com/in/en/lip-oil-p24130314.html,[],₹ 790.00,this creamy lip oil glides like silk to achiev...,women,BEAUTY,lip oil,creamy lip oil glide like silk achieve uniform...
2,4,ultimatte matte liquid lipstick,https://www.zara.com/in/en/ultimatte-matte-liq...,[],₹ 790.00,slick with it our favourite lipstick now in a ...,women,BEAUTY,ultimatte matte liquid lipstick,slick favourite lipstick liquid version covera...
3,10,face brush,https://www.zara.com/in/en/face-brush--3-p2414...,[{'https://static.zara.net/photos///2022/I/2/1...,₹ 950.00,small face brush for the application of powder...,women,BEAUTY,face brush,small face brush application powder cream liqu...
4,11,face brush,https://www.zara.com/in/en/face-brush--2-p2414...,[],"₹ 1,290.00",designed to apply powder and cream products pe...,women,BEAUTY,face brush,design apply powder cream product perfect blus...


In [39]:

clothes.drop(columns=["product_name", "details", "link", "product_images", "price", "unnamed:_0" ], inplace=True)
clothes.rename(columns={"product_name_clean": "product_name", "details_clean": "details"}, inplace=True)

clothes.to_csv(clothes_path, index=False)
print(f"✅original columns replaced and saved to '{clothes_path}'")

✅original columns replaced and saved to '/home/kloor/code/pawarsp/what-to-wear-today/raw_data/clothes.csv'


In [40]:
clothes.head()

Unnamed: 0,gender,product_category,product_name,details
0,women,BEAUTY,bar refill,creamy texture multifunction makeup bar availa...
1,women,BEAUTY,lip oil,creamy lip oil glide like silk achieve uniform...
2,women,BEAUTY,ultimatte matte liquid lipstick,slick favourite lipstick liquid version covera...
3,women,BEAUTY,face brush,small face brush application powder cream liqu...
4,women,BEAUTY,face brush,design apply powder cream product perfect blus...


In [41]:

drop_categories = ['PERFUMESNEW', 'ACCESSORIES_JEWELLERY', 'BEAUTY', 'BAGS', 'PERFUMES', 'BAGS_BACKPACKS']

clothes = clothes[~clothes["product_category"].str.upper().isin(drop_categories)]
clothes.to_csv(clothes_path, index=False)


**Featuring engineering**

In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [43]:
required_cols = ["product_name", "details", "product_category"]
for col in required_cols:
    if col not in clothes.columns:
        raise ValueError(f"Missing required column: '{col}'")

clothes["product_name"] = clothes["product_name"].astype(str)
clothes["details"] = clothes["details"].astype(str)
clothes["text"] = clothes["product_name"].fillna('') + " " + clothes["details"].fillna('')

# Define ambiguous categories to reassign
ambiguous_cats = ["WORKWEARNEW", "BASICS", "LOUNGEWEARNEW",
                  "SPECIAL PRICES", "LINEN", "ZARA ATHLETICZ", "ZARA ORIGINS", "CO-ORD SETS"]


In [44]:

# Training

train_df = clothes[~clothes["product_category"].isin(ambiguous_cats)].copy()
predict_df = clothes[clothes["product_category"].isin(ambiguous_cats)].copy()

train_df = train_df.dropna(subset=["product_category"])

counts = train_df["product_category"].value_counts()
valid_cats = counts[counts >= 2].index
train_df = train_df[train_df["product_category"].isin(valid_cats)]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    train_df["text"], train_df["product_category"],
    test_size=0.2, random_state=42, stratify=train_df["product_category"]
)

# Build model pipeline
model = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=20000, ngram_range=(1, 2))),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

# Train model
model.fit(X_train, y_train)

# evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Predict categories for ambiguous rows into final_category
if not predict_df.empty:
    clothes.loc[predict_df.index, "final_category"] = model.predict(predict_df["text"])

clothes["final_category"].fillna(clothes["product_category"], inplace=True)

clothes.to_csv("clothes_updated_final.csv", index=False)

print("Full CSV updated")


Accuracy: 0.8562401263823065
                      precision    recall  f1-score   support

         ACCESSORIES       1.00      0.77      0.87        30
             BLAZERS       0.78      0.78      0.78        18
   DRESSES_JUMPSUITS       0.95      0.94      0.94       129
 HOODIES_SWEATSHIRTS       0.50      0.50      0.50         6
             JACKETS       0.85      0.92      0.88        36
               JEANS       1.00      1.00      1.00         6
            KNITWEAR       0.64      0.61      0.62        46
          OVERSHIRTS       0.60      0.60      0.60         5
         POLO SHIRTS       0.50      1.00      0.67         9
              SHIRTS       0.93      0.91      0.92        77
               SHOES       0.99      0.99      0.99        90
              SHORTS       0.89      1.00      0.94        17
       SHORTS_SKORTS       0.82      0.82      0.82        11
              SKIRTS       0.71      0.83      0.77        12
  SWEATERS_CARDIGANS       0.50      0.3

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clothes["final_category"].fillna(clothes["product_category"], inplace=True)


In [45]:
clothes.head()


Unnamed: 0,gender,product_category,product_name,details,text,final_category
18,women,SKIRTS,z denim midi skirt,fade highwaist skirt feature fivepocket design...,z denim midi skirt fade highwaist skirt featur...,SKIRTS
19,women,SKIRTS,faux leather mini skirt chain,highwaist mini skirt lining metal chain link i...,faux leather mini skirt chain highwaist mini s...,SKIRTS
20,women,SKIRTS,skort pocket,midwaist skort adjustable elasticate waistband...,skort pocket midwaist skort adjustable elastic...,SKIRTS
21,women,SKIRTS,long satin skirt,long skirt high elastic waist,long satin skirt long skirt high elastic waist,SKIRTS
22,women,SKIRTS,knot midi skirt,highwaist flow midi skirt knot detail slit hem...,knot midi skirt highwaist flow midi skirt knot...,SKIRTS


In [46]:
clothes.drop(columns=["text"], inplace=True)


In [47]:
clothes.to_csv(clothes_path, index=False)

In [48]:
clothes.head()

Unnamed: 0,gender,product_category,product_name,details,final_category
18,women,SKIRTS,z denim midi skirt,fade highwaist skirt feature fivepocket design...,SKIRTS
19,women,SKIRTS,faux leather mini skirt chain,highwaist mini skirt lining metal chain link i...,SKIRTS
20,women,SKIRTS,skort pocket,midwaist skort adjustable elasticate waistband...,SKIRTS
21,women,SKIRTS,long satin skirt,long skirt high elastic waist,SKIRTS
22,women,SKIRTS,knot midi skirt,highwaist flow midi skirt knot detail slit hem...,SKIRTS


In [49]:
clothes.loc[clothes["product_category"].isin(ambiguous_cats), "product_category"] = clothes["final_category"]

clothes.drop(columns=["final_category"], inplace=True)


In [50]:

clothes.to_csv(clothes_path, index=False)
clothes.head()

Unnamed: 0,gender,product_category,product_name,details
18,women,SKIRTS,z denim midi skirt,fade highwaist skirt feature fivepocket design...
19,women,SKIRTS,faux leather mini skirt chain,highwaist mini skirt lining metal chain link i...
20,women,SKIRTS,skort pocket,midwaist skort adjustable elasticate waistband...
21,women,SKIRTS,long satin skirt,long skirt high elastic waist
22,women,SKIRTS,knot midi skirt,highwaist flow midi skirt knot detail slit hem...


In [51]:

print(clothes.shape)


(3633, 4)


In [52]:
# --- 1. Define lists for each broad category ---
tops = [
    "T-SHIRTS", "SHIRTS", "BLAZERS", "JACKETS", "OVERSHIRTS",
    "T-SHIRTS_SWEATSHIRTS", "HOODIES_SWEATSHIRTS", "POLO SHIRTS",
    "SWEATERS_CARDIGANS", "WAISTCOATS_GILETS", "KNITWEAR", "TOPS_BODYSUITS"
]

bottoms = [
    "TROUSERS", "JEANS", "SHORTS", "SKIRTS", "SHORTS_SKORTS", "DRESSES_JUMPSUITS"
]

shoes = ["SHOES"]

accessories = ["ACCESSORIES"]

# --- 2. Define function to map product_category to category_type ---
def assign_category_type(cat):
    if cat in tops:
        return "Top"
    elif cat in bottoms:
        return "Bottom"
    elif cat in shoes:
        return "Shoes"
    elif cat in accessories:
        return "Accessories"
    else:
        return "Other"

# --- 3. Apply function to create new column ---
clothes["category_type"] = clothes["product_category"].apply(assign_category_type)
print(clothes[["product_category", "category_type"]].drop_duplicates())


          product_category category_type
18                  SKIRTS        Bottom
77                  SHIRTS           Top
82                   JEANS        Bottom
84                KNITWEAR           Top
88                TROUSERS        Bottom
89       WAISTCOATS_GILETS           Top
90                 BLAZERS           Top
94           SHORTS_SKORTS        Bottom
96                 JACKETS           Top
211      DRESSES_JUMPSUITS        Bottom
961         TOPS_BODYSUITS           Top
1069  T-SHIRTS_SWEATSHIRTS           Top
1409           ACCESSORIES   Accessories
1421                 SHOES         Shoes
1440              T-SHIRTS           Top
2064   HOODIES_SWEATSHIRTS           Top
2075                SHORTS        Bottom
2724            OVERSHIRTS           Top
2749           POLO SHIRTS           Top
2796                 SUITS         Other
3014              SWIMWEAR         Other
3665    SWEATERS_CARDIGANS           Top


In [53]:
clothes.to_csv(clothes_path, index=False)
clothes.head()

Unnamed: 0,gender,product_category,product_name,details,category_type
18,women,SKIRTS,z denim midi skirt,fade highwaist skirt feature fivepocket design...,Bottom
19,women,SKIRTS,faux leather mini skirt chain,highwaist mini skirt lining metal chain link i...,Bottom
20,women,SKIRTS,skort pocket,midwaist skort adjustable elasticate waistband...,Bottom
21,women,SKIRTS,long satin skirt,long skirt high elastic waist,Bottom
22,women,SKIRTS,knot midi skirt,highwaist flow midi skirt knot detail slit hem...,Bottom


In [54]:
for cat in clothes["category_type"].unique():
    var_name = cat.replace(" ", "_").lower() + "_df"
    globals()[var_name] = clothes[clothes["category_type"] == cat].copy()

In [58]:
top_df.head()

Unnamed: 0,gender,product_category,product_name,details,category_type
77,women,SHIRTS,poplin shirt,collar vneck shirt long sleeve buttonup,Top
78,women,SHIRTS,poplin shirt,shirt johnny collar long sleeve buttonup,Top
79,women,SHIRTS,cotton blend oxford shirt,shirt cotton blend johnny collar long sleeve a...,Top
80,women,SHIRTS,oxford shirt,shirt cotton blend johnny collar long sleeve b...,Top
81,women,SHIRTS,stripe oversize shirt,loosefitte shirt feature johnny collar long sl...,Top
