In [218]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Merge datasets**

In [219]:
import os
import pandas as pd
from glob import glob

def merge_all_csvs(base_path, save_path):
    all_dfs = []

    for gender in ["Women", "Men"]:
        folder_path = os.path.join(base_path, gender)
        csv_files = glob(os.path.join(folder_path, "*.csv")) + glob(os.path.join(folder_path, "*.CSV"))

        for file in csv_files:
            df = pd.read_csv(file)
            # Standardize columns
            df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]
            df.columns = ["product_images" if c in ["product_image", "product_images"] else c for c in df.columns]

            # Add gender and category
            df["gender"] = gender.lower()
            df["product_category"] = os.path.splitext(os.path.basename(file))[0]

            all_dfs.append(df)

    # Concatenate all and save
    if all_dfs:
        clothes = pd.concat(all_dfs, ignore_index=True, sort=False)
        clothes.to_csv(save_path, index=False)
        print(f"✅ Merged all datasets into '{save_path}' with {clothes.shape[0]} rows and {clothes.shape[1]} columns.")
        return clothes
    else:
        print("⚠️ No CSV files found.")
        return None

base_path = "/content/drive/MyDrive/Colab Notebooks/What to wear"
save_path = os.path.join(base_path, "clothes.csv")

# Merge and save
clothes = merge_all_csvs(base_path, save_path)


✅ Merged all datasets into '/content/drive/MyDrive/Colab Notebooks/What to wear/clothes.csv' with 4185 rows and 8 columns.


In [220]:
clothes.head()

Unnamed: 0,unnamed:_0,product_name,link,product_images,price,details,gender,product_category
0,0,OVERSIZE TRENCH COAT WITH POCKETS,https://www.zara.com/in/en/oversize-trench-coa...,[{'https://static.zara.net/photos///2023/I/0/1...,"₹ 7,590.00",Collared trench coat made of a cotton blend. F...,women,WORKWEARNEW
1,1,KNIT SWEATER WITH ZIP,https://www.zara.com/in/en/knit-sweater-with-z...,[{'https://static.zara.net/photos///2023/I/0/1...,"₹ 3,290.00",Sweater made of a viscose blend. Featuring a h...,women,WORKWEARNEW
2,2,ZW MARINE STRAIGHT FIT HIGH-WAIST POCKET JEANS,https://www.zara.com/in/en/zw-marine-straight-...,[{'https://static.zara.net/photos///2023/I/0/1...,"₹ 3,290.00",Slim Fit - Relaxed Leg - High WaistZW ZARA WOM...,women,WORKWEARNEW
3,3,CROSSBODY BAG,https://www.zara.com/in/en/crossbody-bag-p1600...,[{'https://static.zara.net/photos///2023/I/1/1...,"₹ 2,890.00",Crossbody bag with adjustable strap. Lining. Z...,women,WORKWEARNEW
4,4,ZW MARINE STRAIGHT-LEG HIGH-WAIST JEANS,https://www.zara.com/in/en/zw-marine-straight-...,[{'https://static.zara.net/photos///2023/I/0/1...,"₹ 3,290.00",Slim Fit - Relaxed Leg - High WaistZW ZARA WOM...,women,WORKWEARNEW


**Data cleaning**

In [221]:
clothes = clothes.dropna(subset=["product_name", "details"])

In [222]:
import pandas as pd
import re
import os
import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

clothes_path = "/content/drive/MyDrive/Colab Notebooks/What to wear/clothes.csv"
clothes = pd.read_csv(clothes_path)

# Basic cleaning: lowercase, remove punctuation, strip
def basic_clean(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

clothes["product_name"] = clothes["product_name"].apply(basic_clean)
clothes["details"] = clothes["details"].apply(basic_clean)

# remove unwanted words & numbers
def remove_words_numbers(text):
    # Remove 'ZW', 'Zara' (case-insensitive) and all numbers
    text = re.sub(r'\b(zw|zara)\b|\d+', '', text, flags=re.IGNORECASE)
    # Remove extra whitespace again
    text = re.sub(r'\s+', ' ', text).strip()
    return text

clothes["product_name"] = clothes["product_name"].apply(remove_words_numbers)
clothes["details"] = clothes["details"].apply(remove_words_numbers)

# Tokenization,lemmatization,stopword removal
def preprocess_spacy(text):
    if not text:
        return ""
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return " ".join(tokens)

clothes["product_name_clean"] = clothes["product_name"].apply(preprocess_spacy)
clothes["details_clean"] = clothes["details"].apply(preprocess_spacy)

# Save cleaned CSV
clothes.to_csv(clothes_path, index=False)
print(f"Preprocessing complete and saved to '{clothes_path}'")


✅ Preprocessing complete and saved to '/content/drive/MyDrive/Colab Notebooks/What to wear/clothes.csv'


In [223]:
clothes.head()

Unnamed: 0,unnamed:_0,product_name,link,product_images,price,details,gender,product_category,product_name_clean,details_clean
0,0,oversize trench coat with pockets,https://www.zara.com/in/en/oversize-trench-coa...,[{'https://static.zara.net/photos///2023/I/0/1...,"₹ 7,590.00",collared trench coat made of a cotton blend fe...,women,WORKWEARNEW,oversize trench coat pocket,collared trench coat cotton blend feature long...
1,1,knit sweater with zip,https://www.zara.com/in/en/knit-sweater-with-z...,[{'https://static.zara.net/photos///2023/I/0/1...,"₹ 3,290.00",sweater made of a viscose blend featuring a hi...,women,WORKWEARNEW,knit sweater zip,sweater viscose blend feature high neck metal ...
2,2,marine straight fit highwaist pocket jeans,https://www.zara.com/in/en/zw-marine-straight-...,[{'https://static.zara.net/photos///2023/I/0/1...,"₹ 3,290.00",slim fit relaxed leg high waistzw woman jeansh...,women,WORKWEARNEW,marine straight fit highwaist pocket jean,slim fit relaxed leg high waistzw woman jeansh...
3,3,crossbody bag,https://www.zara.com/in/en/crossbody-bag-p1600...,[{'https://static.zara.net/photos///2023/I/1/1...,"₹ 2,890.00",crossbody bag with adjustable strap lining zip...,women,WORKWEARNEW,crossbody bag,crossbody bag adjustable strap lining zip clos...
4,4,marine straightleg highwaist jeans,https://www.zara.com/in/en/zw-marine-straight-...,[{'https://static.zara.net/photos///2023/I/0/1...,"₹ 3,290.00",slim fit relaxed leg high waistzw woman jeansh...,women,WORKWEARNEW,marine straightleg highwaist jean,slim fit relaxed leg high waistzw woman jeansh...


In [224]:

clothes.drop(columns=["product_name", "details", "link", "product_images", "price", "unnamed:_0" ], inplace=True)
clothes.rename(columns={"product_name_clean": "product_name", "details_clean": "details"}, inplace=True)

clothes.to_csv(clothes_path, index=False)
print(f"✅original columns replaced and saved to '{clothes_path}'")

✅ Preprocessing complete, original columns replaced, and saved to '/content/drive/MyDrive/Colab Notebooks/What to wear/clothes.csv'


In [225]:
clothes.head()

Unnamed: 0,gender,product_category,product_name,details
0,women,WORKWEARNEW,oversize trench coat pocket,collared trench coat cotton blend feature long...
1,women,WORKWEARNEW,knit sweater zip,sweater viscose blend feature high neck metal ...
2,women,WORKWEARNEW,marine straight fit highwaist pocket jean,slim fit relaxed leg high waistzw woman jeansh...
3,women,WORKWEARNEW,crossbody bag,crossbody bag adjustable strap lining zip clos...
4,women,WORKWEARNEW,marine straightleg highwaist jean,slim fit relaxed leg high waistzw woman jeansh...


In [226]:

drop_categories = ['PERFUMESNEW', 'ACCESSORIES_JEWELLERY', 'BEAUTY', 'BAGS', 'PERFUMES', 'BAGS_BACKPACKS']

clothes = clothes[~clothes["product_category"].str.upper().isin(drop_categories)]
clothes.to_csv(clothes_path, index=False)


**Featuring engineering**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [None]:
required_cols = ["product_name", "details", "product_category"]
for col in required_cols:
    if col not in clothes.columns:
        raise ValueError(f"Missing required column: '{col}'")

clothes["product_name"] = clothes["product_name"].astype(str)
clothes["details"] = clothes["details"].astype(str)
clothes["text"] = clothes["product_name"].fillna('') + " " + clothes["details"].fillna('')

# Define ambiguous categories to reassign
ambiguous_cats = ["WORKWEARNEW", "BASICS", "LOUNGEWEARNEW",
                  "SPECIAL PRICES", "LINEN", "ZARA ATHLETICZ", "ZARA ORIGINS", "CO-ORD SETS"]


In [227]:

# Training

train_df = clothes[~clothes["product_category"].isin(ambiguous_cats)].copy()
predict_df = clothes[clothes["product_category"].isin(ambiguous_cats)].copy()

train_df = train_df.dropna(subset=["product_category"])

counts = train_df["product_category"].value_counts()
valid_cats = counts[counts >= 2].index
train_df = train_df[train_df["product_category"].isin(valid_cats)]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    train_df["text"], train_df["product_category"],
    test_size=0.2, random_state=42, stratify=train_df["product_category"]
)

# Build model pipeline
model = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_features=20000, ngram_range=(1, 2))),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

# Train model
model.fit(X_train, y_train)

# evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Predict categories for ambiguous rows into final_category
if not predict_df.empty:
    clothes.loc[predict_df.index, "final_category"] = model.predict(predict_df["text"])

clothes["final_category"].fillna(clothes["product_category"], inplace=True)

clothes.to_csv("clothes_updated_final.csv", index=False)

print("✅ Full CSV updated)


✅ 'text' column created successfully!
Accuracy: 0.8562401263823065
                      precision    recall  f1-score   support

         ACCESSORIES       1.00      0.77      0.87        30
             BLAZERS       0.78      0.78      0.78        18
   DRESSES_JUMPSUITS       0.95      0.94      0.94       129
 HOODIES_SWEATSHIRTS       0.50      0.50      0.50         6
             JACKETS       0.85      0.92      0.88        36
               JEANS       1.00      1.00      1.00         6
            KNITWEAR       0.64      0.61      0.62        46
          OVERSHIRTS       0.60      0.60      0.60         5
         POLO SHIRTS       0.50      1.00      0.67         9
              SHIRTS       0.93      0.91      0.92        77
               SHOES       0.99      0.99      0.99        90
              SHORTS       0.89      1.00      0.94        17
       SHORTS_SKORTS       0.82      0.82      0.82        11
              SKIRTS       0.71      0.83      0.77        12
  

In [228]:
clothes.head()


Unnamed: 0,gender,product_category,product_name,details,text,final_category
0,women,WORKWEARNEW,oversize trench coat pocket,collared trench coat cotton blend feature long...,oversize trench coat pocket collared trench co...,JACKETS
1,women,WORKWEARNEW,knit sweater zip,sweater viscose blend feature high neck metal ...,knit sweater zip sweater viscose blend feature...,KNITWEAR
2,women,WORKWEARNEW,marine straight fit highwaist pocket jean,slim fit relaxed leg high waistzw woman jeansh...,marine straight fit highwaist pocket jean slim...,JEANS
3,women,WORKWEARNEW,crossbody bag,crossbody bag adjustable strap lining zip clos...,crossbody bag crossbody bag adjustable strap l...,ACCESSORIES
4,women,WORKWEARNEW,marine straightleg highwaist jean,slim fit relaxed leg high waistzw woman jeansh...,marine straightleg highwaist jean slim fit rel...,JEANS


In [229]:
clothes.drop(columns=["text"], inplace=True)


In [230]:
clothes.to_csv(clothes_path, index=False)

In [231]:
clothes.head()

Unnamed: 0,gender,product_category,product_name,details,final_category
0,women,WORKWEARNEW,oversize trench coat pocket,collared trench coat cotton blend feature long...,JACKETS
1,women,WORKWEARNEW,knit sweater zip,sweater viscose blend feature high neck metal ...,KNITWEAR
2,women,WORKWEARNEW,marine straight fit highwaist pocket jean,slim fit relaxed leg high waistzw woman jeansh...,JEANS
3,women,WORKWEARNEW,crossbody bag,crossbody bag adjustable strap lining zip clos...,ACCESSORIES
4,women,WORKWEARNEW,marine straightleg highwaist jean,slim fit relaxed leg high waistzw woman jeansh...,JEANS


In [232]:
clothes.loc[clothes["product_category"].isin(ambiguous_cats), "product_category"] = clothes["final_category"]

clothes.drop(columns=["final_category"], inplace=True)


In [233]:

clothes.to_csv(clothes_path, index=False)
clothes.head()

Unnamed: 0,gender,product_category,product_name,details
0,women,JACKETS,oversize trench coat pocket,collared trench coat cotton blend feature long...
1,women,KNITWEAR,knit sweater zip,sweater viscose blend feature high neck metal ...
2,women,JEANS,marine straight fit highwaist pocket jean,slim fit relaxed leg high waistzw woman jeansh...
3,women,ACCESSORIES,crossbody bag,crossbody bag adjustable strap lining zip clos...
4,women,JEANS,marine straightleg highwaist jean,slim fit relaxed leg high waistzw woman jeansh...


In [234]:

print(clothes.shape)


(3633, 4)
