In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
# Load the cleaned fashion dataset CSV from the Kaggle input directory into `df`.
df = pd.read_csv("/kaggle/input/cleaned-csv-fashion/Cleaned_FashionDataset.csv")

In [None]:
def Know_the_df(df):
    """Print a quick overview of a DataFrame.

    Outputs, separated by rows of asterisks: the ``info()`` summary, the
    shape, the ``describe()`` statistics, the per-column null counts and a
    random sample of up to 10 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
    """
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be passed to print() (that would emit a stray "None").
    print("Info: ")
    df.info()
    print("****************")
    print("Shape :",df.shape)
    print("****************")
    print("Description :",df.describe())
    print("****************")
    print("Sum of Null values :",df.isnull().sum())
    print("****************")
    # Cap the sample size so frames with fewer than 10 rows do not raise.
    print("Random samples :",df.sample(min(10, len(df))))
# Print the overview (info, shape, statistics, null counts, sample) for the loaded data.
Know_the_df(df)

In [None]:
# Show the first rows of the loaded data.
df.head()

In [None]:
# Difference between the MRP and SellPrice columns. The subtraction already
# yields an index-aligned Series, so it is assigned directly instead of being
# wrapped in a one-column DataFrame first.
df["Actual Discount"] = df["MRP"] - df["SellPrice"]

In [None]:
# Confirm the new "Actual Discount" column is present.
df.head()

In [None]:
import ast

# Convert string to list
def clean_sizes(s):
    """Parse a stringified list of sizes into a clean list of labels.

    Parameters
    ----------
    s : str
        Python-literal representation of a sequence of size strings,
        e.g. ``"['s', ' M ']"``.

    Returns
    -------
    list of str
        Stripped, upper-cased, de-duplicated labels in first-seen order.
        An empty list is returned when ``s`` cannot be parsed (missing
        value, malformed literal, non-string items).
    """
    try:
        parsed = ast.literal_eval(s)
        labels = [item.strip().upper() for item in parsed if item]
    except (ValueError, SyntaxError, TypeError, AttributeError):
        # Only the failures expected from bad data are swallowed; a bare
        # ``except`` would also hide KeyboardInterrupt and genuine bugs.
        return []
    # dict.fromkeys de-duplicates while keeping a reproducible order (set
    # iteration order for strings can change between interpreter runs), and
    # entries that consisted only of whitespace are dropped.
    return list(dict.fromkeys(label for label in labels if label))

# Parse every "Sizes" value with clean_sizes and store the resulting list in a new column.
df['Sizes_list'] = df['Sizes'].apply(clean_sizes)


In [None]:
# Canonical size label -> spellings / numeric sizes treated as equivalent to it
size_map = {
    'XS': ['XS', 'X-Small', 'Extra Small', 'XX-Small'],
    'S': ['S', 'Small', '30', '32'],
    'M': ['M', 'Medium', '34', '36'],
    'L': ['L', 'Large', '38'],
    'XL': ['XL', 'X-Large', '40'],
    'XXL': ['XXL', 'XX-Large', '42'],
    'XXXL': ['XXXL', 'XXX-Large', '44'],
}
# Invert the table: upper-cased alias -> canonical label, so that lookups
# can be done case-insensitively.
reverse_map = {
    alias.upper(): standard_size
    for standard_size, aliases in size_map.items()
    for alias in aliases
}

# Normalize sizes in each list: map every alias to its standard label (labels
# without a known alias are kept as-is). Several aliases can collapse onto the
# same standard size (e.g. 'S' and 'SMALL' both become 'S'), so the mapped
# list is de-duplicated again while preserving its order.
df["Sizes_list"] = df["Sizes_list"].apply(
    lambda size_list: list(
        dict.fromkeys(reverse_map.get(size.upper(), size) for size in size_list)
    )
)

#from sklearn.preprocessing import MultiLabelBinarizer

#mlb = MultiLabelBinarizer()
#size_encoded = mlb.fit_transform(df['Sizes_list'])

#size_df = pd.DataFrame(size_encoded, columns=[f'Size_{s}' for s in mlb.classes_], index=df.index)
#df = pd.concat([df, size_df], axis=1)


In [None]:
# Inspect Sizes_list after the alias normalisation.
df.head()

In [None]:
# Join each row's size labels into one space-separated "document" for TF-IDF.
df['Sizes_str'] = df['Sizes_list'].apply(lambda x: " ".join(x))
from sklearn.feature_extraction.text import TfidfVectorizer

# The default token_pattern only keeps tokens of two or more word characters,
# which silently discards the single-letter sizes S, M and L and splits
# hyphenated labels. Treat every whitespace-separated label as one token.
tfidf = TfidfVectorizer(token_pattern=r"(?u)\S+")
tfidf_features = tfidf.fit_transform(df['Sizes_str'])


In [None]:
# Sanity-check the dimensions of the size TF-IDF matrix.
print(tfidf_features.shape)  # Should be (num_rows, num_unique_size_tokens)

In [None]:
# Densify the TF-IDF matrix into a frame aligned with df, with one
# "tfidf_<token>" column per vocabulary entry, and append it to df.
size_token_columns = [f"tfidf_{token}" for token in tfidf.get_feature_names_out()]
tfidf_df = pd.DataFrame(
    tfidf_features.toarray(),
    index=df.index,
    columns=size_token_columns,
)
df = pd.concat([df, tfidf_df], axis="columns")


In [None]:
# Preview the frame with the tfidf_* size columns appended.
df.head()

### The dimensionality increases to about 100 columns, which I think might add a lot of noise, so I'll drop the size features for model training, as they do not have much impact on the target ("SellPrice").

In [None]:
# Keep only the listed columns (this discards Sizes_str and the tfidf_* size
# columns). .copy() makes the result an independent frame, so adding columns to
# it later does not trigger SettingWithCopyWarning on pandas versions without
# copy-on-write.
df = df[["BrandName", "Details", "Sizes_list", "MRP", "Category", "SellPrice","Actual Discount"]].copy()


In [None]:
# Confirm only the selected columns remain.
df.head()

In [None]:
from sklearn.model_selection import KFold

# Out-of-fold target encoding of BrandName: each row receives the mean
# SellPrice of its brand computed only from the other folds, so a row's own
# price never contributes to its encoding.
# assign() returns a new frame, which avoids setting a column on a slice.
df = df.assign(BrandName_target=np.nan)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
target_col = df.columns.get_loc('BrandName_target')

for train_idx, val_idx in kf.split(df):
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    brand_mean = train_df.groupby('BrandName')['SellPrice'].mean()
    # Brands that do not occur in the training folds have no mean to map to;
    # fall back to the overall training-fold mean instead of leaving NaN.
    fallback_mean = train_df['SellPrice'].mean()
    encoded = val_df['BrandName'].map(brand_mean).fillna(fallback_mean)

    # KFold yields positional indices, so write back by position (iloc);
    # .loc would only be correct while the index is a default RangeIndex.
    df.iloc[val_idx, target_col] = encoded.to_numpy()

print(df[['BrandName', 'SellPrice', 'BrandName_target']].sample(40))

In [None]:
# One-hot encode Category as integer 0/1 columns prefixed "Category_";
# drop_first omits the first level so the dummies are not fully redundant.
category_dummies = pd.get_dummies(
    df['Category'],
    prefix='Category',
    drop_first=True,
    dtype='int',
)
df = pd.concat([df, category_dummies], axis="columns")
df.sample(8)

In [None]:
# Drop the listed columns from the feature frame and preview the result.
drop_cols = ['Sizes_list', 'BrandName', 'Category','Actual Discount']
df = df.drop(labels=drop_cols, axis="columns")
df.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF on the Details text, restricted to the 100 most frequent terms and
# with English stop words removed.
tfidf_details = TfidfVectorizer(max_features=100, stop_words='english')
# TfidfVectorizer rejects NaN documents, so missing descriptions are treated
# as empty text instead of crashing the fit.
details_tfidf_matrix = tfidf_details.fit_transform(df['Details'].fillna(""))


# One "tfidf_details_<word>" column per retained term, aligned with df's index.
details_tfidf_df = pd.DataFrame(
    details_tfidf_matrix.toarray(),
    columns=[f"tfidf_details_{word}" for word in tfidf_details.get_feature_names_out()],
    index=df.index
)

df = pd.concat([df, details_tfidf_df], axis=1)

df.head()

In [None]:
from sklearn.decomposition import PCA
# Identify all TF-IDF columns derived from the Details text
tfidf_cols = [col for col in df.columns if col.startswith("tfidf_details_")]

# Reduce to, say, 30 components — tune as needed.
# random_state is fixed because the randomized SVD solver (which PCA may
# select automatically) is otherwise not reproducible between runs.
pca = PCA(n_components=30, random_state=42)
pca_features = pca.fit_transform(df[tfidf_cols])


In [None]:
# Wrap the PCA output in a frame aligned with df, naming the components
# pca_details_1 ... pca_details_N, then append them as new columns.
pca_columns = [f'pca_details_{k}' for k in range(1, pca_features.shape[1] + 1)]
pca_df = pd.DataFrame(pca_features, index=df.index, columns=pca_columns)

df = pd.concat([df, pca_df], axis="columns")

In [None]:
# The PCA components have been added to df, so remove the raw TF-IDF columns
# they were computed from. Rebinding (instead of inplace=True) together with
# errors="ignore" keeps the cell safe to re-run once the columns are gone.
df = df.drop(columns=tfidf_cols, errors="ignore")

In [None]:
# Check the row/column count after removing the raw TF-IDF columns.
df.shape

In [None]:
# Preview the frame; the "Details" text column is still present at this point.
df.head()

In [None]:
# Drop the free-text Details column now that features have been derived from
# it. Rebinding (instead of inplace=True) together with errors="ignore" keeps
# the cell safe to re-run once the column is gone.
df = df.drop(columns=["Details"], errors="ignore")
df.head()

In [None]:
# Persist the engineered features to the Kaggle working directory;
# index=False leaves the row index out of the CSV.
df.to_csv("/kaggle/working/processed_fe_done_fashion_data.csv", index=False)