In [1]:
# Import necessary libraries
import pandas as pd
import re
import ast
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [2]:
# Step 1: Read the dataset CSV into the working DataFrame
dataset_path = "Trendix_Dataset.csv"
df = pd.read_csv(dataset_path)

In [3]:



# Function to extract specifications safely
def extract_specifications(spec_column):
    """Parse one 'Specifications' cell into a pandas Series.

    Parameters
    ----------
    spec_column : object
        A single cell value. Only strings holding a Python dict literal
        (for example "{'Fabric': 'Cotton'}") are parsed.

    Returns
    -------
    pandas.Series
        One entry per dict key. An empty Series is returned when the value
        is not a string, cannot be parsed, or does not evaluate to a
        non-empty dict.
    """
    empty = pd.Series(dtype=object)  # explicit dtype avoids the empty-Series warning
    if not isinstance(spec_column, str):
        return empty
    try:
        # Parse the text as-is: ast.literal_eval accepts both quote styles,
        # and rewriting ' to " corrupts values that contain an apostrophe
        # (e.g. "Men's"), which previously made the whole row unparseable.
        parsed = ast.literal_eval(spec_column)
    except (ValueError, SyntaxError, TypeError):
        return empty
    # Scalars or lists would otherwise expand into stray positional columns.
    if not isinstance(parsed, dict) or not parsed:
        return empty
    return pd.Series(parsed)

# Expand every 'Specifications' cell into its own columns; keys a row does not
# have come out as NaN and are replaced with empty strings
specifications_df = df['Specifications'].apply(extract_specifications).fillna('')

# Show every column when a DataFrame is printed
pd.set_option('display.max_columns', None)

# Attach the extracted columns and discard the original 'Specifications' column
df = pd.concat([df, specifications_df], axis=1).drop(columns=['Specifications'])

# Display the updated DataFrame (optional)
print(df)



       Serial_No       Fashion_type  Product_id               Brand_Name  \
0              1            blazers  17464542.0                      H&M   
1              2            blazers  24970972.0        Allen Solly Woman   
2              3            blazers  26893598.0                    Arrow   
3              4            blazers  28844176.0                    MANGO   
4              5            blazers  28774520.0                    MANGO   
...          ...                ...         ...                      ...   
24173        994  Women Office Wear  30111247.0           Style Quotient   
24174        995  Women Office Wear  27631984.0                Chemistry   
24175        996  Women Office Wear  29328582.0  Annabelle by Pantaloons   
24176        997  Women Office Wear  29173318.0                 Cantabil   
24177        998  Women Office Wear  28324258.0                   FITHUB   

             category                                        Description  \
0      wome

In [4]:
# Remove the 'Serial_No' and 'Product_id' columns from the DataFrame
columns_to_drop = ['Serial_No', 'Product_id']
df = df.drop(columns=columns_to_drop)

# Print the DataFrame so the remaining columns can be checked
print(df)


            Fashion_type               Brand_Name        category  \
0                blazers                      H&M  women-clothing   
1                blazers        Allen Solly Woman  women-clothing   
2                blazers                    Arrow    men-clothing   
3                blazers                    MANGO  women-clothing   
4                blazers                    MANGO  women-clothing   
...                  ...                      ...             ...   
24173  Women Office Wear           Style Quotient  women-clothing   
24174  Women Office Wear                Chemistry  women-clothing   
24175  Women Office Wear  Annabelle by Pantaloons  women-clothing   
24176  Women Office Wear                 Cantabil  women-clothing   
24177  Women Office Wear                   FITHUB  women-clothing   

                                             Description  Rating Rating_count  \
0                              Women Black Fitted Jacket     4.4   951Ratings   
1      No

In [5]:
# Function to extract ratings safely
def extract_ratings(rating_column):
    """Parse one 'Rating_as_stars' cell into a pandas Series.

    Parameters
    ----------
    rating_column : object
        A single cell value. Only strings holding a Python dict literal
        are parsed.

    Returns
    -------
    pandas.Series
        One entry per dict key. An empty Series is returned when the value
        is not a string, cannot be parsed, or does not evaluate to a
        non-empty dict.
    """
    empty = pd.Series(dtype=object)  # explicit dtype avoids the empty-Series warning
    if not isinstance(rating_column, str):
        return empty
    try:
        # Parse the text unchanged: ast.literal_eval handles single- and
        # double-quoted strings, while replacing ' with " breaks any value
        # containing an apostrophe and silently drops that row's data.
        parsed = ast.literal_eval(rating_column)
    except (ValueError, SyntaxError, TypeError):
        return empty
    # Anything other than a dict has no keys to turn into columns.
    if not isinstance(parsed, dict) or not parsed:
        return empty
    return pd.Series(parsed)


# Expand every 'Rating_as_stars' cell into its own columns; keys a row does not
# have come out as NaN and are replaced with empty strings
ratings_df = df['Rating_as_stars'].apply(extract_ratings).fillna('')

# Attach the extracted columns and remove the original 'Rating_as_stars' column
df = pd.concat([df, ratings_df], axis=1).drop(columns=['Rating_as_stars'])

# Display the updated DataFrame
print(df)


            Fashion_type               Brand_Name        category  \
0                blazers                      H&M  women-clothing   
1                blazers        Allen Solly Woman  women-clothing   
2                blazers                    Arrow    men-clothing   
3                blazers                    MANGO  women-clothing   
4                blazers                    MANGO  women-clothing   
...                  ...                      ...             ...   
24173  Women Office Wear           Style Quotient  women-clothing   
24174  Women Office Wear                Chemistry  women-clothing   
24175  Women Office Wear  Annabelle by Pantaloons  women-clothing   
24176  Women Office Wear                 Cantabil  women-clothing   
24177  Women Office Wear                   FITHUB  women-clothing   

                                             Description  Rating Rating_count  \
0                              Women Black Fitted Jacket     4.4   951Ratings   
1      No

In [6]:
# Function to clean and convert 'Rating_count'
def convert_rating_count(value):
    """Convert a rating-count cell such as '951Ratings' or '1.2kRatings' to a number.

    Parameters
    ----------
    value : object
        Cell value; it is converted with ``str()`` before parsing.

    Returns
    -------
    number
        The numeric count. Values carrying a 'k'/'K' marker are multiplied
        by 1000 and rounded to a whole count. Text that cannot be parsed
        yields NaN instead of raising.
    """
    text = str(value).replace('Ratings', '').strip()  # Remove 'Ratings' suffix
    lowered = text.lower()
    if 'k' not in lowered:
        return pd.to_numeric(text, errors='coerce')

    # Coerce instead of calling float() directly so that non-numeric text that
    # merely contains a 'k' (e.g. 'unknown') becomes NaN rather than a ValueError.
    thousands = pd.to_numeric(lowered.replace('k', '').strip(), errors='coerce')
    if pd.isna(thousands):
        return float('nan')
    # Rounding removes binary floating-point residue left by the multiplication.
    return float(round(float(thousands) * 1000))

# Turn 'Rating_count' and 'Rating' into numbers (unparseable 'Rating' values become NaN)
df['Rating_count'] = df['Rating_count'].apply(convert_rating_count)
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Fill the gaps in both rating columns with the respective column mean
for rating_col in ('Rating_count', 'Rating'):
    df[rating_col] = df[rating_col].fillna(df[rating_col].mean())

# Strip the 'MRP' and '₹' markers from the price columns, then coerce to numbers
for price_col in ('Price', 'MRP_Price'):
    without_markers = df[price_col].replace('MRP', '', regex=True).replace('₹', '', regex=True)
    df[price_col] = pd.to_numeric(without_markers, errors='coerce')

# Where 'MRP_Price' is missing, fall back to the row's 'Price'
df['MRP_Price'] = df['MRP_Price'].fillna(df['Price'])

# Preview the cleaned columns
print(df[['Rating_count', 'Rating', 'Price', 'MRP_Price']].head())

   Rating_count  Rating   Price  MRP_Price
0         951.0     4.4  2299.0     2299.0
1         643.0     4.3  3959.0     3999.0
2         161.0     4.2  4879.0     7999.0
3          37.0     4.1  6790.0     6790.0
4          43.0     4.0  4549.0     6790.0


In [7]:

def extract_discount(discount_str):
    """Pull the integer percentage out of text containing '<digits>% OFF'.

    Returns the percentage as an int, or an empty string when the input is
    not a string, is blank, or contains no such pattern.
    """
    # Guard clause: non-strings and whitespace-only strings have nothing to parse
    if not isinstance(discount_str, str) or discount_str.strip() == '':
        return ''
    found = re.search(r'(\d+)% OFF', discount_str)
    return int(found.group(1)) if found else ''

# Derive 'Discount Percentage' for every row from the 'Discount' text
df['Discount Percentage'] = df['Discount'].apply(extract_discount)

# Remove these columns, including the raw 'Discount' text just parsed
dropped_columns = [
    'product_description', 'image_ref', 'file_path', 'image_url', 'Discount', 'Description',
]
df = df.drop(columns=dropped_columns)

# Display the updated DataFrame
print(df)

            Fashion_type               Brand_Name        category  Rating  \
0                blazers                      H&M  women-clothing     4.4   
1                blazers        Allen Solly Woman  women-clothing     4.3   
2                blazers                    Arrow    men-clothing     4.2   
3                blazers                    MANGO  women-clothing     4.1   
4                blazers                    MANGO  women-clothing     4.0   
...                  ...                      ...             ...     ...   
24173  Women Office Wear           Style Quotient  women-clothing     4.0   
24174  Women Office Wear                Chemistry  women-clothing     4.2   
24175  Women Office Wear  Annabelle by Pantaloons  women-clothing     4.6   
24176  Women Office Wear                 Cantabil  women-clothing     4.4   
24177  Women Office Wear                   FITHUB  women-clothing     3.8   

       Rating_count   Price  MRP_Price  \
0             951.0  2299.0     2

In [8]:
# Define the categories to remove: two specification dicts plus the stray
# values 'MRP' and ''
categories_to_remove = [
    {'Fabric': 'Cotton', 'Fit': 'Oversized', 'Length': 'Regular', 'Main Trend': 'Tie and Dye',
     'Multipack Set': 'Single', 'Neck': 'Round Neck', 'Number of Items': '1', 'Occasion': 'Casual',
     'Pattern': 'Printed', 'Pattern Coverage': 'Chest', 'Print or Pattern Type': 'Tie and Dye',
     'Sleeve Length': 'Short Sleeves', 'Sleeve Styling': 'Regular Sleeves', 'Sustainable': 'Regular',
     'Wash Care': 'Machine Wash', 'Weave Type': 'Woven'},

    {'Fabric': 'Cotton', 'Fit': 'Oversized', 'Length': 'Regular', 'Main Trend': 'Abstract',
     'Multipack Set': 'Single', 'Neck': 'Round Neck', 'Number of Items': '1', 'Occasion': 'Casual',
     'Pattern': 'Printed', 'Pattern Coverage': 'All-Over', 'Print or Pattern Type': 'Abstract',
     'Sleeve Length': 'Short Sleeves', 'Sleeve Styling': 'Regular Sleeves', 'Sustainable': 'Regular',
     'Wash Care': 'Machine Wash', 'Weave Type': 'Woven'},
    'MRP',
    '',
]


def _is_removed_category(cell):
    """Return True when a 'category' cell matches an entry of categories_to_remove.

    A cell matches when it equals an entry directly, or when it is text that
    parses (via ast.literal_eval) to a dict equal to one of the dict entries.
    The second check is needed because a text cell never compares equal to a
    dict object, so the dict entries could previously never remove a row.
    """
    if any(cell == unwanted for unwanted in categories_to_remove):
        return True
    if isinstance(cell, str) and cell.lstrip().startswith('{'):
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError, TypeError):
            return False
        return any(isinstance(unwanted, dict) and parsed == unwanted
                   for unwanted in categories_to_remove)
    return False


# Remove rows with specific categories (one pass instead of one filter per entry)
unwanted_rows = df['category'].apply(_is_removed_category).astype(bool)
df = df[~unwanted_rows]

# Display the updated DataFrame
print(df)


            Fashion_type               Brand_Name        category  Rating  \
0                blazers                      H&M  women-clothing     4.4   
1                blazers        Allen Solly Woman  women-clothing     4.3   
2                blazers                    Arrow    men-clothing     4.2   
3                blazers                    MANGO  women-clothing     4.1   
4                blazers                    MANGO  women-clothing     4.0   
...                  ...                      ...             ...     ...   
24173  Women Office Wear           Style Quotient  women-clothing     4.0   
24174  Women Office Wear                Chemistry  women-clothing     4.2   
24175  Women Office Wear  Annabelle by Pantaloons  women-clothing     4.6   
24176  Women Office Wear                 Cantabil  women-clothing     4.4   
24177  Women Office Wear                   FITHUB  women-clothing     3.8   

       Rating_count   Price  MRP_Price  \
0             951.0  2299.0     2

In [9]:
import pandas as pd

# Example DataFrame assuming 'category' column is present
# df = pd.DataFrame({'category': [...]})

# Prefix assigned to each gender label ("Children" is not listed)
# NOTE(review): the category values visible in the printed output look like
# 'women-clothing', which are not keys of this mapping — confirm the keys.
gender_mapping = {label: label + '_' for label in ('Men', 'Women', 'Unisex', 'Boys', 'Girls')}

# Build 0/1 indicator columns named 'gender_<value>' from the 'category' column
encoded_df = pd.get_dummies(
    df['category'],
    prefix='gender',
    dtype=int,
)

# Function to map gender-based column names
def map_gender_based_column(col, mapping=None):
    """Turn an encoded column name like 'gender_<category>' into its final name.

    Parameters
    ----------
    col : str
        Column name produced by the one-hot encoding.
    mapping : dict, optional
        Category -> prefix lookup. Defaults to the notebook-level
        ``gender_mapping``.

    Returns
    -------
    str
        The mapped prefix (empty when the category is not in the mapping)
        followed by the category.
    """
    if mapping is None:
        mapping = gender_mapping
    # Split only on the first underscore: the category itself may contain
    # underscores, and split('_')[1] would keep just its first piece.
    _, separator, category = col.partition('_')
    if not separator:
        category = col  # no prefix present; treat the whole name as the category
    return mapping.get(category, '') + category

# Give every encoded column its final name
encoded_df.columns = list(map(map_gender_based_column, encoded_df.columns))

# Append the indicator columns to the right of the existing columns
df = pd.concat([df, encoded_df], axis=1)

# Print the resulting DataFrame
print(df)

# Write the DataFrame to 'updated_dataset.csv' without the index column
df.to_csv('updated_dataset.csv', index=False)




            Fashion_type               Brand_Name        category  Rating  \
0                blazers                      H&M  women-clothing     4.4   
1                blazers        Allen Solly Woman  women-clothing     4.3   
2                blazers                    Arrow    men-clothing     4.2   
3                blazers                    MANGO  women-clothing     4.1   
4                blazers                    MANGO  women-clothing     4.0   
...                  ...                      ...             ...     ...   
24173  Women Office Wear           Style Quotient  women-clothing     4.0   
24174  Women Office Wear                Chemistry  women-clothing     4.2   
24175  Women Office Wear  Annabelle by Pantaloons  women-clothing     4.6   
24176  Women Office Wear                 Cantabil  women-clothing     4.4   
24177  Women Office Wear                   FITHUB  women-clothing     3.8   

       Rating_count   Price  MRP_Price  \
0             951.0  2299.0     2

In [None]:
import pandas as pd

# Lookup from a numeric size (stored as text) to its letter label
size_dict = dict([
    ('34', 'XXS'), ('36', 'XS'), ('38', 'S'),
    ('40', 'M'), ('42', 'L'), ('44', 'XL'),
    ('46', 'XXL'), ('48', '3XL'), ('50', '4XL'),
])

# Function to convert numeric sizes to labels
def convert_numeric_sizes(size_list, mapping=None):
    """Map each size in a list to its letter label.

    Parameters
    ----------
    size_list : object
        Expected to be a list of sizes; anything else yields an empty list.
    mapping : dict, optional
        Numeric-size -> label lookup. Defaults to the notebook-level
        ``size_dict``.

    Returns
    -------
    list
        The stripped text of each size, replaced by its label when the
        mapping has one.
    """
    if not isinstance(size_list, list):
        return []  # Return empty list if not a list
    if mapping is None:
        mapping = size_dict
    # str() first: parsed lists may hold ints (e.g. [38, 40]), which have no .strip()
    cleaned = (str(size).strip() for size in size_list)
    return [mapping.get(size, size) for size in cleaned]

def _parse_size_cell(cell):
    """Turn the text form of a list (e.g. "['38', '40']") into a real list.

    Non-string cells and text that is not a valid Python literal are returned
    unchanged, so a malformed cell no longer aborts the whole column.
    """
    if not isinstance(cell, str):
        return cell
    try:
        # SECURITY: this previously used eval(), which executes arbitrary code
        # embedded in the data. ast.literal_eval only accepts plain literals.
        return ast.literal_eval(cell)
    except (ValueError, SyntaxError, TypeError):
        return cell


# Parse the 'size' column into lists, then convert numeric sizes to labels
df['converted_size'] = df['size'].apply(_parse_size_cell)
df['converted_size'] = df['converted_size'].apply(convert_numeric_sizes)

# Display the updated DataFrame
print(df[['size', 'converted_size']])


In [None]:
import pandas as pd

# Assuming your DataFrame is named df and has a 'converted_size' column
# Sample data creation for context
# df = pd.DataFrame({'converted_size': [['XXS', 'XS'], [], ['S', 'M'], ['L', 'XL']]})

# Define the unique size list.
# '4XL' is included because size_dict maps '50' to it; without it, rows with
# that size would get no indicator column.
sizes = ['XXS', 'XS', 'S', 'M', 'L', 'XL', 'XXL', '3XL', '4XL']

# Create one 0/1 column per size: 1 when the row's 'converted_size' contains it.
# The default argument binds the current label to each lambda.
for size in sizes:
    df[size] = df['converted_size'].apply(lambda labels, wanted=size: int(wanted in labels))

# Optionally, drop the original 'converted_size' column if no longer needed
# df.drop('converted_size', axis=1, inplace=True)

# Display the updated DataFrame
print(df)
