In [1]:
import pandas as pd
from pathlib import Path
import os
from os.path import join as opj
from tqdm.auto import tqdm
import ast
import numpy as np

In [2]:
# Paths are resolved relative to the directory the kernel was started in;
# the dataset folder sits one level above it.
BASE_DIR = os.getcwd()
DATASET_DIR = os.path.join(BASE_DIR, "../data")

In [3]:
# read dataset
shein_data = pd.read_csv(opj(DATASET_DIR, "shein_sample5000.csv"), sep=";", encoding="ISO-8859-1")

# "description" and "images" are stored in the CSV as strings holding Python
# literals; turn them back into Python objects (lists, per the original intent).
# Each column is parsed in a single pass rather than row by row through
# .iloc/.loc: the per-row version is slow, and handing .loc a list of lists
# makes pandas treat the value as array-like instead of as two cell values.
for list_col in tqdm(["description", "images"]):
    # literal_eval only accepts literals, so it cannot execute code from the file.
    # NOTE(review): a missing (NaN) cell still raises here, as it did before.
    shein_data[list_col] = shein_data[list_col].map(ast.literal_eval)

  0%|          | 0/111189 [00:00<?, ?it/s]

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) after loading.
shein_data.info()

In [None]:
# Show the first 20 rows to eyeball the raw values.
shein_data.head(20)

In [None]:
#dropping unwanted columns
# DataFrame.drop returns a new frame, so the result must be bound back to
# shein_data for the column to actually go away.
# errors='ignore' keeps the cell safe to re-run once 'url' is already gone.
shein_data = shein_data.drop(columns=['url'], errors='ignore')
shein_data.head()

In [None]:
#finding all of the brand names
# unique() keeps one entry per distinct value (a missing value shows up as nan);
# tolist() turns the resulting array into a plain list for display.
shein_data['brand'].unique().tolist()

In [None]:
#suspecting NaN values in the brand column are from the regular line
# Check whether the retailer's own name already appears as a brand.
# Compare on a normalized form (spaces removed, case-folded) so that every
# spelling is caught -- a hand-written list of variants misses e.g. 'Shein'.
brands_to_find = ['shein']
normalized_brands = (
    shein_data['brand']
    .dropna()                       # missing brands cannot match
    .astype(str)
    .str.replace(' ', '', regex=False)
    .str.casefold()
)
found = normalized_brands.isin(brands_to_find).any()
print(found)

In [None]:
#replacing NaN values with SHEIN in the brand column and checking
brands_to_find = ['SHEIN']
# Missing brands are relabelled as 'SHEIN'.
shein_data['brand'] = shein_data['brand'].replace(to_replace=np.nan, value='SHEIN')
# Confirm that the label now occurs in the column.
found = shein_data['brand'].isin(values=brands_to_find).any()
print(found)

In [None]:
#changing price column to a float
# Go through astype(str) first so the cell also works when 'price' is already
# numeric (e.g. on a second run); the .str accessor raises on a float column.
price_text = (
    shein_data['price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.strip()
)
# errors='coerce' turns anything that is not a number (including the text
# form of a missing value) into NaN instead of raising.
shein_data['price'] = pd.to_numeric(price_text, errors='coerce')
shein_data.info()

In [None]:
#investigating name column
# Print the first 100 product names as one plain list.
print(shein_data['name'].iloc[:100].tolist())

In [None]:
#category function
def assign_category(df, col):
    """Add a 'category' column derived from keywords found in a text column.

    Each value of ``df[col]`` is searched, case-insensitively, for the keyword
    patterns below. The rules are checked in order and the first one that
    matches wins; values that match nothing (including missing values) get
    the category 'Other'. Matching is by substring, not by whole word.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. It is modified in place (a 'category' column is
        added or overwritten).
    col : str
        Name of the text column to search.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the 'category' column set.
    """
    # (regex pattern, category label), in priority order
    keyword_rules = [
        ('Skirt', 'Skirt'),
        ('Dress', 'Dress'),
        ('Shorts', 'Shorts'),
        ('skort', 'Skort'),
        ('Top|Cami|Tee', 'Top'),
        ('Pants|Trousers|Joggers', 'Pants'),
        ('Swimsuit', 'Swimsuit'),
        ('Chain', 'Jewelry'),
    ]

    # Use the frame and column that were passed in rather than a notebook global.
    text = df[col]

    # na=False makes a missing value count as "no match" instead of NaN
    conditions = [
        text.str.contains(pattern, case=False, na=False)
        for pattern, _ in keyword_rules
    ]
    categories = [label for _, label in keyword_rules]

    # np.select picks, per row, the label of the first condition that is True
    df['category'] = np.select(conditions, categories, default='Other')
    return df

In [None]:
#applying the function to create a category
# Label every product from its name, then preview the first rows.
shein_data = assign_category(df=shein_data, col='name')
shein_data.head()