In [1]:
import os

import pandas as pd
from pathlib import Path
import re
import numpy as np

color_words = open("notebooks/merger/color_words.txt", "r").readlines()
color_words = set([word.strip().lower() for word in color_words])

In [2]:
path = "data/raw/metadata"

In [3]:
discovered_datasets = os.listdir(path)
discovered_datasets

['superkicks.csv',
 'highsnobiety.csv',
 'footshop.csv',
 'kickscrew.csv',
 'sneakerbaas.csv']

In [4]:
try:
    datasets = {
        source: pd.read_csv(str(Path(path, source))) for source in discovered_datasets
    }
    assert len(datasets) > 0
except FileNotFoundError as err:
    print(f"Some dataset could not be resolved:")
    raise

In [5]:
# all columns to lowercase
for key in datasets:
    datasets[key].columns = [x.lower() for x in datasets[key].columns]

In [6]:
def format_superkicks_dataset(raw_df: pd.DataFrame) -> pd.DataFrame:
    def get_color(text):
        colors = list()
        text = text.replace("|", " ").split()

        for color_word in color_words:
            for word in text:
                if color_word in word:
                    colors.append(color_word)
        return list(dict.fromkeys(colors))

    df = raw_df.drop(
        [
            "product_dimensions",
            "collection_url",
            "generic_name",
            "weight",
            "imported_by",
            "manufacturer",
            "unit_of_measurement",
            "marketed_by",
            "article_code",
            "country_of_origin",
        ],
        axis=1,
    )
    df["pricecurrency"] = "INR"
    df["price"] = df["price"].apply(
        lambda x: float(x.replace("₹", "").replace(",", ""))
    )
    df["color"] = df["title"].apply(get_color)

    df["brand"] = df["brand"].apply(lambda x: " ".join(x.split()))
    df["title"] = df["title"].apply(lambda x: " ".join(x.split()))
    df["description"] = df["description"].apply(lambda x: " ".join(x.split()))

    # df['collection'] = df['collection_name'].astype('category')

    # df["collection"].cat.rename_categories({"men-sneakers":"men", ""})

    # df.drop("collection_name", axis=1)
    return df


format_superkicks_dataset(datasets["superkicks.csv"])

Unnamed: 0,brand,title,price,description,collection_name,url,images_path,pricecurrency,color
0,converse,wmns run star legacy cx periwinkle,8499.0,The latest iteration of the best-selling Run S...,men-sneakers,https://www.superkicks.in/products/wmns-run-st...,data/raw/images/superkicks/men-sneakers/conver...,INR,[periwinkle]
1,converse,chuck taylor all star charcoal,4299.0,We could tell you that it's the OG basketball ...,men-sneakers,https://www.superkicks.in/products/chuck-taylo...,data/raw/images/superkicks/men-sneakers/conver...,INR,[charcoal]
2,nike,kd16 nrg ep pink foam|pink|white,14995.0,Kevin Durant is a true hooper. He'd be just as...,men-sneakers,https://www.superkicks.in/products/kd16-nrg-ep...,data/raw/images/superkicks/men-sneakers/nike/k...,INR,"[white, foam, pink]"
3,jordan,air jordan 1 retro high og black|royal blue-wh...,16995.0,The Air Jordan 1 Retro High remakes the classi...,men-sneakers,https://www.superkicks.in/products/air-jordan-...,data/raw/images/superkicks/men-sneakers/jordan...,INR,"[black, blue, white]"
4,nike,blazer mid 77 premium summit white|black-light...,7756.0,Styled for the ‘70s. Loved in the ‘80s. Classi...,men-sneakers,https://www.superkicks.in/products/blazer-mid-...,data/raw/images/superkicks/men-sneakers/nike/b...,INR,"[black, white, silver, light]"
...,...,...,...,...,...,...,...,...,...
1094,converse,run star hike platform animalier egret|black|e...,6999.0,Elevate your look in signature Chuck Taylor st...,women-skateboard-sneakers,https://www.superkicks.in/products/run-star-hi...,data/raw/images/superkicks/women-skateboard-sn...,INR,[black]
1095,nike,wmns dunk high black|white,9295.0,Created for the hardwood but taken to the stre...,women-skateboard-sneakers,https://www.superkicks.in/products/wmns-dunk-h...,data/raw/images/superkicks/women-skateboard-sn...,INR,"[black, white]"
1096,converse,run star motion black,8999.0,We could tell you that it's the OG basketball ...,women-skateboard-sneakers,https://www.superkicks.in/products/run-star-mo...,data/raw/images/superkicks/women-skateboard-sn...,INR,[black]
1097,converse,chuck taylor wmns platform layer ox,3499.0,manufacturer : country_of_origin : imported_by...,women-skateboard-sneakers,https://www.superkicks.in/products/chuck-taylo...,data/raw/images/superkicks/women-skateboard-sn...,INR,[]


In [7]:
def format_sneakerbaas_dataset(raw_df: pd.DataFrame):

    def extract_colors(input_string):
        if pd.notnull(input_string):
            match = re.search(r'(Colour|Colours|Colors|Color|Kleur): (.*?)(?:-|$)', input_string, re.IGNORECASE)
            if match:
                colors = match.group(2).strip()
                colors = colors.replace("/", " ").lower().split()
                return list(dict.fromkeys(colors))
            else:
                return None
        else:
            return None
    df = raw_df.drop(["collection_url"], axis=1)

    df["color"] = df["description"].apply(extract_colors)

    df["brand"] = df["brand"].apply(lambda x: " ".join(x.split()))
    df["title"] = df["title"].apply(lambda x: " ".join(x.split()))


    return df

format_sneakerbaas_dataset(datasets["sneakerbaas.csv"])

Unnamed: 0,brand,description,pricecurrency,price,title,collection_name,url,images_path,color
0,new balance,"- New Balance 550 ""Vintage Teal""- Colour: Whit...",EUR,79.99,550 vintage teal,category-kids,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-kids/new ...,"[white, green]"
1,new balance,"- New Balance 550 ""WHITE""- Colour: White/Yello...",EUR,79.99,550 white,category-kids,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-kids/new ...,"[white, yellow, purple]"
2,nike,Stylecode: FN7788-100,EUR,49.99,blazer mid next nature white,category-kids,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-kids/nike...,
3,nike,"- NIKE AIR MAX 90 GS ""BLACK/HABANERO-SMOKE GRE...",EUR,89.99,air max 90 gs habanero,category-kids,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-kids/nike...,"[black, red]"
4,vans,"- UY SK8-Mid Reissue V ""CTHR CHB""- Colour: Blu...",EUR,39.99,sk8-mid checkerboard,category-kids,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-kids/vans...,"[blue, white]"
...,...,...,...,...,...,...,...,...,...
1000,adidas originals,A Clean Classic. These men's Supercourt Shoes ...,EUR,69.99,supercourt white,category-men,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-men/adida...,
1001,clarks,"- Clarks Desert Boot ""Purple""- Colour: Purple-...",EUR,79.99,desert boot purple,category-men,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-men/clark...,[purple]
1002,karhu,"- Karhu Legacy OG ""Purple""- Colour: White/Purp...",EUR,69.99,legacy og purple,category-men,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-men/karhu...,"[white, purple]"
1003,karhu,"- Karhu Synchron Classic ""Gray Violet""- Stylec...",EUR,69.99,synchron classic violet,category-men,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-men/karhu...,"[gray, violet, jazzy]"


In [9]:
def format_footshop_dataset(raw_df):

    def get_colors(text):
        colors = text.replace("&", "/").replace("/ ", "/").lower().split("/")
        return colors


    df = raw_df.drop(["collection_url"],axis=1)





    df["price"] = df["price"].apply(
        lambda x: float(x.replace("€", "").replace("$", ""))
    )


    df["color_old"] = df["color"]
    df["color"] = df["color"].apply(get_colors)


    df["brand"] = df["brand"].apply(lambda x: " ".join(x.split()))
    df["title"] = df["title"].apply(lambda x: " ".join(x.split()))
    return df

format_footshop_dataset(datasets["footshop.csv"])

Unnamed: 0,brand,title,color,pricecurrency,price,collection_name,url,images_path,color_old
0,vans,vans knu skool,"[black, true white]",EUR,97.95,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/261589-v...,data/raw/images/footshop/5-mens-shoes/vans/van...,Black/ True White
1,dr. martens,dr. martens jadon hdw ii,"[black buttero , black 100% recycled da pk mesh]",EUR,274.95,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/297121-d...,data/raw/images/footshop/5-mens-shoes/dr. mart...,Black Buttero & Black 100% Recycled Da Pk Mesh
2,adidas originals,adidas samba og,"[core black, ftw white, gum5]",EUR,120.00,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/29598-ad...,data/raw/images/footshop/5-mens-shoes/adidas o...,Core Black/ Ftw White/ Gum5
3,new balance,new balance 990 v1,"[green, gold]",EUR,146.22,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/279670-n...,data/raw/images/footshop/5-mens-shoes/new bala...,Green/ Gold
4,asics,asics x andersson bell gel-sonoma 15-50,"[olive oil, dark brown]",EUR,155.51,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/264451-a...,data/raw/images/footshop/5-mens-shoes/asics/as...,Olive Oil/ Dark Brown
...,...,...,...,...,...,...,...,...,...
6442,nike,nike court borough mid 2,"[university red, black-white]",EUR,74.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,University Red/ Black-White
6443,new balance,new balance ct302,[all black],EUR,114.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,All Black
6444,vans,vans kids old skool,"[navy, true white]",EUR,59.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,Navy/ True White
6445,vans,vans old skool kids,"[black, black]",EUR,59.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,Black/ Black


In [10]:
processed_datasets = {
    "superkicks": format_superkicks_dataset(datasets["superkicks.csv"]),
    "sneakerbaas": format_sneakerbaas_dataset(datasets["sneakerbaas.csv"]),
    "footshop": format_footshop_dataset(datasets["footshop.csv"])
}

In [11]:
# same columns
same_columns = set.intersection(*[set(processed_datasets[key].columns) for key in processed_datasets])
same_columns

{'brand',
 'collection_name',
 'color',
 'images_path',
 'price',
 'pricecurrency',
 'title',
 'url'}

In [None]:
for name, d in datasets.items():
    print(f"{name} columns:", d.columns)

In [None]:
datasets["superkicks"]["images_dir"]

In [None]:
for source, data in datasets.items():
    data["source"] = source

pd.concat(
    datasets.values(),
    keys=[
        "url",
        "brand",
        "slug",
    ],
)