In [1]:
# for PyCharm
import os

current_directory = os.getcwd()
project_directory = os.path.abspath(os.path.join(current_directory, os.pardir, os.pardir))
os.chdir(project_directory)

# решил снова затестить Pycharm - такая лаговая каловая масса по сравнению с vscode.

In [2]:
import os

import pandas as pd
from pathlib import Path
import re
import numpy as np

metadata_path = "data/raw/metadata"
color_path = "notebooks/merger/color_words.txt"
color_words = open(color_path, "r").readlines()
color_words = set([word.strip().lower() for word in color_words])

In [3]:
discovered_datasets = os.listdir(metadata_path)
discovered_datasets

['superkicks.csv',
 'highsnobiety.csv',
 'footshop.csv',
 'kickscrew.csv',
 'sneakerbaas.csv']

In [4]:
try:
    datasets = {
        source: pd.read_csv(str(Path(metadata_path, source))) for source in discovered_datasets
    }
    assert len(datasets) > 0
except FileNotFoundError as err:
    print(f"Some dataset could not be resolved:")
    raise

In [5]:
# all columns to lowercase
for key in datasets:
    datasets[key].columns = [x.lower() for x in datasets[key].columns]

In [6]:
def format_superkicks_dataset(raw_df: pd.DataFrame) -> pd.DataFrame:
    def get_color(text):
        colors = list()
        text = text.replace("|", " ").split()

        for color_word in color_words:
            for word in text:
                if color_word in word:
                    colors.append(color_word)
                    
        
        return list(dict.fromkeys(colors))

    df = raw_df.drop(
        [
            "product_dimensions",
            "collection_url",
            "generic_name",
            "weight",
            "imported_by",
            "manufacturer",
            "unit_of_measurement",
            "marketed_by",
            "article_code",
            "country_of_origin",
        ],
        axis=1,
    )
    df["pricecurrency"] = "INR"
    df["price"] = df["price"].apply(
        lambda x: float(x.replace("₹", "").replace(",", ""))
    )
    df["color"] = df["title"].apply(get_color)

    df["brand"] = df["brand"].apply(lambda x: " ".join(x.split()))
    df["title"] = df["title"].apply(lambda x: " ".join(x.split()))
    df["description"] = df["description"].apply(lambda x: " ".join(x.split()))

    # df['collection'] = df['collection_name'].astype('category')

    # df["collection"].cat.rename_categories({"men-sneakers":"men", ""})

    # df.drop("collection_name", axis=1)
    
    df = df.drop("description", axis=1)
    
    df["website"] = "superkicks"
    
    df = df.drop_duplicates(subset=["title", "collection_name","url"])
    return df


format_superkicks_dataset(datasets["superkicks.csv"])

Unnamed: 0,brand,title,price,collection_name,url,images_path,pricecurrency,color,website
0,converse,wmns run star legacy cx periwinkle,8499.0,men-sneakers,https://www.superkicks.in/products/wmns-run-st...,data/raw/images/superkicks/men-sneakers/conver...,INR,[periwinkle],superkicks
1,converse,chuck taylor all star charcoal,4299.0,men-sneakers,https://www.superkicks.in/products/chuck-taylo...,data/raw/images/superkicks/men-sneakers/conver...,INR,[charcoal],superkicks
2,nike,kd16 nrg ep pink foam|pink|white,14995.0,men-sneakers,https://www.superkicks.in/products/kd16-nrg-ep...,data/raw/images/superkicks/men-sneakers/nike/k...,INR,"[white, foam, pink]",superkicks
3,jordan,air jordan 1 retro high og black|royal blue-wh...,16995.0,men-sneakers,https://www.superkicks.in/products/air-jordan-...,data/raw/images/superkicks/men-sneakers/jordan...,INR,"[white, black, blue]",superkicks
4,nike,blazer mid 77 premium summit white|black-light...,7756.0,men-sneakers,https://www.superkicks.in/products/blazer-mid-...,data/raw/images/superkicks/men-sneakers/nike/b...,INR,"[light, white, black, silver]",superkicks
...,...,...,...,...,...,...,...,...,...
1094,converse,run star hike platform animalier egret|black|e...,6999.0,women-skateboard-sneakers,https://www.superkicks.in/products/run-star-hi...,data/raw/images/superkicks/women-skateboard-sn...,INR,[black],superkicks
1095,nike,wmns dunk high black|white,9295.0,women-skateboard-sneakers,https://www.superkicks.in/products/wmns-dunk-h...,data/raw/images/superkicks/women-skateboard-sn...,INR,"[white, black]",superkicks
1096,converse,run star motion black,8999.0,women-skateboard-sneakers,https://www.superkicks.in/products/run-star-mo...,data/raw/images/superkicks/women-skateboard-sn...,INR,[black],superkicks
1097,converse,chuck taylor wmns platform layer ox,3499.0,women-skateboard-sneakers,https://www.superkicks.in/products/chuck-taylo...,data/raw/images/superkicks/women-skateboard-sn...,INR,[],superkicks


In [7]:
def format_sneakerbaas_dataset(raw_df: pd.DataFrame):
    def extract_colors(input_string):
        if pd.notnull(input_string):
            match = re.search(r'(Colour|Colours|Colors|Color|Kleur): (.*?)(?:-|$)', input_string, re.IGNORECASE)
            if match:
                colors = match.group(2).strip()
                colors = colors.replace("/", " ").lower().split()
                return list(dict.fromkeys(colors))
            else:
                return None
        else:
            return None

    df = raw_df.drop(["collection_url"], axis=1)

    df["color"] = df["description"].apply(extract_colors)

    df["brand"] = df["brand"].apply(lambda x: " ".join(x.split()))
    df["title"] = df["title"].apply(lambda x: " ".join(x.split()))
    
    
    df = df.drop("description", axis=1)
    
    df["website"] = "sneakerbaas"
    
    df = df.drop_duplicates(subset=["title", "collection_name","url"])

    return df


format_sneakerbaas_dataset(datasets["sneakerbaas.csv"])

Unnamed: 0,brand,pricecurrency,price,title,collection_name,url,images_path,color,website
0,new balance,EUR,79.99,550 vintage teal,category-kids,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-kids/new ...,"[white, green]",sneakerbaas
1,new balance,EUR,79.99,550 white,category-kids,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-kids/new ...,"[white, yellow, purple]",sneakerbaas
2,nike,EUR,49.99,blazer mid next nature white,category-kids,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-kids/nike...,,sneakerbaas
3,nike,EUR,89.99,air max 90 gs habanero,category-kids,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-kids/nike...,"[black, red]",sneakerbaas
4,vans,EUR,39.99,sk8-mid checkerboard,category-kids,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-kids/vans...,"[blue, white]",sneakerbaas
...,...,...,...,...,...,...,...,...,...
1000,adidas originals,EUR,69.99,supercourt white,category-men,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-men/adida...,,sneakerbaas
1001,clarks,EUR,79.99,desert boot purple,category-men,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-men/clark...,[purple],sneakerbaas
1002,karhu,EUR,69.99,legacy og purple,category-men,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-men/karhu...,"[white, purple]",sneakerbaas
1003,karhu,EUR,69.99,synchron classic violet,category-men,https://www.sneakerbaas.com/collections/sneake...,data/raw/images/sneakerbaas/category-men/karhu...,"[gray, violet, jazzy]",sneakerbaas


In [8]:
def format_footshop_dataset(raw_df: pd.DataFrame):
    def get_colors(text):
        colors = text.replace("&", "/").replace("/ ", "/").lower().split("/")
        return colors

    df = raw_df.drop(["collection_url"], axis=1)

    df["price"] = df["price"].apply(
        lambda x: float(x.replace("€", "").replace("$", ""))
    )

    df["color_old"] = df["color"]
    df["color"] = df["color"].apply(get_colors)

    df["brand"] = df["brand"].apply(lambda x: " ".join(x.split()))
    df["title"] = df["title"].apply(lambda x: " ".join(x.split()))
    
    df = df.drop("color_old", axis=1)
    df["website"] = "footshop"
    
    df = df.drop_duplicates(subset=["title", "collection_name","url"])
    return df


format_footshop_dataset(datasets["footshop.csv"])

Unnamed: 0,brand,title,color,pricecurrency,price,collection_name,url,images_path,website
0,vans,vans knu skool,"[black, true white]",EUR,97.95,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/261589-v...,data/raw/images/footshop/5-mens-shoes/vans/van...,footshop
1,dr. martens,dr. martens jadon hdw ii,"[black buttero , black 100% recycled da pk mesh]",EUR,274.95,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/297121-d...,data/raw/images/footshop/5-mens-shoes/dr. mart...,footshop
2,adidas originals,adidas samba og,"[core black, ftw white, gum5]",EUR,120.00,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/29598-ad...,data/raw/images/footshop/5-mens-shoes/adidas o...,footshop
3,new balance,new balance 990 v1,"[green, gold]",EUR,146.22,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/279670-n...,data/raw/images/footshop/5-mens-shoes/new bala...,footshop
4,asics,asics x andersson bell gel-sonoma 15-50,"[olive oil, dark brown]",EUR,155.51,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/264451-a...,data/raw/images/footshop/5-mens-shoes/asics/as...,footshop
...,...,...,...,...,...,...,...,...,...
6442,nike,nike court borough mid 2,"[university red, black-white]",EUR,74.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,footshop
6443,new balance,new balance ct302,[all black],EUR,114.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,footshop
6444,vans,vans kids old skool,"[navy, true white]",EUR,59.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,footshop
6445,vans,vans old skool kids,"[black, black]",EUR,59.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,footshop


In [9]:
processed_datasets = {
    "superkicks": format_superkicks_dataset(datasets["superkicks.csv"]),
    "sneakerbaas": format_sneakerbaas_dataset(datasets["sneakerbaas.csv"]),
    "footshop": format_footshop_dataset(datasets["footshop.csv"])
}

In [10]:
for key, value in processed_datasets.items():
    print(value.shape)

(1090, 9)
(1005, 9)
(4864, 9)


In [11]:
# same columns
same_columns = set.intersection(*[set(processed_datasets[key].columns) for key in processed_datasets])
same_columns

{'brand',
 'collection_name',
 'color',
 'images_path',
 'price',
 'pricecurrency',
 'title',
 'url',
 'website'}

In [12]:
merged_datasets = pd.concat(list(processed_datasets.values()), ignore_index=True)
merged_datasets

Unnamed: 0,brand,title,price,collection_name,url,images_path,pricecurrency,color,website
0,converse,wmns run star legacy cx periwinkle,8499.00,men-sneakers,https://www.superkicks.in/products/wmns-run-st...,data/raw/images/superkicks/men-sneakers/conver...,INR,[periwinkle],superkicks
1,converse,chuck taylor all star charcoal,4299.00,men-sneakers,https://www.superkicks.in/products/chuck-taylo...,data/raw/images/superkicks/men-sneakers/conver...,INR,[charcoal],superkicks
2,nike,kd16 nrg ep pink foam|pink|white,14995.00,men-sneakers,https://www.superkicks.in/products/kd16-nrg-ep...,data/raw/images/superkicks/men-sneakers/nike/k...,INR,"[white, foam, pink]",superkicks
3,jordan,air jordan 1 retro high og black|royal blue-wh...,16995.00,men-sneakers,https://www.superkicks.in/products/air-jordan-...,data/raw/images/superkicks/men-sneakers/jordan...,INR,"[white, black, blue]",superkicks
4,nike,blazer mid 77 premium summit white|black-light...,7756.00,men-sneakers,https://www.superkicks.in/products/blazer-mid-...,data/raw/images/superkicks/men-sneakers/nike/b...,INR,"[light, white, black, silver]",superkicks
...,...,...,...,...,...,...,...,...,...
6954,nike,nike court borough mid 2,74.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,EUR,"[university red, black-white]",footshop
6955,new balance,new balance ct302,114.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,EUR,[all black],footshop
6956,vans,vans kids old skool,59.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,EUR,"[navy, true white]",footshop
6957,vans,vans old skool kids,59.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,EUR,"[black, black]",footshop


In [14]:
merged_datasets.groupby("title").agg({
    "brand": list,
    "collection_name":list,
    "color": list,
    "images_path": list,
    "price": list,
    "pricecurrency": list,
    "url": list,
    "website": list
})

Unnamed: 0_level_0,brand,collection_name,color,images_path,price,pricecurrency,url,website
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
01 low m bi eden,[autry],[category-men],"[[white, green]]",[data/raw/images/sneakerbaas/category-men/autr...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
01 low m bi violet,[autry],[category-men],"[[white, purple]]",[data/raw/images/sneakerbaas/category-men/autr...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
01 low m black,[autry],[category-men],"[[black, white, creme]]",[data/raw/images/sneakerbaas/category-men/autr...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
01 low mustard,[autry],[category-men],"[[yellow, white]]",[data/raw/images/sneakerbaas/category-men/autr...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
01 low w bi ivory,[autry],[category-women],"[[black, white]]",[data/raw/images/sneakerbaas/category-women/au...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
...,...,...,...,...,...,...,...,...
zoom vomero 5 sp anthracite|anthracite-black-wolf grey,[nike],[men-sneakers],"[[grey, black]]",[data/raw/images/superkicks/men-sneakers/nike/...,[14995.0],[INR],[https://www.superkicks.in/products/zoom-vomer...,[superkicks]
zoom vomero 5 sp vast grey|vast grey-black-sail,[nike],[men-sneakers],"[[grey, black]]",[data/raw/images/superkicks/men-sneakers/nike/...,[14995.0],[INR],[https://www.superkicks.in/products/zoom-vomer...,[superkicks]
zx 22 boost cream white,[adidas originals],[category-men],[[beige]],[data/raw/images/sneakerbaas/category-men/adid...,[59.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
zx 2k boost 2.0 white,[adidas],[women-sneakers],[[white]],[data/raw/images/superkicks/women-sneakers/adi...,[8399.0],[INR],[https://www.superkicks.in/products/zx-2k-boos...,[superkicks]


In [None]:
for name, d in datasets.items():
    print(f"{name} columns:", d.columns)

In [None]:
datasets["superkicks"]["images_dir"]

In [None]:
for source, data in datasets.items():
    data["source"] = source

pd.concat(
    datasets.values(),
    keys=[
        "url",
        "brand",
        "slug",
    ],
)