In [1]:
# for PyCharm
import os

current_directory = os.getcwd()
project_directory = os.path.abspath(os.path.join(current_directory, os.pardir, os.pardir))
os.chdir(project_directory)

In [2]:
import os

import pandas as pd
from pathlib import Path
import numpy as np
from src.data.merger import SuperkicksFormatter,SneakerbaasFormatter,FootshopFormatter, check_extra_symbols

In [3]:
metadata_path = "data/raw/metadata"
discovered_datasets = os.listdir(metadata_path)
discovered_datasets

['superkicks.csv',
 'highsnobiety.csv',
 'footshop.csv',
 'kickscrew.csv',
 'sneakerbaas.csv']

In [4]:
try:
    datasets = {
        source: pd.read_csv(str(Path(metadata_path, source))) for source in discovered_datasets
    }
    assert len(datasets) > 0
except FileNotFoundError as err:
    print(f"Some dataset could not be resolved:")
    raise

In [5]:
datasets["footshop.csv"]

Unnamed: 0,brand,title,color,pricecurrency,price,collection_name,collection_url,url,images_path
0,vans,vans knu skool,Black/ True White,EUR,97.95 €,5-mens-shoes,https://www.footshop.eu/en/5-mens-shoes,https://www.footshop.eu/en/mens-shoes/261589-v...,data/raw/images/footshop/5-mens-shoes/vans/van...
1,dr. martens,dr. martens jadon hdw ii,Black Buttero & Black 100% Recycled Da Pk Mesh,EUR,274.95 €,5-mens-shoes,https://www.footshop.eu/en/5-mens-shoes,https://www.footshop.eu/en/mens-shoes/297121-d...,data/raw/images/footshop/5-mens-shoes/dr. mart...
2,adidas originals,adidas samba og,Core Black/ Ftw White/ Gum5,EUR,120 €,5-mens-shoes,https://www.footshop.eu/en/5-mens-shoes,https://www.footshop.eu/en/mens-shoes/29598-ad...,data/raw/images/footshop/5-mens-shoes/adidas o...
3,new balance,new balance 990 v1,Green/ Gold,EUR,146.22 €,5-mens-shoes,https://www.footshop.eu/en/5-mens-shoes,https://www.footshop.eu/en/mens-shoes/279670-n...,data/raw/images/footshop/5-mens-shoes/new bala...
4,asics,asics x andersson bell gel-sonoma 15-50,Olive Oil/ Dark Brown,EUR,155.51 €,5-mens-shoes,https://www.footshop.eu/en/5-mens-shoes,https://www.footshop.eu/en/mens-shoes/264451-a...,data/raw/images/footshop/5-mens-shoes/asics/as...
...,...,...,...,...,...,...,...,...,...
6442,nike,nike court borough mid 2,University Red/ Black-White,EUR,74.95 €,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/55-kids-sneakers-an...,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...
6443,new balance,new balance ct302,All Black,EUR,114.95 €,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/55-kids-sneakers-an...,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...
6444,vans,vans kids old skool,Navy/ True White,EUR,59.95 €,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/55-kids-sneakers-an...,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...
6445,vans,vans old skool kids,Black/ Black,EUR,59.95 €,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/55-kids-sneakers-an...,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...


In [6]:
processed_datasets = {
    "superkicks": SuperkicksFormatter(datasets["superkicks.csv"]).format(),
    "sneakerbaas": SneakerbaasFormatter(datasets["sneakerbaas.csv"]).format(),
    "footshop": FootshopFormatter(datasets["footshop.csv"]).format()
}
check_extra_symbols(processed_datasets)

superkicks set()
sneakerbaas set()
footshop set()


In [7]:
for key, value in processed_datasets.items():
    print(value.shape)

(1090, 9)
(1005, 9)
(4864, 9)


In [8]:
# same columns
same_columns = set.intersection(*[set(processed_datasets[key].columns) for key in processed_datasets])
same_columns

{'brand',
 'collection_name',
 'color',
 'images_path',
 'price',
 'pricecurrency',
 'title',
 'url',
 'website'}

In [9]:
merged_datasets = pd.concat(list(processed_datasets.values()), ignore_index=True)
merged_datasets

Unnamed: 0,brand,title,price,collection_name,url,images_path,website,pricecurrency,color
0,converse,wmns run star legacy cx periwinkle,8499.00,men-sneakers,https://www.superkicks.in/products/wmns-run-st...,data/raw/images/superkicks/men-sneakers/conver...,superkicks,INR,[periwinkle]
1,converse,chuck taylor all star charcoal,4299.00,men-sneakers,https://www.superkicks.in/products/chuck-taylo...,data/raw/images/superkicks/men-sneakers/conver...,superkicks,INR,[charcoal]
2,nike,kd16 nrg ep pink foam pink white,14995.00,men-sneakers,https://www.superkicks.in/products/kd16-nrg-ep...,data/raw/images/superkicks/men-sneakers/nike/k...,superkicks,INR,"[pink, foam, white]"
3,jordan,air jordan 1 retro high og black royal blue-wh...,16995.00,men-sneakers,https://www.superkicks.in/products/air-jordan-...,data/raw/images/superkicks/men-sneakers/jordan...,superkicks,INR,"[black, blue]"
4,nike,blazer mid 77 premium summit white black-light...,7756.00,men-sneakers,https://www.superkicks.in/products/blazer-mid-...,data/raw/images/superkicks/men-sneakers/nike/b...,superkicks,INR,"[white, silver]"
...,...,...,...,...,...,...,...,...,...
6954,nike,nike court borough mid 2,74.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,footshop,EUR,"[university red, black-white]"
6955,new balance,new balance ct302,114.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,footshop,EUR,[all black]
6956,vans,vans kids old skool,59.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,footshop,EUR,"[navy, true white]"
6957,vans,vans old skool kids,59.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,footshop,EUR,"[black, black]"


In [10]:
merged_datasets.groupby("title").agg({
    "brand": list,
    "collection_name": list,
    "color": list,
    "images_path": list,
    "price": list,
    "pricecurrency": list,
    "url": list,
    "website": list
})

Unnamed: 0_level_0,brand,collection_name,color,images_path,price,pricecurrency,url,website
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
01 low m bi eden,[autry],[category-men],"[[white, green]]",[data/raw/images/sneakerbaas/category-men/autr...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
01 low m bi violet,[autry],[category-men],"[[white, purple]]",[data/raw/images/sneakerbaas/category-men/autr...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
01 low m black,[autry],[category-men],"[[black, white, creme]]",[data/raw/images/sneakerbaas/category-men/autr...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
01 low mustard,[autry],[category-men],"[[yellow, white]]",[data/raw/images/sneakerbaas/category-men/autr...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
01 low w bi ivory,[autry],[category-women],"[[black, white]]",[data/raw/images/sneakerbaas/category-women/au...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
...,...,...,...,...,...,...,...,...
zoom vomero 5 sp anthracite anthracite-black-wolf grey,[nike],[men-sneakers],[[grey]],[data/raw/images/superkicks/men-sneakers/nike/...,[14995.0],[INR],[https://www.superkicks.in/products/zoom-vomer...,[superkicks]
zoom vomero 5 sp vast grey vast grey-black-sail,[nike],[men-sneakers],[[grey]],[data/raw/images/superkicks/men-sneakers/nike/...,[14995.0],[INR],[https://www.superkicks.in/products/zoom-vomer...,[superkicks]
zx 22 boost cream white,[adidas originals],[category-men],[[beige]],[data/raw/images/sneakerbaas/category-men/adid...,[59.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
zx 2k boost 2.0 white,[adidas],[women-sneakers],[[white]],[data/raw/images/superkicks/women-sneakers/adi...,[8399.0],[INR],[https://www.superkicks.in/products/zx-2k-boos...,[superkicks]
