In [1]:
# for PyCharm
# Move the working directory two levels up from where the kernel started so the
# relative paths used later ("data/raw/...", "src....") resolve.
# NOTE(review): presumably two levels up is the project root — confirm against the repo layout.
import os

# Remember the directory seen on the first run. Without this guard, re-running
# the cell would read the already-changed working directory and climb two more
# levels each time, silently breaking every relative path below.
if "_initial_directory" not in globals():
    _initial_directory = os.getcwd()

current_directory = _initial_directory
project_directory = os.path.abspath(os.path.join(current_directory, os.pardir, os.pardir))
os.chdir(project_directory)

In [2]:
import os

import pandas as pd
from pathlib import Path
import numpy as np
from src.data.merger import SuperkicksFormatter, SneakerbaasFormatter, FootshopFormatter, HighsnobietyFormatter, \
    KickscrewFormatter, check_extra_symbols



In [3]:
# Directory that holds the raw metadata CSV files, relative to the working directory.
metadata_path = "data/raw/metadata"
# Keep only CSV files: os.listdir returns every directory entry, including hidden
# files and sub-directories (e.g. .DS_Store, .ipynb_checkpoints), and each entry
# is later handed to pd.read_csv. Listing order is preserved.
discovered_datasets = [
    entry for entry in os.listdir(metadata_path) if entry.lower().endswith(".csv")
]
discovered_datasets

['superkicks.csv',
 'highsnobiety.csv',
 'footshop.csv',
 'kickscrew.csv',
 'sneakerbaas.csv']

In [4]:
# Load every discovered metadata file into a DataFrame, keyed by its file name
# (e.g. "footshop.csv").
try:
    datasets = {
        source: pd.read_csv(str(Path(metadata_path, source))) for source in discovered_datasets
    }
except FileNotFoundError as err:
    # Include the error so the message names the file that went missing between
    # listing the directory and reading it, then re-raise with the full traceback.
    print(f"Some dataset could not be resolved: {err}")
    raise

# An empty mapping would only fail later with a confusing KeyError, so stop here.
assert len(datasets) > 0, f"No datasets were loaded from {metadata_path!r}"

In [5]:
# One row per shop: output key, raw file name in `datasets`, and the formatter
# class from src.data.merger that handles that shop's frame.
formatter_specs = [
    ("superkicks", "superkicks.csv", SuperkicksFormatter),
    ("sneakerbaas", "sneakerbaas.csv", SneakerbaasFormatter),
    ("footshop", "footshop.csv", FootshopFormatter),
    ("highsnobiety", "highsnobiety.csv", HighsnobietyFormatter),
    ("kickscrew", "kickscrew.csv", KickscrewFormatter),
]

# Run each formatter over its raw frame, in the order listed above.
processed_datasets = {
    shop: formatter(datasets[csv_name]).format()
    for shop, csv_name, formatter in formatter_specs
}

# Reports, per dataset, a set of leftover symbols (exact rule lives in
# src.data.merger.check_extra_symbols — not visible from this notebook).
check_extra_symbols(processed_datasets)

superkicks set()
sneakerbaas set()
footshop set()
highsnobiety set()
kickscrew {'~', '!', '%', '=', ';'}


In [6]:
# Inspect the formatted footshop frame (notebook display of the DataFrame).
processed_datasets["footshop"]

Unnamed: 0,brand,title,color,pricecurrency,price,collection_name,url,images_path,website
0,vans,vans knu skool,"[black, true white]",EUR,97.95,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/261589-v...,data/raw/images/footshop/5-mens-shoes/vans/van...,footshop
1,dr. martens,dr. martens jadon hdw ii,"[black buttero , black 100% recycled da pk mesh]",EUR,274.95,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/297121-d...,data/raw/images/footshop/5-mens-shoes/dr. mart...,footshop
2,adidas originals,adidas samba og,"[core black, ftw white, gum5]",EUR,120.00,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/29598-ad...,data/raw/images/footshop/5-mens-shoes/adidas o...,footshop
3,new balance,new balance 990 v1,"[green, gold]",EUR,146.22,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/279670-n...,data/raw/images/footshop/5-mens-shoes/new bala...,footshop
4,asics,asics x andersson bell gel-sonoma 15-50,"[olive oil, dark brown]",EUR,155.51,5-mens-shoes,https://www.footshop.eu/en/mens-shoes/264451-a...,data/raw/images/footshop/5-mens-shoes/asics/as...,footshop
...,...,...,...,...,...,...,...,...,...
6442,nike,nike court borough mid 2,"[university red, black-white]",EUR,74.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,footshop
6443,new balance,new balance ct302,[all black],EUR,114.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,footshop
6444,vans,vans kids old skool,"[navy, true white]",EUR,59.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,footshop
6445,vans,vans old skool kids,"[black, black]",EUR,59.95,55-kids-sneakers-and-shoes,https://www.footshop.eu/en/kids-sneakers-and-s...,data/raw/images/footshop/55-kids-sneakers-and-...,footshop


In [7]:
# Print the (rows, columns) shape of each formatted dataset, in dict order.
for frame in processed_datasets.values():
    print(frame.shape)

(1090, 9)
(1005, 9)
(4864, 9)
(598, 9)
(4900, 9)


In [8]:
# Columns present in every formatted dataset: intersect the column sets of all frames.
same_columns = set.intersection(
    *(set(frame.columns) for frame in processed_datasets.values())
)
same_columns

{'brand', 'price', 'pricecurrency', 'title', 'url', 'website'}

In [9]:
# Stack all formatted frames row-wise. Columns missing from a frame are filled
# with NaN, and ignore_index=True gives a fresh 0..n-1 row index.
merged_datasets = pd.concat(
    [frame for frame in processed_datasets.values()],
    ignore_index=True,
)
merged_datasets

Unnamed: 0,brand,title,price,collection_name,url,images_path,website,pricecurrency,color,right-side-img,left-side-img,front-both-img
0,converse,wmns run star legacy cx periwinkle,8499.0,men-sneakers,https://www.superkicks.in/products/wmns-run-st...,data/raw/images/superkicks/men-sneakers/conver...,superkicks,INR,[periwinkle],,,
1,converse,chuck taylor all star charcoal,4299.0,men-sneakers,https://www.superkicks.in/products/chuck-taylo...,data/raw/images/superkicks/men-sneakers/conver...,superkicks,INR,[charcoal],,,
2,nike,kd16 nrg ep pink foam pink white,14995.0,men-sneakers,https://www.superkicks.in/products/kd16-nrg-ep...,data/raw/images/superkicks/men-sneakers/nike/k...,superkicks,INR,"[pink, foam, white]",,,
3,jordan,air jordan 1 retro high og black royal blue-wh...,16995.0,men-sneakers,https://www.superkicks.in/products/air-jordan-...,data/raw/images/superkicks/men-sneakers/jordan...,superkicks,INR,"[black, blue]",,,
4,nike,blazer mid 77 premium summit white black-light...,7756.0,men-sneakers,https://www.superkicks.in/products/blazer-mid-...,data/raw/images/superkicks/men-sneakers/nike/b...,superkicks,INR,"[white, silver]",,,
...,...,...,...,...,...,...,...,...,...,...,...,...
12452,adidas,adidas ultraboost 1.0 shoes valentines day,123.0,,https://www.kickscrew.com/products/adidas-ultr...,,kickscrew,USD,,data/raw/images/kickscrew/adidas/adidas-ultrab...,data/raw/images/kickscrew/adidas/adidas-ultrab...,data/raw/images/kickscrew/adidas/adidas-ultrab...
12453,adidas,adidas originals ozweego shoes magic beige cla...,102.0,,https://www.kickscrew.com/products/adidas-orig...,,kickscrew,USD,,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...
12454,adidas,adidas originals adilette slides off white bri...,56.0,,https://www.kickscrew.com/products/adidas-orig...,,kickscrew,USD,,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...
12455,adidas,adidas originals forum low shoes white grey red,84.0,,https://www.kickscrew.com/products/adidas-orig...,,kickscrew,USD,,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...


In [10]:
# Columns whose values are collected into a list for every distinct title.
list_columns = (
    "brand",
    "collection_name",
    "color",
    "images_path",
    "price",
    "pricecurrency",
    "url",
    "website",
)

# Group rows sharing the same title, gather each listed column into a Python
# list, then turn the "title" index back into a regular column.
test = (
    merged_datasets
    .groupby("title")
    .agg(dict.fromkeys(list_columns, list))
    .reset_index()
)

In [11]:
# Display the per-title aggregation; every column except "title" holds a list.
test

Unnamed: 0,title,brand,collection_name,color,images_path,price,pricecurrency,url,website
0,(gs) adidas predator edge.3 multi-ground boots...,[adidas],[nan],[nan],[nan],[105.0],[USD],[https://www.kickscrew.com/products/adidas-pre...,[kickscrew]
1,(gs) adidas superstar xlg whote black,[adidas],[nan],[nan],[nan],[75.0],[USD],[https://www.kickscrew.com/products/gs-adidas-...,[kickscrew]
2,(gs) adidas ultra boost 22 magic mauve,[adidas],[nan],[nan],[nan],[111.0],[USD],[https://www.kickscrew.com/products/gs-adidas-...,[kickscrew]
3,(gs) adidas ultraboost 22 triple black,[adidas],[nan],[nan],[nan],[199.0],[USD],[https://www.kickscrew.com/products/gs-adidas-...,[kickscrew]
4,(gs) air jordan 1 low aquatone,[Air Jordan],[nan],[nan],[nan],[100.0],[USD],[https://www.kickscrew.com/products/gs-air-jor...,[kickscrew]
...,...,...,...,...,...,...,...,...,...
8939,zoom vomero 5 sp anthracite anthracite-black-w...,[nike],[men-sneakers],[[grey]],[data/raw/images/superkicks/men-sneakers/nike/...,[14995.0],[INR],[https://www.superkicks.in/products/zoom-vomer...,[superkicks]
8940,zoom vomero 5 sp vast grey vast grey-black-sail,[nike],[men-sneakers],[[grey]],[data/raw/images/superkicks/men-sneakers/nike/...,[14995.0],[INR],[https://www.superkicks.in/products/zoom-vomer...,[superkicks]
8941,zx 22 boost cream white,[adidas originals],[category-men],[[beige]],[data/raw/images/sneakerbaas/category-men/adid...,[59.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
8942,zx 2k boost 2.0 white,[adidas],[women-sneakers],[[white]],[data/raw/images/superkicks/women-sneakers/adi...,[8399.0],[INR],[https://www.superkicks.in/products/zx-2k-boos...,[superkicks]
