In [1]:
# for PyCharm
import os

current_directory = os.getcwd()
project_directory = os.path.abspath(os.path.join(current_directory, os.pardir, os.pardir))
os.chdir(project_directory)

In [2]:
import os

import pandas as pd
from pathlib import Path
import numpy as np
from src.data.merger import SuperkicksFormatter, SneakerbaasFormatter, FootshopFormatter, HighsnobietyFormatter, \
    KickscrewFormatter, check_extra_symbols



In [3]:
metadata_path = "data/raw/metadata"
discovered_datasets = os.listdir(metadata_path)
discovered_datasets

['superkicks.csv',
 'highsnobiety.csv',
 'footshop.csv',
 'kickscrew.csv',
 'sneakerbaas.csv']

In [4]:
try:
    datasets = {
        source: pd.read_csv(str(Path(metadata_path, source))) for source in discovered_datasets
    }
    assert len(datasets) > 0
except FileNotFoundError as err:
    print(f"Some dataset could not be resolved:")
    raise

In [5]:
processed_datasets = {
    "superkicks": SuperkicksFormatter(datasets["superkicks.csv"]).format(),
    "sneakerbaas": SneakerbaasFormatter(datasets["sneakerbaas.csv"]).format(),
    "footshop": FootshopFormatter(datasets["footshop.csv"]).format(),
    "highsnobiety": HighsnobietyFormatter(datasets["highsnobiety.csv"]).format(),
    "kickscrew": KickscrewFormatter(datasets["kickscrew.csv"]).format(),
}
check_extra_symbols(processed_datasets)

superkicks set()
sneakerbaas set()
footshop set()
highsnobiety set()
kickscrew {'~', '!', '%', '=', ';'}


In [35]:
processed_datasets["kickscrew"]

Unnamed: 0,url,brand,pricecurrency,price,title,right-side-img,left-side-img,front-both-img,website
0,https://www.kickscrew.com/products/new-balance...,New Balance,USD,88,new balance 530 white natural indigo,data/raw/images/kickscrew/New Balance/new-bala...,data/raw/images/kickscrew/New Balance/new-bala...,data/raw/images/kickscrew/New Balance/new-bala...,kickscrew
1,https://www.kickscrew.com/products/nike-nike-s...,Nike,USD,130,nike sportswear down fill sports hooded down j...,data/raw/images/kickscrew/Nike/nike-nike-sport...,data/raw/images/kickscrew/Nike/nike-nike-sport...,data/raw/images/kickscrew/Nike/nike-nike-sport...,kickscrew
2,https://www.kickscrew.com/products/nike-lebron...,Nike,USD,73,nike lebron witness 6 ep white melon tint,data/raw/images/kickscrew/Nike/nike-lebron-wit...,data/raw/images/kickscrew/Nike/nike-lebron-wit...,data/raw/images/kickscrew/Nike/nike-lebron-wit...,kickscrew
3,https://www.kickscrew.com/products/new-balance...,New Balance,USD,75,new balance 327 brown white,data/raw/images/kickscrew/New Balance/new-bala...,data/raw/images/kickscrew/New Balance/new-bala...,data/raw/images/kickscrew/New Balance/new-bala...,kickscrew
4,https://www.kickscrew.com/products/nike-dunk-l...,Nike,USD,94,(gs) nike dunk low retro panda,data/raw/images/kickscrew/Nike/nike-dunk-low-r...,data/raw/images/kickscrew/Nike/nike-dunk-low-r...,data/raw/images/kickscrew/Nike/nike-dunk-low-r...,kickscrew
...,...,...,...,...,...,...,...,...,...
4895,https://www.kickscrew.com/products/adidas-ultr...,adidas,USD,123,adidas ultraboost 1.0 shoes valentines day,data/raw/images/kickscrew/adidas/adidas-ultrab...,data/raw/images/kickscrew/adidas/adidas-ultrab...,data/raw/images/kickscrew/adidas/adidas-ultrab...,kickscrew
4896,https://www.kickscrew.com/products/adidas-orig...,adidas,USD,102,adidas originals ozweego shoes magic beige cla...,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...,kickscrew
4897,https://www.kickscrew.com/products/adidas-orig...,adidas,USD,56,adidas originals adilette slides off white bri...,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...,kickscrew
4898,https://www.kickscrew.com/products/adidas-orig...,adidas,USD,84,adidas originals forum low shoes white grey red,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...,kickscrew


In [7]:
for key, value in processed_datasets.items():
    print(value.shape)

(1090, 9)
(1005, 9)
(4864, 9)
(598, 9)
(4900, 9)


In [8]:
# same columns
same_columns = set.intersection(*[set(processed_datasets[key].columns) for key in processed_datasets])
same_columns

{'brand', 'price', 'pricecurrency', 'title', 'url', 'website'}

In [9]:
merged_datasets = pd.concat(list(processed_datasets.values()), ignore_index=True)
merged_datasets

Unnamed: 0,brand,title,price,collection_name,url,images_path,website,pricecurrency,color,right-side-img,left-side-img,front-both-img
0,converse,wmns run star legacy cx periwinkle,8499.0,men-sneakers,https://www.superkicks.in/products/wmns-run-st...,data/raw/images/superkicks/men-sneakers/conver...,superkicks,INR,[periwinkle],,,
1,converse,chuck taylor all star charcoal,4299.0,men-sneakers,https://www.superkicks.in/products/chuck-taylo...,data/raw/images/superkicks/men-sneakers/conver...,superkicks,INR,[charcoal],,,
2,nike,kd16 nrg ep pink foam pink white,14995.0,men-sneakers,https://www.superkicks.in/products/kd16-nrg-ep...,data/raw/images/superkicks/men-sneakers/nike/k...,superkicks,INR,"[pink, foam, white]",,,
3,jordan,air jordan 1 retro high og black royal blue-wh...,16995.0,men-sneakers,https://www.superkicks.in/products/air-jordan-...,data/raw/images/superkicks/men-sneakers/jordan...,superkicks,INR,"[black, blue]",,,
4,nike,blazer mid 77 premium summit white black-light...,7756.0,men-sneakers,https://www.superkicks.in/products/blazer-mid-...,data/raw/images/superkicks/men-sneakers/nike/b...,superkicks,INR,"[white, silver]",,,
...,...,...,...,...,...,...,...,...,...,...,...,...
12452,adidas,adidas ultraboost 1.0 shoes valentines day,123.0,,https://www.kickscrew.com/products/adidas-ultr...,,kickscrew,USD,,data/raw/images/kickscrew/adidas/adidas-ultrab...,data/raw/images/kickscrew/adidas/adidas-ultrab...,data/raw/images/kickscrew/adidas/adidas-ultrab...
12453,adidas,adidas originals ozweego shoes magic beige cla...,102.0,,https://www.kickscrew.com/products/adidas-orig...,,kickscrew,USD,,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...
12454,adidas,adidas originals adilette slides off white bri...,56.0,,https://www.kickscrew.com/products/adidas-orig...,,kickscrew,USD,,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...
12455,adidas,adidas originals forum low shoes white grey red,84.0,,https://www.kickscrew.com/products/adidas-orig...,,kickscrew,USD,,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...,data/raw/images/kickscrew/adidas/adidas-origin...


In [10]:
test = merged_datasets.groupby("title").agg({
    "brand": list,
    "collection_name": list,
    "color": list,
    "images_path": list,
    "price": list,
    "pricecurrency": list,
    "url": list,
    "website": list
})
test = test.reset_index()

In [11]:
test

Unnamed: 0,title,brand,collection_name,color,images_path,price,pricecurrency,url,website
0,(gs) adidas predator edge.3 multi-ground boots...,[adidas],[nan],[nan],[nan],[105.0],[USD],[https://www.kickscrew.com/products/adidas-pre...,[kickscrew]
1,(gs) adidas superstar xlg whote black,[adidas],[nan],[nan],[nan],[75.0],[USD],[https://www.kickscrew.com/products/gs-adidas-...,[kickscrew]
2,(gs) adidas ultra boost 22 magic mauve,[adidas],[nan],[nan],[nan],[111.0],[USD],[https://www.kickscrew.com/products/gs-adidas-...,[kickscrew]
3,(gs) adidas ultraboost 22 triple black,[adidas],[nan],[nan],[nan],[199.0],[USD],[https://www.kickscrew.com/products/gs-adidas-...,[kickscrew]
4,(gs) air jordan 1 low aquatone,[Air Jordan],[nan],[nan],[nan],[100.0],[USD],[https://www.kickscrew.com/products/gs-air-jor...,[kickscrew]
...,...,...,...,...,...,...,...,...,...
8939,zoom vomero 5 sp anthracite anthracite-black-w...,[nike],[men-sneakers],[[grey]],[data/raw/images/superkicks/men-sneakers/nike/...,[14995.0],[INR],[https://www.superkicks.in/products/zoom-vomer...,[superkicks]
8940,zoom vomero 5 sp vast grey vast grey-black-sail,[nike],[men-sneakers],[[grey]],[data/raw/images/superkicks/men-sneakers/nike/...,[14995.0],[INR],[https://www.superkicks.in/products/zoom-vomer...,[superkicks]
8941,zx 22 boost cream white,[adidas originals],[category-men],[[beige]],[data/raw/images/sneakerbaas/category-men/adid...,[59.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
8942,zx 2k boost 2.0 white,[adidas],[women-sneakers],[[white]],[data/raw/images/superkicks/women-sneakers/adi...,[8399.0],[INR],[https://www.superkicks.in/products/zx-2k-boos...,[superkicks]


In [13]:
titles = test["title"].tolist()

In [None]:
!pip install thefuzz -q

In [34]:
from thefuzz import fuzz

def similarity(string1, string2):
    return fuzz.token_sort_ratio(string1, string2)
# fuzz.partial_ratio(string1, string2)/100 * 
title = "(gs) adidas ultraboost 22 triple black"

similarities = []
for text in titles:
    similarities.append((text, similarity(title, text)))

similarities.sort(key=lambda x: x[1])
similarities[-50:]

[('adidas retropy e5 triple black', 61),
 ('adidas ultraboost 1.0 w', 61),
 ('adidas velosamba black gum', 61),
 ('adidas vs pace 2.0 3 stripes shoes white black', 61),
 ('adidas yeezy boost 700 mnvn triple black', 61),
 ('asics - gel-trabuco terra sps black', 61),
 ('(gs) adidas superstar xlg whote black', 62),
 ('adidas d rose 773 2020 black', 62),
 ('adidas pro bounce 2018 triple black', 62),
 ('adidas pro model 2g black', 62),
 ('adidas solar boost core black', 62),
 ('adidas x stella mccartney ultraboost speed', 62),
 ('asics gel nimbus 23 triple black', 62),
 ('asics gel nimbus 25 triple black', 62),
 ('adidas - superstar 82 white black', 63),
 ('adidas pureboost go clear brown', 63),
 ('adidas terrex speed ultra trail black turbo', 63),
 ('adidas ultraboost 1.0', 63),
 ('(gs) adidas ultra boost 22 magic mauve', 64),
 ('adidas d rose 773 2020 black gold', 64),
 ('adidas lego x ultraboost dna color pack - multi', 64),
 ('adidas pro model 2g core black', 64),
 ('adidas superstar sl