In [1]:
# for PyCharm
import os

current_directory = os.getcwd()
project_directory = os.path.abspath(os.path.join(current_directory, os.pardir, os.pardir))
os.chdir(project_directory)

In [2]:
import os

import pandas as pd
from pathlib import Path
import numpy as np
from src.data.merger import SuperkicksFormatter, SneakerbaasFormatter, FootshopFormatter, HighsnobietyFormatter, \
    KickscrewFormatter, check_extra_symbols



In [3]:
metadata_path = "data/raw/metadata"
discovered_datasets = os.listdir(metadata_path)
discovered_datasets

['superkicks.csv',
 'highsnobiety.csv',
 'footshop.csv',
 'kickscrew.csv',
 'sneakerbaas.csv']

In [4]:
try:
    datasets = {
        source: pd.read_csv(str(Path(metadata_path, source))) for source in discovered_datasets
    }
    assert len(datasets) > 0
except FileNotFoundError as err:
    print(f"Some dataset could not be resolved:")
    raise

In [5]:
processed_datasets = {
    "superkicks": SuperkicksFormatter(datasets["superkicks.csv"]).format(),
    "sneakerbaas": SneakerbaasFormatter(datasets["sneakerbaas.csv"]).format(),
    "footshop": FootshopFormatter(datasets["footshop.csv"]).format(),
    "highsnobiety": HighsnobietyFormatter(datasets["highsnobiety.csv"]).format(),
    "kickscrew": KickscrewFormatter(datasets["kickscrew.csv"]).format(),
}
check_extra_symbols(processed_datasets)

superkicks {')', '('}
sneakerbaas {')', '('}
footshop {')', '('}
highsnobiety {')', '('}
kickscrew {')', '('}


In [6]:
for key, value in processed_datasets.items():
    print(value.shape)

(1090, 9)
(1005, 9)
(4864, 9)
(598, 7)
(4900, 7)


In [7]:
# same columns
same_columns = set.intersection(*[set(processed_datasets[key].columns) for key in processed_datasets])
same_columns
# no colors and collection-name

{'brand', 'images_path', 'price', 'pricecurrency', 'title', 'url', 'website'}

In [8]:
merged_datasets = pd.concat(list(processed_datasets.values()), ignore_index=True)
merged_datasets

Unnamed: 0,brand,title,price,collection_name,url,images_path,website,pricecurrency,color
0,converse,wmns run star legacy cx periwinkle,8499.0,men-sneakers,https://www.superkicks.in/products/wmns-run-st...,data/raw/images/superkicks/men-sneakers/conver...,superkicks,INR,[periwinkle]
1,converse,chuck taylor all star charcoal,4299.0,men-sneakers,https://www.superkicks.in/products/chuck-taylo...,data/raw/images/superkicks/men-sneakers/conver...,superkicks,INR,[charcoal]
2,nike,kd16 nrg ep pink foam pink white,14995.0,men-sneakers,https://www.superkicks.in/products/kd16-nrg-ep...,data/raw/images/superkicks/men-sneakers/nike/k...,superkicks,INR,"[pink, foam, white]"
3,jordan,air jordan 1 retro high og black royal blue-wh...,16995.0,men-sneakers,https://www.superkicks.in/products/air-jordan-...,data/raw/images/superkicks/men-sneakers/jordan...,superkicks,INR,"[black, blue]"
4,nike,blazer mid 77 premium summit white black-light...,7756.0,men-sneakers,https://www.superkicks.in/products/blazer-mid-...,data/raw/images/superkicks/men-sneakers/nike/b...,superkicks,INR,"[white, silver]"
...,...,...,...,...,...,...,...,...,...
12452,adidas,adidas ultraboost 1.0 shoes valentines day,123.0,,https://www.kickscrew.com/products/adidas-ultr...,[data/raw/images/kickscrew/adidas/adidas-ultra...,kickscrew,USD,
12453,adidas,adidas originals ozweego shoes magic beige cla...,102.0,,https://www.kickscrew.com/products/adidas-orig...,[data/raw/images/kickscrew/adidas/adidas-origi...,kickscrew,USD,
12454,adidas,adidas originals adilette slides off white bri...,56.0,,https://www.kickscrew.com/products/adidas-orig...,[data/raw/images/kickscrew/adidas/adidas-origi...,kickscrew,USD,
12455,adidas,adidas originals forum low shoes white grey red,84.0,,https://www.kickscrew.com/products/adidas-orig...,[data/raw/images/kickscrew/adidas/adidas-origi...,kickscrew,USD,


In [19]:
# group by по названиям, если все названия будут одинаковые, то все дублирующие кроссы сложатся в одно.
test = merged_datasets.groupby("title").agg({
    "brand": list,
    "collection_name": list,
    "color": list,
    "images_path": list,
    "price": list,
    "pricecurrency": list,
    "url": list,
    "website": list
}).reset_index().sort_values(by="title")
test

Unnamed: 0,title,brand,collection_name,color,images_path,price,pricecurrency,url,website
0,(gs) adidas predator edge.3 multi-ground boots...,[adidas],[nan],[nan],[[data/raw/images/kickscrew/adidas/adidas-pred...,[105.0],[USD],[https://www.kickscrew.com/products/adidas-pre...,[kickscrew]
1,(gs) adidas superstar xlg whote black,[adidas],[nan],[nan],[[data/raw/images/kickscrew/adidas/gs-adidas-s...,[75.0],[USD],[https://www.kickscrew.com/products/gs-adidas-...,[kickscrew]
2,(gs) adidas ultra boost 22 magic mauve,[adidas],[nan],[nan],[[data/raw/images/kickscrew/adidas/gs-adidas-u...,[111.0],[USD],[https://www.kickscrew.com/products/gs-adidas-...,[kickscrew]
3,(gs) adidas ultraboost 22 triple black,[adidas],[nan],[nan],[[data/raw/images/kickscrew/adidas/gs-adidas-u...,[199.0],[USD],[https://www.kickscrew.com/products/gs-adidas-...,[kickscrew]
4,(gs) air jordan 1 low aquatone,[Air Jordan],[nan],[nan],[[data/raw/images/kickscrew/Air Jordan/gs-air-...,[100.0],[USD],[https://www.kickscrew.com/products/gs-air-jor...,[kickscrew]
...,...,...,...,...,...,...,...,...,...
8939,zoom vomero 5 sp anthracite anthracite-black-w...,[nike],[men-sneakers],[[grey]],[data/raw/images/superkicks/men-sneakers/nike/...,[14995.0],[INR],[https://www.superkicks.in/products/zoom-vomer...,[superkicks]
8940,zoom vomero 5 sp vast grey vast grey-black-sail,[nike],[men-sneakers],[[grey]],[data/raw/images/superkicks/men-sneakers/nike/...,[14995.0],[INR],[https://www.superkicks.in/products/zoom-vomer...,[superkicks]
8941,zx 22 boost cream white,[adidas originals],[category-men],[[beige]],[data/raw/images/sneakerbaas/category-men/adid...,[59.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
8942,zx 2k boost 2.0 white,[adidas],[women-sneakers],[[white]],[data/raw/images/superkicks/women-sneakers/adi...,[8399.0],[INR],[https://www.superkicks.in/products/zx-2k-boos...,[superkicks]


In [11]:
# sizes
print(merged_datasets.shape)
print(test.shape)

(12457, 9)
(8944, 9)


In [12]:
titles = test["title"].tolist()

In [13]:
!pip install thefuzz -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [17]:
from thefuzz import fuzz

def similarity(string1, string2):
    return fuzz.token_sort_ratio(string1, string2)

title = "vans - ua og authentic lx yellow"

similarities = []
for text in titles:
    similarities.append((text, similarity(title, text)))

similarities.sort(key=lambda x: x[1])
similarities[-10:]

[('vans - ua authentic vr3 pw lx beige', 70),
 ('vans vault og authentic frayed lx', 70),
 ('vans vault og authentic lx (canvas)', 70),
 ('authentic los vans', 71),
 ('vans - ua og authentic lx suede brown', 71),
 ('vans - ua og authentic lx suede olive', 71),
 ('vans og authentic lx navy', 73),
 ('vans authentic sneaker yellow', 78),
 ('vans vault og authentic lx', 79),
 ('vans - ua og authentic lx suede yellow', 91)]

In [18]:
# эта тема лучше всего находит похожие, но она хуёвит, когда в названии цвета есть, потому что
# у кроссов очень часто совпадают цвета.