In [1]:
# for PyCharm
import os

current_directory = os.getcwd()
project_directory = os.path.abspath(os.path.join(current_directory, os.pardir, os.pardir))
os.chdir(project_directory)

In [2]:
import os

import pandas as pd
from pathlib import Path
import numpy as np
from src.data.merger import SuperkicksFormatter, SneakerbaasFormatter, FootshopFormatter, HighsnobietyFormatter, \
    KickscrewFormatter, check_extra_symbols



In [3]:
metadata_path = "data/raw/metadata"
discovered_datasets = os.listdir(metadata_path)
discovered_datasets

['superkicks.csv',
 'highsnobiety.csv',
 'footshop.csv',
 'kickscrew.csv',
 'sneakerbaas.csv']

In [4]:
try:
    datasets = {
        source: pd.read_csv(str(Path(metadata_path, source))) for source in discovered_datasets
    }
    assert len(datasets) > 0
except FileNotFoundError as err:
    print(f"Some dataset could not be resolved:")
    raise

In [5]:
processed_datasets = {
    "superkicks": SuperkicksFormatter(datasets["superkicks.csv"]).format(),
    "sneakerbaas": SneakerbaasFormatter(datasets["sneakerbaas.csv"]).format(),
    "footshop": FootshopFormatter(datasets["footshop.csv"]).format(),
    "highsnobiety": HighsnobietyFormatter(datasets["highsnobiety.csv"]).format(),
    "kickscrew": KickscrewFormatter(datasets["kickscrew.csv"]).format(),
}
check_extra_symbols(processed_datasets)

superkicks set()
sneakerbaas set()
footshop set()
highsnobiety set()
kickscrew set()


In [6]:
for key, value in processed_datasets.items():
    print(value.shape)

(1090, 10)
(1005, 10)
(4864, 10)
(598, 8)
(4900, 8)


In [7]:
# same columns
same_columns = set.intersection(*[set(processed_datasets[key].columns) for key in processed_datasets])
same_columns
# no colors and collection-name

{'brand',
 'images_path',
 'price',
 'pricecurrency',
 'title',
 'title_old',
 'url',
 'website'}

In [8]:
merged_datasets = pd.concat(list(processed_datasets.values()), ignore_index=True)
merged_datasets

Unnamed: 0,brand,title,price,collection_name,url,images_path,website,title_old,pricecurrency,color
0,converse,wmns run star legacy cx,8499.0,men-sneakers,https://www.superkicks.in/products/wmns-run-st...,data/raw/images/superkicks/men-sneakers/conver...,superkicks,wmns run star legacy cx periwinkle,INR,[]
1,converse,chuck taylor all star,4299.0,men-sneakers,https://www.superkicks.in/products/chuck-taylo...,data/raw/images/superkicks/men-sneakers/conver...,superkicks,chuck taylor all star charcoal,INR,[]
2,nike,kd16 nrg ep,14995.0,men-sneakers,https://www.superkicks.in/products/kd16-nrg-ep...,data/raw/images/superkicks/men-sneakers/nike/k...,superkicks,kd16 nrg ep pink foam|pink|white,INR,[]
3,jordan,air jordan 1 retro high og royal bluewhiteroyal,16995.0,men-sneakers,https://www.superkicks.in/products/air-jordan-...,data/raw/images/superkicks/men-sneakers/jordan...,superkicks,air jordan 1 retro high og black|royal blue-wh...,INR,[]
4,nike,blazer mid 77 premium summit blacklight,7756.0,men-sneakers,https://www.superkicks.in/products/blazer-mid-...,data/raw/images/superkicks/men-sneakers/nike/b...,superkicks,blazer mid 77 premium summit white|black-light...,INR,[]
...,...,...,...,...,...,...,...,...,...,...
12452,adidas,adidas ultraboost 10 shoes valentines day,123.0,,https://www.kickscrew.com/products/adidas-ultr...,[data/raw/images/kickscrew/adidas/adidas-ultra...,kickscrew,Adidas Ultraboost 1.0 Shoes 'Valentines Day',USD,
12453,adidas,adidas originals ozweego shoes magic clay strata,102.0,,https://www.kickscrew.com/products/adidas-orig...,[data/raw/images/kickscrew/adidas/adidas-origi...,kickscrew,Adidas Originals Ozweego Shoes 'Magic Beige Cl...,USD,
12454,adidas,adidas originals adilette slides off bright,56.0,,https://www.kickscrew.com/products/adidas-orig...,[data/raw/images/kickscrew/adidas/adidas-origi...,kickscrew,Adidas Originals Adilette Slides 'Off White Br...,USD,
12455,adidas,adidas originals forum low shoes,84.0,,https://www.kickscrew.com/products/adidas-orig...,[data/raw/images/kickscrew/adidas/adidas-origi...,kickscrew,Adidas Originals Forum Low Shoes 'White Grey Red',USD,


In [9]:
# group by по названиям, если все названия будут одинаковые, то все дублирующие кроссы сложатся в одно.
test = merged_datasets.groupby("title").agg({
    "brand": list,
    "collection_name": list,
    "color": list,
    "images_path": list,
    "price": list,
    "pricecurrency": list,
    "url": list,
    "website": list
}).reset_index().sort_values(by="title")
test

Unnamed: 0,title,brand,collection_name,color,images_path,price,pricecurrency,url,website
0,01 low,"[autry, autry]","[category-women, category-men]","[[white, beige], [yellow, white]]",[data/raw/images/sneakerbaas/category-women/au...,"[129.99, 149.99]","[EUR, EUR]",[https://www.sneakerbaas.com/collections/sneak...,"[sneakerbaas, sneakerbaas]"
1,01 low m,[autry],[category-men],"[[black, white, creme]]",[data/raw/images/sneakerbaas/category-men/autr...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
2,01 low m bi,[autry],[category-men],"[[white, purple]]",[data/raw/images/sneakerbaas/category-men/autr...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
3,01 low m bi eden,[autry],[category-men],"[[white, green]]",[data/raw/images/sneakerbaas/category-men/autr...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
4,01 low w bi,[autry],[category-women],"[[black, white]]",[data/raw/images/sneakerbaas/category-women/au...,[149.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
...,...,...,...,...,...,...,...,...,...
8233,zoom vomero 5 sp anthracite anthraciteblackwolf,[nike],[men-sneakers],[[]],[data/raw/images/superkicks/men-sneakers/nike/...,[14995.0],[INR],[https://www.superkicks.in/products/zoom-vomer...,[superkicks]
8234,zoom vomero 5 sp vast vast greyblacksail,[nike],[men-sneakers],[[]],[data/raw/images/superkicks/men-sneakers/nike/...,[14995.0],[INR],[https://www.superkicks.in/products/zoom-vomer...,[superkicks]
8235,zx 22 boost,[adidas originals],[category-men],[[beige]],[data/raw/images/sneakerbaas/category-men/adid...,[59.99],[EUR],[https://www.sneakerbaas.com/collections/sneak...,[sneakerbaas]
8236,zx 2k boost 20,[adidas],[women-sneakers],[[]],[data/raw/images/superkicks/women-sneakers/adi...,[8399.0],[INR],[https://www.superkicks.in/products/zx-2k-boos...,[superkicks]


In [10]:
# sizes
print(merged_datasets.shape)
print(test.shape)

(12457, 10)
(8238, 9)


In [11]:
titles = test["title"].tolist()

In [12]:
!pip install thefuzz -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [13]:
from thefuzz import fuzz

def similarity(string1, string2):
    return fuzz.token_sort_ratio(string1, string2)

title = "vans - ua og authentic lx yellow"

similarities = []
for text in titles:
    similarities.append((text, similarity(title, text)))

similarities.sort(key=lambda x: x[1])
similarities[-10:]

[('vans ua og authentic lx vault checkerboard', 69),
 ('vans vault authentic one piece vlt lx', 69),
 ('vans vault og authentic frayed lx', 70),
 ('vans vault og authentic lx canvas', 70),
 ('authentic los vans', 71),
 ('vans og authentic lx stressed', 71),
 ('vans ua authentic vr3 pw lx', 77),
 ('vans ua og authentic lx suede', 78),
 ('vans vault og authentic lx', 79),
 ('vans og authentic lx', 80)]

In [18]:
# эта тема лучше всего находит похожие

In [14]:
test.to_csv("temp.csv")