# EDA notebook

In [1]:
# for PyCharm
import os
from collections import Counter

import pandas as pd

from sneakers_ml.data.image import (
    get_images_count,
    get_images_formats,
    get_images_modes,
    get_images_sizes,
    get_images_suffixes,
)
from sneakers_ml.data.merger import Merger

current_directory = os.getcwd()
project_directory = os.path.abspath(
    os.path.join(current_directory, os.pardir, os.pardir)
)
os.chdir(project_directory)

In [3]:
# !dvc pull data/merged/metadata

# Main dataset

In [4]:
df = pd.read_csv("data/merged/metadata/main_dataset.csv")
df

Unnamed: 0,title_merge,brand_merge,images_path,price,pricecurrency,color,website
0,01 low,autry,['data/raw/images/sneakerbaas/category-men/aut...,129.99,EUR,"['Mustard', 'White']","['sneakerbaas', 'sneakerbaas']"
1,01 low m,autry,['data/raw/images/sneakerbaas/category-men/aut...,149.99,EUR,"['BI EDEN', 'BI VIOLET', 'BLACK']","['sneakerbaas', 'sneakerbaas', 'sneakerbaas']"
2,01 low man,autry,['data/raw/images/sneakerbaas/category-men/aut...,149.99,EUR,['LEAT/LEAT RED'],['sneakerbaas']
3,01 low w,autry,['data/raw/images/sneakerbaas/category-women/a...,149.99,EUR,"['BI IVORY', 'BI POW', 'SALTWATER', 'WHT/CORAL']","['sneakerbaas', 'sneakerbaas', 'sneakerbaas', ..."
4,01 low wom,autry,['data/raw/images/sneakerbaas/category-women/a...,149.99,EUR,['WHT/PETROL'],['sneakerbaas']
...,...,...,...,...,...,...,...
872,zoom verona slip,nike,['data/raw/images/sneakerbaas/category-men/nik...,49.99,EUR,"['Leo Baker', 'Light Dew']","['sneakerbaas', 'sneakerbaas']"
873,zoom verona slip x leticia bufon,nike,['data/raw/images/sneakerbaas/category-unisex/...,39.99,EUR,[''],['sneakerbaas']
874,zoom vomero 5 sp,nike,['data/raw/images/superkicks/men-sneakers/nike...,14995.00,INR,"['ANTHRACITE/ANTHRACITE-BLACK-WOLF GREY', 'VAS...","['superkicks', 'superkicks']"
875,zx 22 boost,adidas,['data/raw/images/sneakerbaas/category-men/adi...,59.99,EUR,['Cream White'],['sneakerbaas']


## Preparations

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 877 entries, 0 to 876
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title_merge    877 non-null    object 
 1   brand_merge    877 non-null    object 
 2   images_path    877 non-null    object 
 3   price          877 non-null    float64
 4   pricecurrency  877 non-null    object 
 5   color          877 non-null    object 
 6   website        877 non-null    object 
dtypes: float64(1), object(6)
memory usage: 48.1+ KB


In [6]:
df.describe(include="all")

Unnamed: 0,title_merge,brand_merge,images_path,price,pricecurrency,color,website
count,877,877,877,877.0,877,877,877
unique,877,30,877,,2,704,50
top,01 low,nike,['data/raw/images/sneakerbaas/category-men/aut...,,INR,[''],['sneakerbaas']
freq,1,185,1,,499,68,230
mean,,,,5677.451471,,,
std,,,,5823.709353,,,
min,,,,11.95,,,
25%,,,,99.95,,,
50%,,,,5499.0,,,
75%,,,,9749.0,,,


# Пропущенные значения отсутствуют

In [7]:
df.isna().sum()

title_merge      0
brand_merge      0
images_path      0
price            0
pricecurrency    0
color            0
website          0
dtype: int64

In [8]:
unique_brands = df["brand_merge"].value_counts()
unique_brands

brand_merge
nike                    185
puma                    137
adidas                  123
reebok                   72
converse                 57
jordan                   44
vans                     35
asics                    26
kangaroos                24
new balance              22
clarks                   17
autry                    17
toms                     17
veja                     15
saucony                  13
mizuno                   11
diadora                  10
salomon                  10
karhu                     9
hi tec                    7
etonic                    6
suicoke                   5
stepney workers club      4
sergio tacchini           3
y 3                       2
merrell                   2
fila                      1
crocs                     1
represent                 1
warrior                   1
Name: count, dtype: int64

# Мердж в основном происходит внутри одного вебсайта, между сайтами почти нету пересечений

In [9]:
df["website"].apply(lambda x: set(eval(x))).value_counts()

website
{superkicks}                 463
{sneakerbaas}                378
{superkicks, sneakerbaas}     36
Name: count, dtype: int64

# Топ цветов - тяжело посчитать, все цвета имеют разные названия

In [10]:
Counter(Merger.flatten_list(df["color"].apply(eval).values)).most_common(10)

[('', 1),
 ('ACADEMY PINK', 1),
 ('ACTION PRT', 1),
 ('ALUMINIUM/WHITE/BURGUNDY', 1),
 ('ALUMINIUM/WONDER MAUVE', 1),
 ('ANIMAL PRINT', 1),
 ('ANTHRACITE/ANTHRACITE-BLACK-WOLF GREY', 1),
 ('AQUA', 1),
 ('AQUAMARINE', 1),
 ('ARCHEO PINK', 1)]

In [11]:
by_brands = pd.read_csv("data/merged/metadata/brands_dataset.csv")
by_models = pd.read_csv("data/merged/metadata/models_dataset.csv")
brands_images_path = "data/merged/images/by-brands"
models_images_path = "data/merged/images/by-models"

# Датасет брендов

## Топ брендов по количеству картинок

In [12]:
by_brands.sort_values(by="unique_images_count", ascending=False)[
    ["brand_merge", "unique_images_count"]
]

Unnamed: 0,brand_merge,unique_images_count
16,nike,1527
0,adidas,923
17,puma,608
18,reebok,370
4,converse,361
10,jordan,337
1,asics,279
21,saucony,193
15,new balance,184
12,karhu,178


# Количество картинок в датасете по брендам

In [13]:
get_images_count(brands_images_path)

5892

## Расширения картинок - проверили, что всё приведено к одному формату

In [14]:
set(get_images_formats(brands_images_path))

{'JPEG'}

## Расширения файлов картинок

In [15]:
set(get_images_suffixes(brands_images_path))

{'.jpeg'}

# Палитра картинок - все картинки имеют три канала

In [16]:
set(get_images_modes(brands_images_path))

{'RGB'}

## Размеры картинок

In [17]:
sizes = get_images_sizes(brands_images_path)
print("Max size:", max(sizes, key=lambda x: (x[0], x[1])))
print("Min size:", min(sizes, key=lambda x: (x[0], x[1])))

Max size: (2000, 2000)
Min size: (432, 381)


# Датасет моделей

## Топ моделей по количеству картинок

In [18]:
by_models.sort_values(by="unique_images_count", ascending=False)[
    ["title_merge", "unique_images_count"]
]

Unnamed: 0,title_merge,unique_images_count
577,shadow 6000,95
327,fusion 20,59
730,wmns air force 1 07,54
34,550,53
576,shadow 5000,47
...,...,...
552,rs z reinvent,1
773,wmns blazer low platform pink glaze,1
574,sdu j mesh,1
122,answer 5 x eric emanuel,1


## Количество картинок

In [19]:
get_images_count(models_images_path)

5922

## Расширения картинок

In [20]:
set(get_images_formats(models_images_path))

{'JPEG'}

## Расширения файлов картинок

In [21]:
set(get_images_suffixes(models_images_path))

{'.jpeg'}

# Палитра картинок

In [22]:
set(get_images_modes(models_images_path))

{'RGB'}

# Размеры картинок

In [23]:
sizes = get_images_sizes(models_images_path)
print("Max size:", max(sizes, key=lambda x: (x[0], x[1])))
print("Min size:", min(sizes, key=lambda x: (x[0], x[1])))

Max size: (2000, 2000)
Min size: (432, 381)
