In [1]:
import os
import sys

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import cv2
from tqdm import tqdm

Не забудьте положить описательные csv файлы (train_images и test_images) в папку с этим .ipynb-файлом

In [4]:
path_folder = os.getcwd()

In [5]:
path_folder_images = os.path.join(path_folder, 'images')

In [6]:
train_data_dir = os.path.join(path_folder_images, 'train')

In [10]:
train_cats = pd.read_csv('train_images.csv', sep='\t', index_col=0)

Будем делать сбалансированную выборку по 32 (размер батча) картинки в каждой категории


In [15]:
#UnderSampling
def under_sample(df, n_sample):
    desc_cats = df['category_id'].value_counts().to_frame()
    desc_cats = desc_cats[desc_cats['category_id']>n_sample]
    
    ids_for_undersample = desc_cats.index
    
    under_sample_idxs = []
    for idx in ids_for_undersample:
        indices_current_cat = df[df['category_id']==idx].index
        random_indices = np.random.choice(indices_current_cat, n_sample, replace=False)
        
        under_sample_idxs.append(list(random_indices))
    
    return_ix = np.array(under_sample_idxs).reshape(1, -1)[0]
    return df.loc[return_ix]

In [16]:
train_cats_under = under_sample(train_cats, 32)

In [24]:
train_cats_under['category_id'].value_counts().describe()

count    471.0
mean      32.0
std        0.0
min       32.0
25%       32.0
50%       32.0
75%       32.0
max       32.0
Name: category_id, dtype: float64

In [18]:
#Prepare for OverSampling
def prepare_over_sample(df, n_sample):
    desc_cats = df['category_id'].value_counts().to_frame()
    desc_cats = desc_cats[desc_cats['category_id']<=n_sample]
    
    ids_for_oversample = desc_cats.index
    
    pre_over_sample_idxs = []
    for idx in ids_for_oversample:
        indices_current_cat = df[df['category_id']==idx].index
        
        pre_over_sample_idxs.append(list(indices_current_cat))
    
    return_ix = np.array([a for b in pre_over_sample_idxs for a in b])
    return df.loc[return_ix]

In [19]:
train_cats_for_over = prepare_over_sample(train_cats, 32)

In [20]:
def bootstrap_oversample(df, n_samples):
    cats = np.unique(df['category_id'].values)
    
    samples = []
    
    for cat in cats:
        current_group = df[df['category_id']==cat]
        for i in range(n_samples):
            sample = current_group.sample()
            samples.append(list(sample.values[0]))
            
    df_new = pd.DataFrame(samples)
    
    
    return df_new.rename(columns = {0:'product_id', 1:'category_id'})

In [21]:
train_cats_oversampled = bootstrap_oversample(train_cats_for_over, 32)

In [23]:
train_cats_oversampled['category_id'].value_counts().describe()

count    403.0
mean      32.0
std        0.0
min       32.0
25%       32.0
50%       32.0
75%       32.0
max       32.0
Name: category_id, dtype: float64

In [25]:
train_cats = train_cats_under.append(train_cats_oversampled, ignore_index=True)

  train_cats = train_cats_under.append(train_cats_oversampled, ignore_index=True)


In [27]:
train_cats.tail()

Unnamed: 0,product_id,category_id
27963,1599349,15075
27964,1609848,15075
27965,613538,15075
27966,1283165,15075
27967,1926450,15075


In [28]:
train_cats['image'] = [(str(x)+'.jpg') for x in train_cats['product_id'].values]

In [29]:
train_cats.head()

Unnamed: 0,product_id,category_id,image
0,321012,11937,321012.jpg
1,420744,11937,420744.jpg
2,1183152,11937,1183152.jpg
3,114833,11937,114833.jpg
4,812016,11937,812016.jpg


In [32]:
def create_train(): 
    train_images=[]              
    for img in tqdm(train_cats['image']):       
        final_path=os.path.join(train_data_dir, img)     
        train_img=cv2.imread(final_path)     
        img_arr=cv2.resize(train_img,(64,64)) ## преобразуем картинки (512,512) в (64,64) 
        train_images.append(img_arr)
        
    return np.array(train_images, dtype=float)

In [33]:
train_images = create_train()

100%|█████████████████████████████████████████████████████████████████████████████| 27968/27968 [19:53<00:00, 23.44it/s]


In [34]:
len(train_images)

27968

In [35]:
train_images[0][0][1]

array([240., 240., 240.])

In [36]:
# нормализация картинок train
b_count = 0
for b in tqdm(train_images):
    train_images[b_count] = b / 255
    b_count += 1

100%|████████████████████████████████████████████████████████████████████████████| 27968/27968 [00:43<00:00, 643.04it/s]


In [37]:
train_images[0][0][1]

array([0.94117647, 0.94117647, 0.94117647])

In [38]:
#сохраним загруженные train картинки
np.save('train_images_np.npy',train_images)

In [45]:
train_cats['cats_id']=train_cats['category_id'].astype('category')
train_cats['labels']=train_cats['cats_id'].cat.codes

In [46]:
train_labels=np.array(train_cats['labels'].values)

In [47]:
train_labels = train_labels.reshape(train_labels.shape[0],-1)

In [48]:
train_cats['category_id'].nunique() #874 категории

874

In [49]:
train_labels=np.eye(874)[train_labels.reshape(-1)]

In [50]:
train_labels.shape

(27968, 874)

In [51]:
train_images.shape

(27968, 64, 64, 3)

In [54]:
#сохраним метки для train
np.save('train_labels_np.npy', train_labels)

In [60]:
# сохраним csv файл с описанием картинок train
train_cats.to_csv('train_desc_images.csv', sep='\t')

Готовим тестовые данные

In [39]:
test_data_dir = os.path.join(path_folder_images, 'test')

In [40]:
test_cats = pd.read_csv('test_images.csv', sep='\t', index_col=0)
test_cats['image'] = [(str(x)+'.jpg') for x in test_cats['product_id'].values]

In [41]:
def create_test(): 
    test_images=[]
    for img in tqdm(test_cats['image']):       
        final_path=os.path.join(test_data_dir, img)     
        test_img=cv2.imread(final_path)     
        img_arr=cv2.resize(test_img,(64,64)) ## преобразуем картинки (512,512) в (64,64) 
        test_images.append(img_arr)
        
    return np.array(test_images, dtype=float)

In [42]:
test_images = create_test()

100%|█████████████████████████████████████████████████████████████████████████████| 16860/16860 [15:19<00:00, 18.35it/s]


In [43]:
len(test_images)

16860

In [44]:
test_images[0][0][1]

array([240., 240., 240.])

In [55]:
# нормализация картинок test
b_count = 0
for b in tqdm(test_images):
    test_images[b_count] = b / 255
    b_count += 1

100%|██████████████████████████████████████████████████████████████████████████| 16860/16860 [00:00<00:00, 29581.98it/s]


In [56]:
test_images[0][0][1]

array([0.94117647, 0.94117647, 0.94117647])

In [57]:
#сохраним загруженные test картинки
np.save('test_images_np.npy',test_images)

In [61]:
# сохраним csv файл с описанием картинок test
test_cats.to_csv('test_desc_images.csv', sep='\t')