### Инициализация пайплайна
Установим библиотеки из файла requirements, kaggle api и фреймворк PyTorch

In [None]:
!pip install -q -r requirements.txt
!pip install --upgrade --force-reinstall --no-deps kaggle
!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html#%%

Подключим необходимые для работы библиотеки

In [1]:
import os
import random
import numpy as np
import pandas as pd

import cv2
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

import torch
import torchvision
import albumentations
from albumentations import pytorch as AT

Скачаем данные соревнования и распакуем их в папку ./data

In [None]:
# Optional one-off step, kept commented out: download the competition archive
# with the Kaggle CLI, copy it into ./data, unzip it there and delete the archive.
# !kaggle competitions download -c nornikel-ore-contamination-detection
# !cp nornikel-ore-contamination-detection.zip ./data/nornikel-ore-contamination-detection.zip
# os.chdir('./data/')
# !unzip nornikel-ore-contamination-detection.zip
# !rm -rf ./data/nornikel-ore-contamination-detection.zip
# os.chdir('..')
# !pwd

Зададим параметры для корректного отображения текстов графиков,
а также зафиксируем сиды

In [2]:
# Make sure the GPU accelerator is enabled
device = "cuda" if torch.cuda.is_available() else "cpu" # falls back to the CPU when CUDA is unavailable
print(f'device: {device}')

# Matplotlib defaults: a wide default figure and extra-large fonts for
# legends, axis labels, titles and tick labels
params = {'legend.fontsize': 'x-large',
          'figure.figsize':  (15, 5),
          'axes.labelsize':  'x-large',
          'axes.titlesize':  'x-large',
          'xtick.labelsize': 'x-large',
          'ytick.labelsize': 'x-large'}

pylab.rcParams.update(params)

print('Numpy version:', np.__version__)
print('Pandas version:', pd.__version__)

# Fix the seeds of Python, NumPy and PyTorch
# so that results are reproducible
random.seed(42)
np.random.seed(42)
# Last expression of the cell: the returned torch Generator is shown as the cell output
torch.manual_seed(42)

device: cuda
Numpy version: 1.19.5
Pandas version: 1.3.0


<torch._C.Generator at 0x7ff3a1122250>

In [28]:
# Print information about the GPU accelerator in use
!nvidia-smi

Fri Jul 16 19:14:59 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.45.01    Driver Version: 455.45.01    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 2070    On   | 00000000:0A:00.0  On |                  N/A |
| 31%   46C    P0    31W / 175W |    446MiB /  7979MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

### Загрузка и исследование данных

In [3]:
# Load the annotation table; the first CSV column becomes the index
train_df = pd.read_csv('data/train.csv', index_col=0)
# Preview the first rows
train_df.head()

Unnamed: 0,image_name,type,xmin,xmax,ymin,ymax
0,0007Date_01_08_2019.jpg,other,285,368,61,278
1,0013Date_01_08_2019.jpg,armature,187,550,101,253
2,0016Date_01_08_2019.jpg,armature,172,327,13,360
3,0019Date_01_08_2019.jpg,armature,19,267,162,237
4,0019Date_01_08_2019.jpg,armature,309,548,300,376


In [33]:
# Look at the standard summary statistics of the numeric columns
train_df.describe()

Unnamed: 0,xmin,xmax,ymin,ymax
count,2541.0,2541.0,2541.0,2541.0
mean,192.623377,421.927194,101.252656,319.252263
std,142.564916,155.507596,94.012065,89.880843
min,0.0,33.0,0.0,48.0
25%,102.0,302.0,14.0,262.0
50%,152.0,410.0,79.0,341.0
75%,262.0,543.0,167.0,397.0
max,672.0,704.0,394.0,421.0


In [4]:
# Class names in order of first appearance in the annotations
classes = train_df['type'].unique().tolist()

# Map every class name to its integer class number
CLASS_TYPES = {class_name: class_num for class_num, class_name in enumerate(classes)}
CLASS_TYPES

{'other': 0, 'armature': 1, 'wood': 2}

Необходимо разделить данные на train/val/test части, для этого воспользуемся
методом train_test_split из библиотеки sklearn

In [5]:
# Feature table: every column of the annotations except the class label
X = train_df.drop(columns=['type'])
X.head(3)

Unnamed: 0,image_name,xmin,xmax,ymin,ymax
0,0007Date_01_08_2019.jpg,285,368,61,278
1,0013Date_01_08_2019.jpg,187,550,101,253
2,0016Date_01_08_2019.jpg,172,327,13,360


In [6]:
# Target table: what remains after dropping the image name and the box coordinates
y = train_df.drop(columns=['image_name', 'xmin', 'xmax', 'ymin', 'ymax'])
y.head(3)

Unnamed: 0,type
0,other
1,armature
2,armature


In [7]:
# Display the annotation table (pandas elides the middle rows of a long frame)
train_df

Unnamed: 0,image_name,type,xmin,xmax,ymin,ymax
0,0007Date_01_08_2019.jpg,other,285,368,61,278
1,0013Date_01_08_2019.jpg,armature,187,550,101,253
2,0016Date_01_08_2019.jpg,armature,172,327,13,360
3,0019Date_01_08_2019.jpg,armature,19,267,162,237
4,0019Date_01_08_2019.jpg,armature,309,548,300,376
...,...,...,...,...,...,...
2536,9954Date_05_08_2019.jpg,armature,83,228,229,269
2537,9965Date_05_08_2019.jpg,armature,152,287,0,403
2538,9975Date_05_08_2019.jpg,armature,229,324,214,369
2539,9991Date_05_08_2019.jpg,armature,193,358,3,421


In [78]:
# Disabled code, kept for reference: a stratified 70/30 split of train_df by
# 'type', followed by a stratified 50/50 split of the 30% part into
# X_test and X_val (random_state=42 for both).
# NOTE(review): presumably this produced the data/X_*.csv files that are
# loaded further down - confirm before deleting.
# X_train, X_test, y_train, y_test = train_test_split(train_df,
#                                                     train_df['type'],
#                                                     test_size=0.3,
#                                                     random_state=42,
#                                                     stratify=y,
#                                                     shuffle=True)
#
#
# X_test, X_val, y_test, y_val = train_test_split(X_test,
#                                                 X_test['type'],
#                                                 test_size=0.5,
#                                                 random_state=42,
#                                                 stratify=y_test,
#                                                 shuffle=True)

Unnamed: 0,image_name,type,xmin,xmax,ymin,ymax
2010,6055Date_03_08_2019.jpg,armature,26,59,202,350
2181,7499Date_04_08_2019.jpg,armature,141,465,14,283
956,1754Date_02_08_2019.jpg,armature,226,666,189,416
853,14453Date_06_08_2019.jpg,armature,550,692,61,420
925,1511Date_02_08_2019.jpg,armature,106,214,11,182
...,...,...,...,...,...,...
36,0109Date_01_08_2019.jpg,armature,191,270,111,244
1413,33930Date_14_08_2019.jpg,armature,139,588,29,326
776,14005Date_06_08_2019.jpg,armature,34,442,187,305
460,12285Date_05_08_2019.jpg,armature,125,494,1,328


In [10]:
# Cached train / test / validation splits of the annotation table
SPLIT_PATHS = {
    'train': 'data/X_train.csv',
    'test': 'data/X_test.csv',
    'val': 'data/X_val.csv',
}

if all(os.path.exists(path) for path in SPLIT_PATHS.values()):
    # Reuse the cached splits so every run works on exactly the same rows
    X_train = pd.read_csv(SPLIT_PATHS['train'], index_col=0)
    X_test = pd.read_csv(SPLIT_PATHS['test'], index_col=0)
    X_val = pd.read_csv(SPLIT_PATHS['val'], index_col=0)
else:
    # No cache yet (e.g. a fresh checkout): rebuild the splits instead of
    # failing with FileNotFoundError. 70% goes to train, the remaining 30% is
    # halved into test and validation; both splits are stratified by class.
    X_train, X_holdout = train_test_split(train_df,
                                          test_size=0.3,
                                          random_state=42,
                                          stratify=train_df['type'],
                                          shuffle=True)
    X_test, X_val = train_test_split(X_holdout,
                                     test_size=0.5,
                                     random_state=42,
                                     stratify=X_holdout['type'],
                                     shuffle=True)
    # Persist the freshly built splits so later runs load them from the cache
    X_train.to_csv(SPLIT_PATHS['train'])
    X_test.to_csv(SPLIT_PATHS['test'])
    X_val.to_csv(SPLIT_PATHS['val'])


In [11]:
# Look at the class distribution of each resulting split,
# then write the split back to its CSV file
for split_title, split_frame, split_csv in (
    ('Обучающая выборка:', X_train, './data/X_train.csv'),
    ('Тестовая выборка:', X_test, './data/X_test.csv'),
    ('Валидационная выборка:', X_val, './data/X_val.csv'),
):
    print(split_title)
    print(split_frame['type'].value_counts(normalize=True))
    split_frame.to_csv(split_csv)

Обучающая выборка:
armature    0.831271
wood        0.129921
other       0.038808
Name: type, dtype: float64
Тестовая выборка:
armature    0.831579
wood        0.126316
other       0.042105
Name: type, dtype: float64
Валидационная выборка:
armature    0.832461
wood        0.130890
other       0.036649
Name: type, dtype: float64


### Посмотрим на сами изображения

In [22]:
import shutil

# Folder that holds the source images: <cwd>/data/images/all_images
images_path = os.path.join(os.getcwd(), 'data', 'images', 'all_images')
# Prints the number of entries in that folder.
# NOTE(review): the message reads "number of images in the training sample",
# but the count covers everything in all_images - confirm the wording.
print(f'Количество изображений обучающей выборки: {len(os.listdir(images_path))}')

# Disabled code: (re)create an empty <cwd>/data/labels folder
# labels_path = os.path.join(os.getcwd(), 'data', 'labels')
# if not os.path.exists(labels_path):
#     os.makedirs(labels_path)
# else:
#     shutil.rmtree(labels_path)

def _to_yolo_bbox(xmin, xmax, ymin, ymax, image_width, image_height):
    """Convert pixel box corners to normalized (x_center, y_center, width, height)."""
    box_width = xmax - xmin
    box_height = ymax - ymin
    return (
        (xmin + box_width / 2) / image_width,
        (ymin + box_height / 2) / image_height,
        box_width / image_width,
        box_height / image_height,
    )


def convert_df_into_labels(df_subsample: pd.DataFrame, subsample: str) -> None:
    """
    Export one data split as images, per-image label files and an image list.

    Under ``<cwd>/data/images`` this creates or overwrites:
      * ``images/<subsample>/<image_name>`` - an unmodified copy of the source image;
      * ``labels/<subsample>/<image stem>.txt`` - one line per annotated object,
        ``class_num x_center y_center width height``, with the coordinates
        normalized by the image width / height;
      * ``<subsample>.txt`` - one ``./images/<subsample>/<image_name>`` line per image.

    Every output file is rewritten from scratch, so re-running the function
    does not accumulate duplicate boxes or duplicate list entries.

    Relies on the notebook-level names ``images_path`` (folder with the source
    images) and ``CLASS_TYPES`` (class name -> class number).

    :param df_subsample:    Pandas df of subsample for converting to labels, with
                            the columns image_name, type, xmin, xmax, ymin, ymax
    :param subsample:       Name of the subsample; used for the folder names and
                            for the name of the image-list file
    :return:                None
    :raises FileNotFoundError: if a source image is missing or cannot be decoded
    :raises KeyError:       if a row has a type that is not a key of CLASS_TYPES
    """
    # Create folders for subsamples
    subsample_folder = os.path.join(os.getcwd(), 'data', 'images')
    subsample_images_path = os.path.join(subsample_folder, 'images', subsample)
    subsample_labels_path = os.path.join(subsample_folder, 'labels', subsample)
    for folder in (subsample_images_path, subsample_labels_path):
        os.makedirs(folder, exist_ok=True)

    image_list_lines = []
    # Group the rows per image so each image is read, copied and listed once;
    # sort=False keeps the images in order of first appearance
    for image_name, image_rows in df_subsample.groupby('image_name', sort=False):
        # Open the image only to check its size
        image_path = os.path.join(images_path, image_name)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals a missing / unreadable file by returning None
            raise FileNotFoundError(f'Cannot read image: {image_path}')
        image_height, image_width = image.shape[:2]

        # Calculate and normalize the parameters of every object on the image
        label_lines = []
        object_rows = image_rows[['type', 'xmin', 'xmax', 'ymin', 'ymax']]
        for class_name, xmin, xmax, ymin, ymax in object_rows.itertuples(index=False, name=None):
            class_num = CLASS_TYPES[class_name]
            object_x_center, object_y_center, object_width, object_height = _to_yolo_bbox(
                xmin, xmax, ymin, ymax, image_width, image_height
            )
            # Classname X Y Width Height
            label_lines.append(
                f"{class_num} {object_x_center} {object_y_center} {object_width} {object_height}\n"
            )

        # Save the label file; "w" replaces whatever an earlier run left behind
        label_name = f"{os.path.splitext(image_name)[0]}.txt"
        with open(os.path.join(subsample_labels_path, label_name), "w") as file:
            file.writelines(label_lines)

        # Copy the original file as is: no JPEG re-encoding and no risk of
        # saving the pixels with swapped colour channels
        shutil.copyfile(image_path, os.path.join(subsample_images_path, image_name))
        image_list_lines.append(f'./images/{subsample}/{image_name}\n')

    # Save the subsample image names in a .txt file, one line per image
    with open(os.path.join(subsample_folder, f'{subsample}.txt'), "w") as file:
        file.writelines(image_list_lines)

# Export every split under the folder / list-file name that matches it
# (the validation frame goes to 'valid', the test frame to 'test').
# Image and label files written by an earlier run are not removed, so clear
# data/images/images and data/images/labels first if the splits were
# previously exported under different names.
convert_df_into_labels(X_train, subsample='train')
convert_df_into_labels(X_val, subsample='valid')
convert_df_into_labels(X_test, subsample='test')

Количество изображений обучающей выборки: 3837


In [None]:
# Count how many annotation rows each image has in the training split
X_train['image_name'].value_counts()