# Предобработка изображений

## Подключение библиотек

In [1]:
import pandas as pd
import numpy as np
from PIL import Image

## Загрузка данных

In [2]:
train = pd.read_csv("../data/clouds/train.csv.gz")
print(train.head())

          Image_Label                                      EncodedPixels
0    0011165.jpg_Fish  264918 937 266318 937 267718 937 269118 937 27...
1  0011165.jpg_Flower  1355565 1002 1356965 1002 1358365 1002 1359765...
2  0011165.jpg_Gravel                                                NaN
3   0011165.jpg_Sugar                                                NaN
4    002be4f.jpg_Fish  233813 878 235213 878 236613 878 238010 881 23...


## Очистка данных

In [3]:
train["Image"] = train["Image_Label"].str.split("_").str[0]
train["Label"] = train["Image_Label"].str.split("_").str[1]
train.drop(labels=["Image_Label"], axis=1, inplace=True)
print(train.head())

                                       EncodedPixels        Image   Label
0  264918 937 266318 937 267718 937 269118 937 27...  0011165.jpg    Fish
1  1355565 1002 1356965 1002 1358365 1002 1359765...  0011165.jpg  Flower
2                                                NaN  0011165.jpg  Gravel
3                                                NaN  0011165.jpg   Sugar
4  233813 878 235213 878 236613 878 238010 881 23...  002be4f.jpg    Fish


In [4]:
data = pd.DataFrame({"Image": train["Image"].unique()})
for label in train["Label"].unique():
    data[label] = pd.Series(train[train["Label"] == label]["EncodedPixels"].values)
print(data.head())

         Image                                               Fish  \
0  0011165.jpg  264918 937 266318 937 267718 937 269118 937 27...   
1  002be4f.jpg  233813 878 235213 878 236613 878 238010 881 23...   
2  0031ae9.jpg  3510 690 4910 690 6310 690 7710 690 9110 690 1...   
3  0035239.jpg                                                NaN   
4  003994e.jpg  2367966 18 2367985 2 2367993 8 2368002 62 2369...   

                                              Flower  \
0  1355565 1002 1356965 1002 1358365 1002 1359765...   
1  1339279 519 1340679 519 1342079 519 1343479 51...   
2  2047 703 3447 703 4847 703 6247 703 7647 703 9...   
3  100812 462 102212 462 103612 462 105012 462 10...   
4                                                NaN   

                                              Gravel  \
0                                                NaN   
1                                                NaN   
2                                                NaN   
3  65400 380 66800 380 6

## Обработка изображений

0011165.jpg имеет размер 525 * 350 = 183750 пикселей

In [5]:
imgdata = np.array([np.zeros(183750, dtype="uint8") for _ in range(len(data))])

for i, img in enumerate(data["Image"].unique()):
    imgdata[i] = np.array(Image.open("../data/clouds/train_images_small/" + img).convert("L"),
                          dtype="uint8").reshape(1, -1)[0]
imgdata = pd.DataFrame(imgdata)
print(imgdata.head())

   0       1       2       3       4       5       6       7       8       \
0       0       0       0       0       0       0       0       0       0   
1      72      43      32      39      49      69      71      47      25   
2      97      98     100     102     105     107     109     110     111   
3      88      87      90      94      94      89      87      87      82   
4      18      19      21      21      21      20      18      17      15   

   9       ...  183740  183741  183742  183743  183744  183745  183746  \
0       0  ...      83      65      52      70      65     102     113   
1      22  ...      99     113     110      59      27      38      58   
2     112  ...       0       0       0       0       0       0       0   
3      82  ...     182     188     170     150     146     166     164   
4      16  ...      67      75     112     157     141     125     137   

   183747  183748  183749  
0     102      96     117  
1      83     111     134  
2       

In [6]:
for column in data.columns:
    imgdata[column] = data[column]
del data
print(imgdata.head)

<bound method NDFrame.head of         0    1    2    3    4    5    6    7    8    9  ...  183745  183746  \
0       0    0    0    0    0    0    0    0    0    0  ...     102     113   
1      72   43   32   39   49   69   71   47   25   22  ...      38      58   
2      97   98  100  102  105  107  109  110  111  112  ...       0       0   
3      88   87   90   94   94   89   87   87   82   82  ...     166     164   
4      18   19   21   21   21   20   18   17   15   16  ...     125     137   
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...     ...     ...   
5541    9   35   51   47   50   61   55   37   24   63  ...     177     171   
5542  184  182  182  187  197  209  218  224  224  227  ...      78      76   
5543   23   38   59   75   69   43   33   46   35   15  ...     162     172   
5544   37   14   47   45   44   42  111   50   35   40  ...     164     159   
5545  125  125  125  125  125  125  125  125  126  124  ...     144     148   

      183747  183748 

## Сохранить данные в HDF5

In [7]:
imgdata.to_hdf(
    "../data/out/clouds.data.h5",
    "data",
    complib="zlib",
    complevel=9,
    mode="w",
)