# データセット取得

Google Open Images Dataset V4から食材画像データを取得する



## Google Driveをマウント

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
base_dir = Path('/content/drive/My Drive/openimages')

In [0]:
import csv
import requests
import pandas as pd
from pathlib import Path, PurePosixPath
from urllib.parse import urlparse
from PIL import Image

## BigQueryのための認証

In [0]:
from google.colab import auth
auth.authenticate_user()

## ラベルの確認

In [0]:
foods = ['Pumpkin', 'Cabbage', 'Cucumber', 'Potato', 'Tomato', 'Carrot', 'Bell pepper', 'Broccoli', 'Milk', 'Radish', 'Egg', 'Mushroom', 'Lemon', 'Cheese']

In [13]:
query = 'SELECT * FROM `bigquery-public-data.open_images.dict`'
labels = pd.read_gbq(query, 'x-fabric-166212', dialect='standard')
labels = labels[labels['label_display_name'].isin(foods)]
labels

Unnamed: 0,label_name,label_display_name
65,/m/02g387,Egg
66,/m/033cnk,Egg
598,/m/04zpv,Milk
1379,/m/09k_b,Lemon
2114,/m/0fj52s,Carrot
2134,/m/01nkt,Cheese
2777,/m/05vtc,Potato
2817,/m/015x5n,Radish
3060,/m/07j87,Tomato
3421,/m/0fbw6,Cabbage


## 14種類の食材画像をダウンロード

In [0]:
def save_image(url, food):
    result = urlparse(url)
    filename = PurePosixPath(result.path).name
    food_dir = Path(base_dir / food)
    food_dir.mkdir(exist_ok=True)
    fileToSave = food_dir / filename
    with open(fileToSave, 'wb') as f:
      try:
        response = requests.get(url, stream=True, timeout=5.0, allow_redirects=False)
        fileType = response.headers['Content-Type']
        if fileType == 'image/jpeg': 
            if response.status_code == 200:
                for block in response.iter_content(1024):
                    if not block:
                        break
                    f.write(block)
      except requests.exceptions.RequestException as e:
        print(e)

- confidence 0.8以上
- Eggはlabel_nameが2つあったが`/m/02g387`を使用

In [0]:
for food in foods:
  food_labels = labels[labels['label_display_name']==food]
  food_label = [label for label in food_labels['label_name']][0]
  query = f'SELECT i.image_id AS image_id,original_url,confidence FROM `bigquery-public-data.open_images.labels` l INNER JOIN `bigquery-public-data.open_images.images` i ON l.image_id = i.image_id WHERE label_name="{food_label}" AND confidence >= 0.8 AND Subset="train"'
  df = pd.read_gbq(query, 'x-fabric-166212', dialect='standard')
  for url in df['original_url']:
    save_image(url, food)

HTTPSConnectionPool(host='c4.staticflickr.com', port=443): Read timed out. (read timeout=5.0)
HTTPSConnectionPool(host='c3.staticflickr.com', port=443): Read timed out. (read timeout=5.0)


## パスとラベルのテーブル作成

original_urlからfile_pathを作成し、日本語の食材名を追加

In [0]:
foods_dict = {'Pumpkin': 'かぼちゃ',
              'Cabbage': 'キャベツ',
              'Cucumber': 'きゅうり',
              'Potato': 'じゃがいも',
              'Tomato': 'トマト',
              'Carrot': 'にんじん',
              'Bell pepper': 'ピーマン',
              'Broccoli': 'ブロッコリー',
              'Milk': '牛乳',
              'Radish': '大根',
              'Egg': '卵',
              'Mushroom': 'きのこ',
              'Lemon': 'レモン',
              'Cheese': 'チーズ'}

In [0]:
food_df = pd.DataFrame()
for food in foods:
  food_labels = labels[labels['label_display_name']==food]
  food_label = [label for label in food_labels['label_name']][0]
  query = f'SELECT i.image_id AS image_id,original_url,confidence FROM `bigquery-public-data.open_images.labels` l INNER JOIN `bigquery-public-data.open_images.images` i ON l.image_id = i.image_id WHERE label_name="{food_label}" AND confidence >= 0.8 AND Subset="train"'
  df = pd.read_gbq(query, 'x-fabric-166212', dialect='standard')
  df['file_path'] = [base_dir / food / PurePosixPath(urlparse(url).path).name for url in df['original_url']]
  df['label'] = foods_dict[food]
  food_df = pd.concat([food_df, df])

In [56]:
food_df.head()

Unnamed: 0,image_id,original_url,confidence,file_path,label
0,c5912d08552d3812,https://farm3.staticflickr.com/5604/1553890865...,1.0,/content/drive/My Drive/openimages/Pumpkin/155...,かぼちゃ
1,12cb8162813d7c2a,https://c3.staticflickr.com/1/1/1173263_182e82...,0.9,/content/drive/My Drive/openimages/Pumpkin/117...,かぼちゃ
2,12cb8162813d7c2a,https://c3.staticflickr.com/1/1/1173263_182e82...,1.0,/content/drive/My Drive/openimages/Pumpkin/117...,かぼちゃ
3,120683e9595ba35c,https://c5.staticflickr.com/2/1365/5115249444_...,0.8,/content/drive/My Drive/openimages/Pumpkin/511...,かぼちゃ
4,120683e9595ba35c,https://c5.staticflickr.com/2/1365/5115249444_...,1.0,/content/drive/My Drive/openimages/Pumpkin/511...,かぼちゃ


壊れている画像が分かるようにフラグを追加

In [0]:
broken = []
for image_path in food_df['file_path']:
  try:
    Image.open(image_path, 'r')
  except:
    broken.append(True)
  else:
    broken.append(False)

In [0]:
food_df['broken'] = broken
food_df = food_df[food_df['broken']==False]

In [63]:
food_df.shape

(14618, 6)

### 各食材の画像数

In [64]:
food_df['label'].value_counts(ascending=True)

牛乳          78
キャベツ       174
大根         206
きゅうり       249
じゃがいも      262
チーズ        311
ピーマン       348
ブロッコリー     353
レモン        486
にんじん       509
卵          727
トマト       1693
かぼちゃ      3596
きのこ       5626
Name: label, dtype: int64

## アノテーションファイル

トレーニングに下記フォーマットのアノテーションが必要になるので作成

```
path/to/image.jpg,x1,y1,x2,y2,class_name
```

### [Open Images Dataset V4](https://storage.googleapis.com/openimages/web/download.html)からBoxesのCSVをダウンロード

In [0]:
bbox = pd.read_csv('/content/drive/My Drive/openimages/train-annotations-bbox.csv')

In [65]:
bbox.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
0,000002b66c9c498e,xclick,/m/01g317,1,0.0125,0.195312,0.148438,0.5875,0,1,0,0,0
1,000002b66c9c498e,xclick,/m/01g317,1,0.025,0.276563,0.714063,0.948438,0,1,0,0,0
2,000002b66c9c498e,xclick,/m/01g317,1,0.151562,0.310937,0.198437,0.590625,1,0,0,0,0
3,000002b66c9c498e,xclick,/m/01g317,1,0.25625,0.429688,0.651563,0.925,1,0,0,0,0
4,000002b66c9c498e,xclick,/m/01g317,1,0.257812,0.346875,0.235938,0.385938,1,0,0,0,0


In [0]:
annotations = pd.merge(food_df, bbox, left_on='image_id', right_on='ImageID')

In [0]:
annotations['x1'] = [round(Image.open(image_path, 'r').width * float(xmin)) for image_path, xmin in zip(annotations['file_path'],annotations['XMin'])]
annotations['x2'] = [round(Image.open(image_path, 'r').width * float(xmax)) for image_path, xmax in zip(annotations['file_path'],annotations['XMax'])]
annotations['y1'] = [round(Image.open(image_path, 'r').height * float(ymin)) for image_path, ymin in zip(annotations['file_path'],annotations['YMin'])]
annotations['y2'] = [round(Image.open(image_path, 'r').height * float(ymax))  for image_path, ymax in zip(annotations['file_path'],annotations['YMax'])]

In [69]:
annotations4csv = annotations[['file_path', 'x1', 'y1', 'x2',  'y2', 'label']]
annotations4csv.head()

Unnamed: 0,file_path,x1,y1,x2,y2,label
0,/content/drive/My Drive/openimages/Pumpkin/155...,232,565,1151,2866,かぼちゃ
1,/content/drive/My Drive/openimages/Pumpkin/155...,67,485,1246,3133,かぼちゃ
2,/content/drive/My Drive/openimages/Pumpkin/155...,1190,376,1397,590,かぼちゃ
3,/content/drive/My Drive/openimages/Pumpkin/155...,1337,397,1530,699,かぼちゃ
4,/content/drive/My Drive/openimages/Pumpkin/155...,1443,597,1699,941,かぼちゃ


### 各食材のアノテーション数

In [70]:
annotations4csv['label'].value_counts(ascending=True)

牛乳          238
キャベツ        377
大根          626
じゃがいも       692
卵           705
ピーマン        826
ブロッコリー      944
きゅうり       1407
にんじん       1438
チーズ        1498
レモン        2025
トマト        5740
きのこ        6776
かぼちゃ      10125
Name: label, dtype: int64

In [0]:
annotations4csv.to_csv(base_dir / 'annotations.csv', index=None, header=None)

## クラスファイル

トレーニングに下記フォーマットのクラスマップが必要になるので作成

```
class_name,id
```


In [0]:
labels = annotations4csv['label'].unique().tolist()
with open(base_dir / 'classes.csv', 'w') as f:
  writer = csv.writer(f)
  for i, label in enumerate(labels):
    writer.writerow([label, i])