# zero-shotのCLIPでやってみる

In [1]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/rinnakk/japanese-clip.git

Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Collecting git+https://github.com/rinnakk/japanese-clip.git
  Cloning https://github.com/rinnakk/japanese-clip.git to /tmp/pip-req-build-mpuj9dsl
  Running command git clone --filter=blob:none --quiet https://github.com/rinnakk/japanese-clip.git /tmp/pip-req-build-mpuj9dsl
  Resolved https://github.com/rinnakk/japanese-clip.git to commit 374e08d48b9ba72fcaf7459af4f8c93caffd6fb3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers>=4.15.0 (from japanese-clip==0.2.0)
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[

In [2]:
from PIL import Image
import pandas as pd
import numpy as np
import glob
import os
import copy
import tqdm

import torch

import japanese_clip as ja_clip

## 各設定

In [3]:
# Directory holding this project's code, relative to the notebook's initial
# working directory (presumably a Google Drive mount in Colab -- confirm).
PROJECT_CODES_DIR = "./drive/MyDrive/TECHNOPRO_food_package/codes"
# Test images, relative to PROJECT_CODES_DIR.
TEST_IMAGE_DIR = "../Dataset/test"

# Keep this cell re-runnable: the chdir target is a relative path, so running
# os.chdir() a second time from inside the project directory would raise.
# Only change directory when the target is reachable from the current cwd, and
# otherwise require that the test directory is already reachable.
if os.path.isdir(PROJECT_CODES_DIR):
    os.chdir(PROJECT_CODES_DIR)
elif not os.path.isdir(TEST_IMAGE_DIR):
    raise FileNotFoundError(
        f"Neither {PROJECT_CODES_DIR!r} nor {TEST_IMAGE_DIR!r} is reachable from {os.getcwd()!r}"
    )

# Sorted so that the prediction order is deterministic across runs.
test_image_paths = sorted(glob.glob(f"{TEST_IMAGE_DIR}/*"))
if not test_image_paths:
    # Fail here with a clear message instead of later with an IndexError / empty CSV.
    raise FileNotFoundError(f"No test images found under {TEST_IMAGE_DIR!r}")

# Text prompts for zero-shot classification: index 0 = "beverage", index 1 = "food".
# Later cells read the class probabilities by these positions.
descriptions = ["飲料", "食料"]

In [4]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained Japanese CLIP checkpoint together with its image preprocessing
# callable, then switch the model to inference mode on the chosen device.
model, preprocess = ja_clip.load("rinna/japanese-clip-vit-b-16", device=device)
model.to(device)
model.eval()

# Tokenizer used for the text prompts. Loading it once and passing it explicitly
# avoids having ja_clip.tokenize() load a tokenizer on every call.
tokenizer = ja_clip.load_tokenizer()

# Tokenize the class descriptions once; the result is reused for every image.
encodings = ja_clip.tokenize(
    texts=descriptions, max_seq_len=77, device=device, tokenizer=tokenizer
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.24k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/787M [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/806k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/153 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## test

In [5]:
# Sanity check: classify a single test image before running over the whole test set.
# The context manager closes the underlying file once the tensor has been built.
with Image.open(test_image_paths[10]) as pil_image:
    image = preprocess(pil_image).unsqueeze(0).to(device)

with torch.no_grad():
    # Image embedding
    image_features = model.get_image_features(image)
    # Text embeddings for the class descriptions
    text_features = model.get_text_features(**encodings)
    # L2-normalise both embeddings so the dot product is a cosine similarity, as in
    # standard CLIP zero-shot classification. Without this the x100 logits can saturate
    # the softmax to exactly 0/1. This is a no-op if the model already returns unit vectors.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    # Class probabilities over `descriptions`
    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

# Column order follows `descriptions`: index 0 is read as beverage, index 1 as food.
beverage = text_probs[0, 0].item()
food = text_probs[0, 1].item()

# Probability of the winning class (.item() already yields plain floats, so no copy is needed).
ans = beverage if beverage > food else food

print(ans)

1.0


In [6]:
# Score every test image with the zero-shot classifier and collect
# (file name, P(food)) pairs for the submission file.
file_names = []
predictions = []

# The text prompts do not change between images, so embed them once up front
# instead of re-running the text encoder for every image.
with torch.no_grad():
    text_features = model.get_text_features(**encodings)
    # L2-normalise so the dot product below is a cosine similarity (standard CLIP
    # zero-shot scoring); a no-op if the model already returns unit vectors.
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

for test_image_path in tqdm.tqdm(test_image_paths):
    file_name = os.path.basename(test_image_path)
    file_names.append(file_name)

    # Close each image file as soon as its tensor has been built.
    with Image.open(test_image_path) as pil_image:
        image = preprocess(pil_image).unsqueeze(0).to(device)

    with torch.no_grad():
        # Image embedding, normalised the same way as the text embeddings
        image_features = model.get_image_features(image)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        # Class probabilities over `descriptions`
        text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    # The submitted score is P(food), i.e. index 1 of `descriptions`. With a two-class
    # softmax, `1 - P(beverage)` is the same quantity, so both branches of the previous
    # if/else produced P(food); reading it directly avoids the precision loss of
    # `1 - beverage` when the beverage probability is very close to 1.
    food = text_probs[0, 1].item()
    predictions.append(food)

# Column labels are placeholders; the CSV is written without a header.
d = {'0': file_names, '1': predictions}
dst_df = pd.DataFrame(data=d)

100%|██████████| 2188/2188 [02:02<00:00, 17.89it/s]


In [7]:
# Write the predictions as a header-less CSV without the index column.
output_csv_path = "../output/clipbit_zero_shot.csv"
# Create the output directory first so to_csv() does not fail when it is missing.
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
dst_df.to_csv(output_csv_path, index=False, header=False)