In [None]:
# Mount Google Drive at /content/drive; every data path used below lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Print the NVIDIA driver/GPU status of this runtime.
!nvidia-smi

In [None]:
# Install the pinned transformers/tokenizers pair plus the tokenizer and plotting helpers.
!pip install --quiet transformers==4.18.0
!pip install --quiet tokenizers==0.12.1
!pip install --quiet sentencepiece
!pip install --quiet japanize-matplotlib
# Output of this command is discarded via /dev/null.
# NOTE(review): transformers is requested again here without a version pin — confirm it stays at 4.18.0.
!pip install transformers fugashi ipadic >> /dev/null

In [None]:
# Install rinna's japanese-clip package directly from its GitHub repository (no tag/commit is pinned).
! pip install git+https://github.com/rinnakk/japanese-clip.git

In [None]:
# Set up OpenAI CLIP: PyTorch 1.7.1 through conda, CLIP's Python dependencies, then CLIP itself from GitHub.
# NOTE(review): a stock Colab runtime presumably has no `conda` on PATH, in which case the first line fails — confirm.
# NOTE(review): the only `clip` import visible in this notebook is commented out — confirm this cell is still needed.
! conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

In [None]:
# (Removed) This cell repeated, line for line, the conda/pip install commands of the
# previous cell; running them a second time installs nothing new.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

import torch
import transformers

from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import sys
import os
import re
import random

from time import time
from tqdm import tqdm

from contextlib import contextmanager
import lightgbm as lgb

from pathlib import Path

import re
import requests
import unicodedata
import nltk
from nltk.corpus import wordnet
from bs4 import BeautifulSoup
# Download the 'wordnet', 'stopwords' and 'punkt' NLTK resources.
nltk.download(['wordnet', 'stopwords', 'punkt'])

import japanese_clip as ja_clip
from torchvision.io import read_image

import cv2

In [None]:
# Raise pandas' display limits so that up to 200 columns / 300 rows are shown untruncated.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 300

In [None]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook touches with one value.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy's global RNG, and PyTorch's CPU and current-CUDA-device generators.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)


seed_everything(42)

In [None]:
# Root directory holding train.csv / test.csv / sample_submission.csv and the image folders.
# Change this to your own directory; the image folders below are derived from it,
# so they follow automatically.
INPUT = "/content/drive/MyDrive/nishika"
# The trailing "" makes os.path.join end the result with a slash, which is required
# because file names are appended to these prefixes by plain string concatenation.
train_image_path = os.path.join(INPUT, "train", "")
test_image_path = os.path.join(INPUT, "test", "")

In [None]:
# Read the three competition tables from INPUT, in the order train / test / sample submission.
train_df, test_df, submission_df = (
    pd.read_csv(os.path.join(INPUT, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Full path of each row's image: folder prefix + file name, joined by string concatenation.
train_df = train_df.assign(img_path=train_image_path + train_df["odai_photo_file_name"])
test_df = test_df.assign(img_path=test_image_path + test_df["odai_photo_file_name"])

In [None]:
import torchvision.models as models


from torch.utils.data import Dataset, DataLoader

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
from __future__ import unicode_literals
import re
import unicodedata

def unicode_normalize(cls, s):
    """NFKC-normalize only those runs of ``s`` made of characters in ``cls``.

    Args:
        cls: Body of a regex character class (e.g. ``'０-９'``) selecting the
            characters to normalize.
        s: Input string.

    Returns:
        ``s`` with every maximal run of ``cls`` characters NFKC-normalized, all
        other text untouched, and any fullwidth hyphen-minus left over replaced
        by an ASCII hyphen.
    """
    run_pattern = re.compile('([{}]+)'.format(cls))

    pieces = []
    # Splitting on a capturing group keeps the matched runs in the result list,
    # interleaved with the text between them.
    for piece in run_pattern.split(s):
        if run_pattern.match(piece):
            piece = unicodedata.normalize('NFKC', piece)
        pieces.append(piece)

    return re.sub('－', '-', ''.join(pieces))

def remove_extra_spaces(s):
    """Collapse whitespace runs and drop spaces that touch Japanese text.

    Runs of ASCII/ideographic spaces become a single ASCII space. A single
    space is then removed when it sits between two characters of the CJK
    blocks listed below, or between such a character and a Basic Latin one
    (in either order). A space between two Basic Latin characters is kept.

    Args:
        s: Input string.

    Returns:
        The string with the spaces described above removed.
    """
    def remove_space_between(cls1, cls2, s):
        # Substitute until nothing changes: one pass cannot handle overlapping
        # pairs such as "A B C", where B is both a right and a left neighbour.
        pair = re.compile('([{}]) ([{}])'.format(cls1, cls2))
        while True:
            squeezed = pair.sub(r'\1\2', s)
            if squeezed == s:
                return s
            s = squeezed

    basic_latin = '\u0000-\u007F'
    blocks = (
        '\u4E00-\u9FFF'  # CJK UNIFIED IDEOGRAPHS
        '\u3040-\u309F'  # HIRAGANA
        '\u30A0-\u30FF'  # KATAKANA
        '\u3000-\u303F'  # CJK SYMBOLS AND PUNCTUATION
        '\uFF00-\uFFEF'  # HALFWIDTH AND FULLWIDTH FORMS
    )

    s = re.sub('[ 　]+', ' ', s)
    for left_cls, right_cls in ((blocks, blocks), (blocks, basic_latin), (basic_latin, blocks)):
        s = remove_space_between(left_cls, right_cls, s)
    return s

def normalize_neologd(s):
    """Normalize a Japanese string through a fixed sequence of rewrites.

    Steps, in order: strip surrounding whitespace; NFKC-fold fullwidth
    digits/letters and the halfwidth range ｡-ﾟ; unify hyphen, long-vowel mark
    and tilde variants; map ASCII punctuation to fullwidth forms; remove
    extra spaces; fold most fullwidth punctuation back via NFKC; restore
    straight quotes; lowercase.

    NOTE(review): presumably adapted from the mecab-ipadic-NEologd
    normalization recipe — confirm against that reference.

    Args:
        s: Input string.

    Returns:
        The normalized string.
    """
    s = s.strip()
    s = unicode_normalize('０-９Ａ-Ｚａ-ｚ｡-ﾟ', s)

    def maketrans(f, t):
        # Translation table for str.translate: the i-th character of f maps to the i-th character of t.
        return {ord(x): ord(y) for x, y in zip(f, t)}

    s = re.sub('[˗֊‐‑‒–⁃⁻₋−]+', '-', s)  # normalize hyphens
    s = re.sub('[﹣－ｰ—―─━ー]+', 'ー', s)  # normalize choonpus
    s = re.sub('[~∼∾〜〰～]+', '〜', s)  # normalize tildes (modified by Isao Sonobe)
    # Push ASCII punctuation (and halfwidth CJK punctuation) to fullwidth forms so that
    # remove_extra_spaces below treats them as part of the Japanese blocks.
    s = s.translate(
        maketrans('!"#$%&\'()*+,-./:;<=>?@[¥]^_`{|}~｡､･｢｣',
              '！”＃＄％＆’（）＊＋，－．／：；＜＝＞？＠［￥］＾＿｀｛｜｝〜。、・「」'))

    s = remove_extra_spaces(s)
    # Fold the listed fullwidth punctuation back through NFKC; characters absent from the class stay as they are.
    s = unicode_normalize('！”＃＄％＆’（）＊＋，－．／：；＜＞？＠［￥］＾＿｀｛｜｝〜', s)  # keep ＝,・,「,」
    # Curly quotes are not changed by the step above; map them to straight quotes explicitly.
    s = re.sub('[’]', '\'', s)
    s = re.sub('[”]', '"', s)
    s = s.lower()
    return s

def normalize_text(text):
    """Return ``text`` after normalization by ``normalize_neologd``.

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    normalized = normalize_neologd(text)
    return normalized

In [None]:
#import clip

# Load the "rinna/japanese-clip-vit-b-16" checkpoint onto `device`, caching downloads under /tmp/japanese_clip.
# ja_clip.load returns a pair: the model, and `preprocess`, which ImageFolder.__getitem__ applies to each RGB image.
model, preprocess = ja_clip.load("rinna/japanese-clip-vit-b-16", cache_dir="/tmp/japanese_clip", device=device)
# Tokenizer provided by the same package.
tokenizer = ja_clip.load_tokenizer()


In [None]:
def _get_img_paths(img_dir):
    """List the ``.jpg`` entries directly inside ``img_dir``.

    The suffix comparison is case-sensitive and the directory is not
    searched recursively.

    Args:
        img_dir: Directory to scan (``str`` or ``Path``).

    Returns:
        Sorted list of entry paths as strings.
    """
    wanted_suffixes = (".jpg",)
    return sorted(
        str(entry)
        for entry in Path(img_dir).iterdir()
        if entry.suffix in wanted_suffixes
    )


class ImageFolder(Dataset):
    """Dataset over the ``.jpg`` files of one directory, yielding transformed images.

    Args:
        img_dir: Directory whose image paths are collected by ``_get_img_paths``.
        transform: Optional callable applied to each RGB PIL image. When it is
            omitted, the module-level ``preprocess`` is used, which keeps
            ``ImageFolder(img_dir)`` working exactly as before.

    Each item is a dict ``{"image": <transformed image>, "path": <file path>}``.
    """

    def __init__(self, img_dir, transform=None):
        # Collect the list of image file paths.
        self.img_paths = _get_img_paths(img_dir)
        self.transform = transform

    def __getitem__(self, index):
        path = self.img_paths[index]
        # Resolve the fallback at read time so `preprocess` only has to exist
        # once items are actually fetched, as in the original implementation.
        transform = self.transform if self.transform is not None else preprocess
        # Image.open keeps the file open until the pixel data is loaded; the
        # context manager closes it deterministically after the RGB conversion.
        with Image.open(path) as raw_image:
            img = transform(raw_image.convert("RGB"))

        return {"image": img, "path": path}

    def __len__(self):
        return len(self.img_paths)

In [None]:
# Create the Dataset over the training images. The configured train_image_path is used
# (instead of repeating the literal directory) so the dataset and train_df["img_path"]
# always point at the same folder.
dstrain = ImageFolder(train_image_path)

In [None]:
import glob
import itertools


def get_images_features(dataset, batch_size=16):
    """Encode every image of ``dataset`` with the module-level ``model``.

    Args:
        dataset: Dataset whose items are dicts holding the model input under
            the ``"image"`` key (as produced by ``ImageFolder``).
        batch_size: Number of images per forward pass.

    Returns:
        NumPy array with one row of ``model.get_image_features`` output per
        dataset item, in dataset order.
    """
    image_features = []
    # shuffle=False (stated explicitly) keeps row i of the result paired with dataset item i.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    with torch.no_grad():
        for batch in tqdm(loader):
            inputs = batch["image"].to(device)
            outputs = model.get_image_features(inputs)

            # Move each batch off the accelerator right away instead of keeping
            # every output in device memory until the final concatenation.
            image_features.append(outputs.cpu())

    return torch.cat(image_features).numpy()

In [None]:
# Encode every training image with get_images_features.
train_image_features = get_images_features(dstrain)

In [None]:
# Wrap the feature array in a DataFrame whose columns are named clip_image_<i>.
# reset_index() adds an "index" column holding the row position; a later cell drops it again.
# (The previous rename of a column called "columns" matched nothing, so no file-name
# column was ever created here; it has been removed to avoid suggesting otherwise.)
image_feature = pd.DataFrame(train_image_features).add_prefix("clip_image_").reset_index()

In [None]:
# Discard the helper "index" column so that only the feature columns remain.
image_feature = image_feature.drop(columns=["index"])

In [None]:
# Join the image features onto the train data by file name.
# Feature rows follow dstrain.img_paths (a sorted directory listing), which is not
# guaranteed to match the row order of train.csv, so a positional concat could attach
# one image's features to another row. Rows whose image file was not found get NaN features.
assert len(image_feature) == len(dstrain.img_paths), "feature rows must match the image list"
train_image_feature_keyed = image_feature.copy()
train_image_feature_keyed["odai_photo_file_name"] = [os.path.basename(p) for p in dstrain.img_paths]
train_df = train_df.merge(
    train_image_feature_keyed,
    on="odai_photo_file_name",
    how="left",  # keep every train row, in its original order
    validate="many_to_one",  # each file name carries exactly one feature row
)

In [None]:
# Sanity check: (rows, columns) of train_df after the image features were added.
train_df.shape

In [None]:
# Create the Dataset over the test images. The configured test_image_path is used
# (instead of repeating the literal directory) so the dataset and test_df["img_path"]
# always point at the same folder.
dstest = ImageFolder(test_image_path)

In [None]:
# Encode every test image with get_images_features.
test_image_features = get_images_features(dstest)

In [None]:
# Wrap the feature array in a DataFrame whose columns are named clip_image_<i>.
# reset_index() adds an "index" column holding the row position; a later cell drops it again.
# (The previous rename of a column called "columns" matched nothing, so no file-name
# column was ever created here; it has been removed to avoid suggesting otherwise.)
image_feature = pd.DataFrame(test_image_features).add_prefix("clip_image_").reset_index()

In [None]:
# Inspect the test feature frame while it still carries the "index" column.
print(image_feature)

In [None]:
# Discard the helper "index" column so that only the feature columns remain.
image_feature = image_feature.drop(columns=["index"])

In [None]:
# Inspect the test feature frame again after the "index" column was dropped.
print(image_feature)

In [None]:
# Join the image features onto the test data by file name.
# Feature rows follow dstest.img_paths (a sorted directory listing), which is not
# guaranteed to match the row order of test.csv, so a positional concat could attach
# one image's features to another row. Rows whose image file was not found get NaN features.
assert len(image_feature) == len(dstest.img_paths), "feature rows must match the image list"
test_image_feature_keyed = image_feature.copy()
test_image_feature_keyed["odai_photo_file_name"] = [os.path.basename(p) for p in dstest.img_paths]
test_df = test_df.merge(
    test_image_feature_keyed,
    on="odai_photo_file_name",
    how="left",  # keep every test row, in its original order
    validate="many_to_one",  # each file name carries exactly one feature row
)

In [None]:
# Sanity check: (rows, columns) of test_df after the image features were added.
test_df.shape

(6000, 516)

In [None]:
## Fill missing values of the text column with the literal string 'NaN'
train_df = train_df.fillna({"text": 'NaN'})
test_df = test_df.fillna({"text": 'NaN'})

In [None]:
import tensorflow as tf

def get_texts_features(dataset, batch_size=16):
    """Encode every text of ``dataset`` with the module-level ``model``.

    Args:
        dataset: Iterable of strings (e.g. a pandas Series or a list).
        batch_size: Number of texts per forward pass.

    Returns:
        NumPy array with one row of ``model.get_text_features`` output per
        input text, in input order.
    """
    # Materialize as a plain list: DataLoader fetches items by position 0..n-1,
    # which on a pandas Series is a label lookup and only works for a default index.
    texts = [normalize_text(text) for text in dataset]
    text_labels = []

    with torch.no_grad():
        for labels in tqdm(DataLoader(texts, batch_size=batch_size, shuffle=False)):
            # Pass the already-loaded module-level tokenizer; without it,
            # ja_clip.tokenize loads a tokenizer again on every call.
            inputs = ja_clip.tokenize(texts=list(labels), tokenizer=tokenizer, device=device)
            outputs = model.get_text_features(**inputs)
            # Move each batch off the accelerator right away instead of keeping
            # every output in device memory until the final concatenation.
            text_labels.append(outputs.cpu())

    return torch.cat(text_labels).numpy()

In [None]:
# Encode the text column of the train data with get_texts_features.
train_labels = get_texts_features(train_df["text"])

In [None]:
# Encode the text column of the test data with get_texts_features.
test_labels = get_texts_features(test_df["text"])

In [None]:
# Inspect the array returned for the training texts.
print(train_labels)

In [None]:
## Text features
# Wrap each embedding array in a frame whose column names start with "clip-text",
# then append those columns to the matching table by row position.
features_text_train_df, features_text_test_df = (
    pd.DataFrame(embeddings).add_prefix("clip-text")
    for embeddings in (train_labels, test_labels)
)

train_df = pd.concat((train_df, features_text_train_df), axis="columns")
test_df = pd.concat((test_df, features_text_test_df), axis="columns")

In [None]:
# Report the final (rows, columns) of both tables, one per line.
print(train_df.shape, test_df.shape, sep="\n")

In [None]:
# Persist the train table together with its CLIP image/text features.
train_embedding_csv = '/content/drive/MyDrive/nishika/embeded/embedding_train_rinna_tune_clip.csv'
# to_csv does not create missing parent directories; make sure the folder exists so the
# save cannot fail after the expensive embedding run.
os.makedirs(os.path.dirname(train_embedding_csv), exist_ok=True)
train_df.to_csv(train_embedding_csv)

In [None]:
# Persist the test table together with its CLIP image/text features.
test_embedding_csv = '/content/drive/MyDrive/nishika/embeded/embedding_test_rinna_tune_clip.csv'
# to_csv does not create missing parent directories; make sure the folder exists so the
# save cannot fail after the expensive embedding run.
os.makedirs(os.path.dirname(test_embedding_csv), exist_ok=True)
test_df.to_csv(test_embedding_csv)