<a href="https://colab.research.google.com/github/nicolas-dufour/rakuten_colour_extraction/blob/master/rakuten_challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Load Git folder
# Clones the private project repository into the current working directory,
# prompting for the GitHub password so it is never stored in the notebook.
import os
import subprocess
from getpass import getpass
import urllib.parse  # explicit submodule import: a bare `import urllib` does not guarantee `urllib.parse` is loaded
repo_user = 'nicolas-dufour'
user = 'nicolas-dufour'
password = getpass('Password: ')
repo_name = 'rakuten_colour_extraction'
# your password is converted into url format
# safe='' also escapes '/', which would otherwise be read as a URL path separator
password = urllib.parse.quote(password, safe='')
cmd_string = 'https://{0}:{1}@github.com/{2}/{3}.git'.format(user, password, repo_user, repo_name)
# Pass an argument list instead of a shell string so the secret is never parsed by a shell
subprocess.run(['git', 'clone', cmd_string], check=False)
cmd_string, password = "", "" # removing the password from the variable
# NOTE(review): the authenticated URL is presumably still recorded as the clone's
# `origin` remote (the later `git push` cell relies on it) — confirm this is acceptable.
# Bad password fails silently so make sure the repo was copied
assert os.path.exists(f"/content/{repo_name}"), "Incorrect Password or Repo Not Found, please try again"

Password: ··········


In [2]:
%cd rakuten_colour_extraction/

/content/rakuten_colour_extraction


In [3]:
# Google drive connection
# Mounts the user's Google Drive under /content/drive; later cells read the
# download cookies from it and write checkpoints and submissions to it.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Git Ignore setup
!echo 'lightning_logs' >> .gitignore
!echo 'wandb' >> .gitignore

In [None]:
# Show the working-tree state before committing.
!git status

On branch master
Your branch is up to date with 'origin/master'.

nothing to commit, working tree clean


In [32]:
# Save to git
!git config --global user.email "nicolas.dufourn@gmail.com"
!git config --global user.name "Nicolas DUFOUR"
!git add --all
!git commit -m "Fixed images model package"
!git push --force

[master 4a1ebc4] Fixed images model package
 7 files changed, 3 insertions(+)
Counting objects: 13, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (13/13), done.
Writing objects: 100% (13/13), 6.33 KiB | 6.33 MiB/s, done.
Total 13 (delta 4), reused 0 (delta 0)
remote: Resolving deltas: 100% (4/4), completed with 4 local objects.[K
To https://github.com/nicolas-dufour/rakuten_colour_extraction.git
   da28d11..4a1ebc4  master -> master


In [4]:
%%capture
!pip install transformers
!pip install pytorch-lightning
!pip install wandb
!pip install git+https://github.com/rwightman/pytorch-image-models
!pip install git+https://github.com/ildoonet/pytorch-randaugment

In [30]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from skimage import io
import numpy as np
import ast
from tqdm.notebook import tqdm

import wandb

import timm
from timm.data import create_transform

from RandAugment import RandAugment

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.metrics.classification import Accuracy, F1

from transformers import BertTokenizer, BertModel

from data.bert import Bert_dataset
from data.images import ImageDataset, TestImageDataset
from modeling.text import Bert_classifier, train
from modeling.images import ResNet18, NFNet, Deit
from sklearn.preprocessing import MultiLabelBinarizer

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Directory (with trailing slash) holding the challenge CSV files and the extracted
# archive; other cells build file paths by string concatenation onto it.
data_path = '/content/rakuten_colour_extraction/data_files/'

# Data Loading

In [11]:
!mkdir data_files

In [12]:
!echo 'data_files' >> .gitignore

In [13]:
# Download the training features CSV, authenticating with the cookie file stored on Drive.
!wget  https://challengedata.ens.fr/participants/challenges/59/download/x-train --load-cookies /content/drive/MyDrive/rakuten_challenge/ens.fr_cookies.txt -O /content/rakuten_colour_extraction/data_files/X_train.csv

--2021-03-08 17:25:40--  https://challengedata.ens.fr/participants/challenges/59/download/x-train
Resolving challengedata.ens.fr (challengedata.ens.fr)... 129.199.99.143
Connecting to challengedata.ens.fr (challengedata.ens.fr)|129.199.99.143|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 329723647 (314M) [application/octet-stream]
Saving to: ‘/content/rakuten_colour_extraction/data_files/X_train.csv’


2021-03-08 17:26:12 (10.5 MB/s) - ‘/content/rakuten_colour_extraction/data_files/X_train.csv’ saved [329723647/329723647]



In [14]:
# Download the training labels CSV, authenticating with the cookie file stored on Drive.
!wget  https://challengedata.ens.fr/participants/challenges/59/download/y-train --load-cookies /content/drive/MyDrive/rakuten_challenge/ens.fr_cookies.txt -O /content/rakuten_colour_extraction/data_files/y_train.csv

--2021-03-08 17:26:12--  https://challengedata.ens.fr/participants/challenges/59/download/y-train
Resolving challengedata.ens.fr (challengedata.ens.fr)... 129.199.99.143
Connecting to challengedata.ens.fr (challengedata.ens.fr)|129.199.99.143|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5592658 (5.3M) [application/octet-stream]
Saving to: ‘/content/rakuten_colour_extraction/data_files/y_train.csv’


2021-03-08 17:26:16 (2.90 MB/s) - ‘/content/rakuten_colour_extraction/data_files/y_train.csv’ saved [5592658/5592658]



In [15]:
# Download the test features CSV, authenticating with the cookie file stored on Drive.
!wget  https://challengedata.ens.fr/participants/challenges/59/download/x-test --load-cookies /content/drive/MyDrive/rakuten_challenge/ens.fr_cookies.txt -O /content/rakuten_colour_extraction/data_files/X_test.csv

--2021-03-08 17:26:16--  https://challengedata.ens.fr/participants/challenges/59/download/x-test
Resolving challengedata.ens.fr (challengedata.ens.fr)... 129.199.99.143
Connecting to challengedata.ens.fr (challengedata.ens.fr)|129.199.99.143|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 57856660 (55M) [application/octet-stream]
Saving to: ‘/content/rakuten_colour_extraction/data_files/X_test.csv’


2021-03-08 17:26:23 (8.77 MB/s) - ‘/content/rakuten_colour_extraction/data_files/X_test.csv’ saved [57856660/57856660]



In [16]:
# Download the supplementary archive to the parent directory, i.e. outside the git
# working tree; it is extracted and then deleted by the following cells.
!wget  https://challengedata.ens.fr/participants/challenges/59/download/supplementary-files --load-cookies /content/drive/MyDrive/rakuten_challenge/ens.fr_cookies.txt -O ../supplementary-files

--2021-03-08 17:26:23--  https://challengedata.ens.fr/participants/challenges/59/download/supplementary-files
Resolving challengedata.ens.fr (challengedata.ens.fr)... 129.199.99.143
Connecting to challengedata.ens.fr (challengedata.ens.fr)|129.199.99.143|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2924854699 (2.7G) [application/octet-stream]
Saving to: ‘../supplementary-files’


2021-03-08 17:30:44 (10.8 MB/s) - ‘../supplementary-files’ saved [2924854699/2924854699]



In [17]:
# Extract the gzip-compressed archive into the data directory; the --checkpoint
# option makes tar print a progress dot periodically.
!tar -zxf ../supplementary-files -C /content/rakuten_colour_extraction/data_files --checkpoint=.10000

................................

In [18]:
!rm ../supplementary-files

# Data Processing

In [19]:
# Preview the training features, using the first CSV column as the index.
pd.read_csv(f'{data_path}X_train.csv', index_col=0)

Unnamed: 0,image_file_name,item_name,item_caption
0,278003_10389968_1.jpg,三協アルミ M.シェード2 梁置きタイプ 片側支持 5818 H30 ポリカーボネート屋根　...,商品番号19235601メーカー三協アルミサイズ幅 1931.0mm × 奥行き 5853....
1,220810_10010506_1.jpg,【40%OFF SALE/セール】30代〜40代 ファッション コーディネート 太サッシュ ...,太サッシュベルトで存在感アップ 柔軟性に優れた馬革を使用 幅が太めで存在感◎ キレイな形が出...
2,207456_10045549_1.jpg,下駄 桐 日本製 女性用 TONE 鼻緒巾が広め 黒塗り台 適合足サイズ 23〜24.5cm...,項目 桐の下駄 ※特別価格にて浴衣、半幅帯（浴衣帯）、巾着等も同時出品中です！ サイズ 下駄...
3,346541_10000214_1.jpg,＼期間限定【1000円OFF】クーポン 発行中／ シューズボックス 幅60 奥行33 15足...,■商品説明 ルーバーシューズボックス60幅のシングルタイプが登場。お部屋に合わせて色、サイズ...
4,240426_10024071_1.jpg,ポスト 郵便ポスト 郵便受け 集合住宅用ポスト 可変式プッシュ錠集合郵便受箱 PKS-M15...,集合住宅用ポスト 可変式プッシュ錠集合郵便受箱 PKS-M15-3 1列3段 暗証番号を自由...
...,...,...,...
212115,332136_10000371_1.jpg,サボテン おしゃれな寄せ植え アニマルカクタス ジラフ アニマルフィギア付き プレゼントに,
212116,286000_12212768_1.jpg,【代金引換不可】【アンドモア】 二つ折り財布 財布 小銭入れ 札入れ カード入れ ウォレット...,【ご注意】※メーカー直送のため代金引換はお受けできません。※代金引換でのご注文はキャンセルさ...
212117,254241_10307285_1.jpg,Love Sam　コットン　フレアスカート XS オフベージュ,商品名Love Sam　コットン　フレアスカート カラーオフベージュ サイズ ( cm )サ...
212118,259814_10002299_1.jpg,壁面収納 リビング 薄型 【送料無料】『耐震機能付リビング・書斎収納SELECT〔セレクト〕...,【代引不可商品です】 こちらの商品はメーカー直送品のため代金引換はご利用いただけません。 お...


In [20]:
# Image file name of every training item, indexed like the training CSV.
x_train_csv = f'{data_path}X_train.csv'
image_paths = pd.read_csv(x_train_csv, index_col=0).loc[:, 'image_file_name']
del x_train_csv

In [21]:
# The `color_tags` column is parsed with ast.literal_eval, turning each cell's
# text into the Python object it spells out (lists of colour names, per the output).
labels = (
    pd.read_csv(f'{data_path}y_train.csv', index_col=0)['color_tags']
    .apply(ast.literal_eval)
)
labels

0                       [Silver, Grey, Black]
1                              [Brown, Black]
2                              [White, Black]
3                       [Beige, Brown, Black]
4                                    [Silver]
                         ...                 
212115                                [Brown]
212116    [Red, Black, Multiple Colors, Navy]
212117                                [Beige]
212118                         [White, Brown]
212119                           [Blue, Navy]
Name: color_tags, Length: 212120, dtype: object

In [22]:
# Multi-hot encode the tag lists: learn the set of classes first, then encode.
mlb = MultiLabelBinarizer()
mlb.fit(labels)
onehot_labels = mlb.transform(labels)
# Column j of `onehot_labels` corresponds to classes_correp[j].
classes_correp = mlb.classes_

In [23]:
# Display the class names in the order used by the columns of `onehot_labels`.
classes_correp

array(['Beige', 'Black', 'Blue', 'Brown', 'Burgundy', 'Gold', 'Green',
       'Grey', 'Khaki', 'Multiple Colors', 'Navy', 'Orange', 'Pink',
       'Purple', 'Red', 'Silver', 'Transparent', 'White', 'Yellow'],
      dtype=object)

In [24]:
# Number of distinct colour classes (length of the 1-D class-name array).
n_classes = classes_correp.shape[0]
n_classes

19

In [25]:
# Dataset over the training images: file names, the image directory and the
# multi-hot label matrix are passed positionally, in that order.
images_dir = f'{data_path}images/'
image_dataset = ImageDataset(image_paths, images_dir, onehot_labels)
del images_dir

# BERT FineTuning

In [None]:
# Fine-tune the BERT text classifier.
# All objects in this cell carry a `bert_` prefix: the image pipeline below defines
# its own `train_loader` and `model`, and sharing those names would make
# `trainer.fit(model, train_loader, ...)` depend on which cell ran last.
MAX_LEN = 200          # passed to Bert_dataset (presumably the max token length — confirm)
LEARNING_RATE = 1e-05  # Adam learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
# NOTE(review): these CSVs live in a different Drive folder than the files downloaded
# into `data_path` above — confirm they are the same data.
X_path = '/content/drive/MyDrive/rp/X_train_12tkObq.csv'
y_path = '/content/drive/MyDrive/rp/y_train_Q9n2dCu.csv'
bert_train_dataset = Bert_dataset(X_path, y_path, MAX_LEN)
bert_train_loader = DataLoader(bert_train_dataset, batch_size=8, shuffle=True, num_workers=0)
# One output per entry of the dataset's colour dictionary.
bert_model = Bert_classifier(len(bert_train_dataset.colors_dict))
bert_model.to(device)
bert_optimizer = torch.optim.Adam(params=bert_model.parameters(), lr=LEARNING_RATE)
# First argument is presumably the number of epochs — confirm against modeling.text.train.
train(1, bert_train_loader, device, bert_model, bert_optimizer)

# Image Models

In [26]:
# Build the train/validation loaders for the image models.
import copy  # stdlib; gives each split its own dataset object (see below)

# Reproducible 90% / 10% random split of the training images.
np.random.seed(42)
idx = np.random.permutation(len(image_dataset))
sep = int(len(image_dataset)*0.9)
idx_train, idx_val = idx[:sep], idx[sep:]

# Training pipeline: RandAugment first, then resize / tensor conversion / normalisation.
# NOTE(review): the normalisation constants look like the commonly used CIFAR-10
# statistics rather than ImageNet ones — confirm this is intended.
train_transform = transforms.Compose([
    RandAugment(3, 10),
    transforms.Resize((256,256)),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# train_transform = create_transform(
#             input_size=300,
#             is_training=True,
#             auto_augment='rand-m9-mstd0.5-inc1',
#             interpolation='bilinear'
#         )

# Validation pipeline: same preprocessing without augmentation.
val_transform = transforms.Compose([
    transforms.Resize((256,256)),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Both subsets used to wrap the *same* `image_dataset` object, so assigning
# `val_set.dataset.transform` overwrote the training transform and RandAugment was
# never applied. Shallow copies share the underlying data but carry their own
# `transform` attribute.
# Assumes ImageDataset reads `self.transform` when an item is fetched, exactly as the
# previous attribute assignment already assumed — TODO confirm.
train_images = copy.copy(image_dataset)
train_images.transform = train_transform
val_images = copy.copy(image_dataset)
val_images.transform = val_transform

train_set = torch.utils.data.Subset(train_images, idx_train)
val_set = torch.utils.data.Subset(val_images, idx_val)

train_loader = DataLoader(train_set,
                          shuffle=True,
                          num_workers=8,
                          batch_size=32)
val_loader = DataLoader(val_set,
                        shuffle=False,
                        num_workers=8,
                        batch_size=32)

In [27]:
# Start a Weights & Biases run in the project used for this challenge.
wandb.init(project='Rakuten-colour-classification')

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [31]:
# Log training metrics to Weights & Biases.
logger = WandbLogger()

# Checkpoints are written to Drive; `val_f1_score` is the monitored metric and
# higher values are treated as better (mode='max').
checkpoint_callback = ModelCheckpoint(
    dirpath='/content/drive/MyDrive/rakuten_challenge/models',
    filename='NFNET-RandAug-all-data-{epoch:02d}-{val_f1_score:.2f}',
    monitor='val_f1_score',
    mode='max',
)

# Single-GPU trainer wired to the logger and the checkpoint callback.
trainer = pl.Trainer(gpus=1, logger=logger, callbacks=[checkpoint_callback])

# NOTE(review): the saved output of this cell ends in a NameError raised after the
# Trainer banner was printed — confirm the model constructs on a fresh kernel.
model = NFNet(lr=1e-4)


GPU available: True, used: True
TPU available: None, using: 0 TPU cores


NameError: ignored

In [None]:
# Run the trainer's learning-rate finder on the training loader.
lr_finder = trainer.tuner.lr_find(model,train_loader)

# Results can be found in
print(lr_finder.results)

# Plot with
fig = lr_finder.plot(suggest=True)
fig.show()

# Learning rate suggested by the finder.
print(lr_finder.suggestion())


In [None]:
# Train on `train_loader`, passing `val_loader` for validation.
trainer.fit(model, train_loader, val_loader)

In [None]:
# Drop the references to the trained objects, then ask PyTorch to release its
# cached GPU memory before the checkpoint is reloaded for inference.
del model, trainer
torch.cuda.empty_cache()

In [None]:
# Predict colour tags for the test set and collect them in `output_df`.
inference_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): the checkpoint file name says "resnet18" but it is loaded through the
# NFNet class — confirm the checkpoint and the model class match.
model = NFNet.load_from_checkpoint('/content/drive/MyDrive/rakuten_challenge/models/resnet18-all-data-epoch=05-val_f1_score=0.65.ckpt').to(inference_device)
# Evaluation mode is set once, before the loop, instead of on every batch.
model.eval()

# Separate name so the training `image_paths` Series from the data-processing
# section is not overwritten.
test_image_paths = pd.read_csv(data_path+'X_test.csv',index_col=0)['image_file_name']
# NOTE(review): test images are resized to 300x300 while the train/validation
# transforms use 256x256 — confirm which size the checkpoint was trained with.
test_transform = transforms.Compose([
    transforms.Resize((300,300)),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
test_set = TestImageDataset(test_image_paths, data_path+'images/',transform=test_transform)
# shuffle=False keeps predictions in the same order as the rows of X_test.csv.
test_loader = DataLoader(test_set,
                         shuffle=False,
                         batch_size=32,
                         num_workers=8)

# Gather every prediction in a plain list and build the frame once; growing a
# DataFrame with pd.concat inside the loop is quadratic in the number of batches.
predicted_colors = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for _, images in tqdm(test_loader):
        scores = model(images.to(inference_device)).cpu().numpy()
        # A class is predicted when its score exceeds 0.5 (assumes the model already
        # outputs probabilities rather than raw logits — TODO confirm).
        predicted_colors.extend(
            list(classes_correp[row.nonzero()[0]]) for row in scores > 0.5
        )

# One row per test item, default 0..N-1 index, single `color_tags` column.
output_df = pd.DataFrame({'color_tags': predicted_colors})

HBox(children=(FloatProgress(value=0.0, max=1168.0), HTML(value='')))




In [None]:
# Inspect the predicted tags before writing the submission file.
output_df

Unnamed: 0,color_tags
0,[]
1,[Black]
2,[Khaki]
3,[Navy]
4,[Grey]
...,...
37342,[White]
37343,"[Black, White]"
37344,"[Black, White]"
37345,[Green]


In [None]:
# Write the submission to Drive; the DataFrame index is written as the first CSV
# column (pandas default).
output_df.to_csv('/content/drive/MyDrive/rakuten_challenge/submissions/submission_3.csv')