In [1]:
# Hugging Face dataset bootstrapping
OWNER_NAME = 'competitions/'
DATASET_NAME = 'aiornot'

!pip install -Uqq datasets
!pip install -Uqq python-dotenv

from dotenv import load_dotenv
import os
from pathlib import Path
from datasets import load_dataset


load_dotenv('/notebooks/.env')
access_token = os.environ.get('HF_TOKEN')
DOWNLOADS = Path('downloads')
path = DOWNLOADS/DATASET_NAME

print('about to download', OWNER_NAME+DATASET_NAME, 'to', DOWNLOADS)
ds = load_dataset(OWNER_NAME+DATASET_NAME, use_auth_token=access_token)
print('done')

about to download competitions/aiornot to downloads


Downloading readme:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

Using custom data configuration competitions--aiornot-2c268a235d50d29c


Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/competitions___parquet/competitions--aiornot-2c268a235d50d29c/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/415M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/416M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/416M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/354M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/356M [00:00<?, ?B/s]

Computing checksums of downloaded files. They can be used for integrity verification. You can disable this by passing ignore_verifications=True to load_dataset


Computing checksums:  83%|########3 | 5/6 [00:05<00:01,  1.07s/it]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/competitions___parquet/competitions--aiornot-2c268a235d50d29c/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

done


## Data preprocessing
Exporting the images from the downloaded parquet dataset into folders on disk

In [2]:
from datasets import Image as ImageFeature

# Cast the 'image' column of every split to the datasets Image feature type,
# then display the resulting DatasetDict.
image_feature = ImageFeature()
ds = ds.cast_column('image', image_feature)
ds

DatasetDict({
    test: Dataset({
        features: ['id', 'image', 'label'],
        num_rows: 43442
    })
    train: Dataset({
        features: ['id', 'image', 'label'],
        num_rows: 18618
    })
})

Let's save the dataset in folders 

In [3]:
# Display the export root (DOWNLOADS/DATASET_NAME) under which the image folders are created.
path

PosixPath('downloads/aiornot')

In [4]:
# parents=True also creates the 'downloads' parent folder, which does not exist on a
# fresh machine; without it this call raises FileNotFoundError.
path.mkdir(parents=True, exist_ok=True)

In [5]:
# Create <split>/<label> folders for both splits and both class labels.
# parents=True is required because path/'train' and path/'test' themselves are
# never created elsewhere; without it mkdir raises FileNotFoundError on a fresh run.
for split_name in ('train', 'test'):
    for label_name in ('0', '1'):
        (path/split_name/label_name).mkdir(parents=True, exist_ok=True)

In [6]:
from PIL import Image
from tqdm.auto import tqdm

# Save train images to path/'train'/<label>/<id>.
# This export used to be commented out, which left the folder empty after a fresh
# "Restart & Run All" and broke every later cell that reads path/'train'.
# It is safe to re-run: the id and label columns are read up front, and an image is
# only decoded and written when its target file does not exist yet.
train_ds = ds['train']
train_ids, train_labels = train_ds['id'], train_ds['label']
for i, (fname, label) in tqdm(enumerate(zip(train_ids, train_labels)), total=train_ds.num_rows):
    fpath = path/'train'/str(label)/fname
    if not fpath.is_file():
        fpath.parent.mkdir(parents=True, exist_ok=True)
        train_ds[i]['image'].save(fpath)

In [7]:
# Save test images to path/'test'/<id> (flat, no label sub-folder).
# This export used to be commented out, which left the folder empty after a fresh
# "Restart & Run All" and broke the later prediction cells.
# It is safe to re-run: an image is only decoded and written when its file is missing.
test_ds = ds['test']
test_dir = path/'test'
test_dir.mkdir(parents=True, exist_ok=True)
for i, fname in tqdm(enumerate(test_ds['id']), total=test_ds.num_rows):
    fpath = test_dir/fname
    if not fpath.is_file():
        test_ds[i]['image'].save(fpath)

### fastai baseline vision learner

In [8]:
# gpu memory management
import gc, torch
!pip install -Uqq pynvml

def free_gpu() -> None:
    """Run Python garbage collection, then empty PyTorch's CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()
    
def report_gpu() -> None:
    """Print torch's listing of GPU processes, then call free_gpu()."""
    gpu_processes = torch.cuda.list_gpu_processes()
    print(gpu_processes)
    free_gpu()

In [9]:
from fastai.vision.all import *

# Collect every image file under the train folder and show how many were found.
train_dir = path/'train'
files = get_image_files(train_dir)
len(files)

18618

In [10]:
# Print the current GPU process listing and free cached GPU memory before training.
report_gpu()

GPU:0
no processes are running


In [11]:
# Hold out a random 20% of the train folder for validation, in batches of 16.
# The seed pins the random split so that re-running the notebook validates on the
# same images; without it, metrics from different runs are not comparable.
dls = ImageDataLoaders.from_folder(path/'train', valid_pct=0.2, seed=42, bs=16)
# Number of batches in the training and validation loaders.
print(len(dls.train), len(dls.valid))

930 233


In [12]:
# resnet50 vision learner reporting error_rate, with a GradientAccumulation(64)
# callback and mixed-precision training enabled via to_fp16().
# NOTE(review): presumably 64 is the number of samples accumulated per optimizer
# step (i.e. 4 batches at bs=16) — confirm against the fastai docs.
learn = vision_learner(dls, resnet50, metrics=error_rate, cbs=GradientAccumulation(64)).to_fp16()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth


  0%|          | 0.00/97.8M [00:00<?, ?B/s]

In [13]:
# Fine-tune: 2 frozen epochs (freeze_epochs) followed by 3 more epochs, base_lr=1e-3.
learn.fine_tune(epochs=3, base_lr=1e-3, freeze_epochs=2)

epoch,train_loss,valid_loss,error_rate,time
0,0.310925,0.20421,0.069299,03:00
1,0.222168,0.168059,0.065001,02:39


epoch,train_loss,valid_loss,error_rate,time
0,0.156358,0.126242,0.048885,03:19
1,0.104974,0.085055,0.029815,03:18
2,0.05762,0.075238,0.025248,03:18


In [14]:
# Continue training the same learner for 3 more epochs at a lower learning rate (5e-5).
learn.fit_one_cycle(3, 5e-5)

epoch,train_loss,valid_loss,error_rate,time
0,0.073284,0.092176,0.031695,03:18
1,0.037752,0.075269,0.026323,03:18
2,0.017595,0.06601,0.023905,03:17


In [24]:
# Train for 1 more epoch at 5e-5.
# NOTE(review): the execution counter jumps from In [14] to In [24] here, and the
# submission cell labels the run "2+3+3+3 epochs" while this cell trains only 1 epoch.
# The recorded metrics may come from additional training not shown in the notebook — confirm.
learn.fit_one_cycle(1, 5e-5)

epoch,train_loss,valid_loss,error_rate,time
0,0.025646,0.044835,0.013161,03:15


## Evaluate on test files

In [29]:
# Collect every image file under the test folder and show how many were found.
test_files = get_image_files(path/'test')
len(test_files)

43442

In [30]:
# Build a test dataloader over the test files from the learner's dataloaders.
test_dl = learn.dls.test_dl(test_files)
# Predict with learn.tta(); the second returned value is discarded.
preds, _ = learn.tta(dl=test_dl)

In [31]:
# Second column of the prediction tensor.
# NOTE(review): presumably the predicted probability of class '1' — confirm against dls.vocab.
preds[:, 1]

TensorBase([9.9958e-01, 1.2630e-06, 9.8388e-01,  ..., 9.9133e-07,
            1.0000e+00, 9.9996e-01])

## Prepare submission file

In [32]:
# Load the sample submission as the template for the submission file.
# NOTE(review): no cell shown in this notebook creates sample_submission.csv under
# `path`; it presumably has to be placed there manually — confirm.
submission = pd.read_csv(path/'sample_submission.csv')
submission.shape

(43442, 2)

In [33]:
# Attach predictions to submission rows by file name instead of by position.
# get_image_files() makes no promise that its order matches the row order of
# sample_submission.csv, so a positional assignment can silently pair predictions
# with the wrong ids. preds rows follow the order of test_files.
pred_by_id = {f.name: p for f, p in zip(test_files, preds[:, 1].tolist())}
# Fail loudly if any submission id has no prediction, rather than writing NaN labels.
missing_ids = set(submission['id']) - pred_by_id.keys()
assert not missing_ids, f'{len(missing_ids)} submission ids have no prediction'
submission['label'] = submission['id'].map(pred_by_id)
submission

Unnamed: 0,id,label
0,0.jpg,9.995835e-01
1,1.jpg,1.262985e-06
2,10.jpg,9.838781e-01
3,100.jpg,1.719565e-06
4,1000.jpg,9.999942e-01
...,...,...
43437,9995.jpg,9.289585e-01
43438,9996.jpg,9.999926e-01
43439,9997.jpg,9.913288e-07
43440,9998.jpg,9.999986e-01


In [34]:
# Sanity check: mean of the predicted label column.
submission.label.mean()

0.56004536

## Submit CSV to Hugging Face

In [35]:
# write submission csv
# Free-text tag describing the training recipe; it becomes part of the file name.
technique = '-resnet50 finetune 2+3+3+3 epochs undecoded-'
sub_filename = Path('subs')/(DATASET_NAME+'-'+technique+'-sub.csv')
# to_csv does not create missing directories, so make sure 'subs' exists first;
# otherwise the write fails on a fresh checkout.
sub_filename.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame index out of the written file.
submission.to_csv(sub_filename, index=False)
print('done')

done
