In [7]:
# Hugging Face dataset bootstrapping
OWNER_NAME = 'competitions/'
DATASET_NAME = 'aiornot'

!pip install -Uqq datasets
!pip install -Uqq python-dotenv

from dotenv import load_dotenv
import os
from pathlib import Path
from datasets import load_dataset


load_dotenv('/notebooks/.env')
access_token = os.environ.get('HF_TOKEN')
DOWNLOADS = Path('downloads')
path = DOWNLOADS/DATASET_NAME

print('about to download', OWNER_NAME+DATASET_NAME, 'to', DOWNLOADS)
ds = load_dataset(OWNER_NAME+DATASET_NAME, use_auth_token=access_token)
print('done')

about to download competitions/aiornot to downloads


Using custom data configuration competitions--aiornot-c64672d1851055ac
Found cached dataset parquet (/root/.cache/huggingface/datasets/competitions___parquet/competitions--aiornot-c64672d1851055ac/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

done


## Data preprocessing
Convert the downloaded dataset (cached as Parquet) into image files organised in folders.

In [8]:
from datasets import Image as ImageFeature

# Re-declare the 'image' column with the `datasets` Image feature type,
# then display the resulting DatasetDict.
ds = ds.cast_column(column='image', feature=ImageFeature())
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'image', 'label'],
        num_rows: 18618
    })
    test: Dataset({
        features: ['id', 'image', 'label'],
        num_rows: 43442
    })
})

Let's save the dataset's images into folders.

In [9]:
# Display the root folder (DOWNLOADS/DATASET_NAME) that the images are exported to.
path

PosixPath('downloads/aiornot')

In [10]:
# parents=True: the parent 'downloads' directory may not exist yet on a fresh
# machine, in which case a plain mkdir() raises FileNotFoundError.
path.mkdir(parents=True, exist_ok=True)

In [11]:
# Create the per-split, per-label folders. parents=True is required because
# path/'train' and path/'test' are not created anywhere else in the notebook,
# so a plain mkdir() on their children raises FileNotFoundError.
for split in ('train', 'test'):
    for label in ('0', '1'):
        (path/split/label).mkdir(parents=True, exist_ok=True)

In [12]:
from PIL import Image
from tqdm.auto import tqdm

# NOTE(review): the export loop below is disabled, presumably because the images
# were already written on an earlier run -- confirm. It has to be re-enabled on a
# fresh machine, because later cells read image files from path/'train'.
# When enabled, it walks every row of ds['train'] and saves the row's image to
# path/'train'/<label>/<id>, skipping files that already exist.
# save train images to folder
#for i, fname in tqdm(enumerate(ds['train']['id']), total=ds['train'].num_rows):
#    img = ds['train'][i]['image']
#    label = str(ds['train'][i]['label'])
#    fpath = path/'train'/label/fname
#    #print(fpath)
#    if not fpath.is_file():
#        img.save(fpath)

In [13]:
# NOTE(review): disabled export loop, presumably already run once -- confirm.
# When enabled, it saves every image of ds['test'] to path/'test'/<id>, skipping
# files that already exist. Unlike the train export, `label` is computed but not
# used in the target path, so files land directly under path/'test' rather than
# in the '0'/'1' subfolders.
# save test images to folder
#for i, fname in tqdm(enumerate(ds['test']['id']), total=ds['test'].num_rows):
#    img = ds['test'][i]['image']
#    label = str(ds['test'][i]['label'])
#    fpath = path/'test'/fname
#    #print(fpath)
#    if not fpath.is_file():
#        img.save(fpath)

### fastai baseline vision learner

In [14]:
# gpu memory management
import gc, torch
!pip install -Uqq pynvml

def free_gpu():
    """Run Python garbage collection, then empty PyTorch's CUDA cache."""
    # Collect first so objects that are no longer referenced are gone
    # before the cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()
    return None
    
def report_gpu():
    """Print the GPU process listing reported by torch, then call free_gpu()."""
    process_summary = torch.cuda.list_gpu_processes()
    print(process_summary)
    free_gpu()

In [15]:
from fastai.vision.all import *

# Collect the image files found under the train folder and show how many there are.
files = get_image_files(path.joinpath('train'))
len(files)

18618

In [16]:
# Print the current GPU process listing and release cached memory before training.
report_gpu()

GPU:0
no processes are running


In [17]:
# Hold out a random 20% of the train folder for validation. A fixed seed makes
# the split identical on every run; without it the validation set changes each
# time the cell is executed, so metrics are not comparable between runs.
dls = ImageDataLoaders.from_folder(path/'train', valid_pct=0.2, seed=42, bs=32)
# Number of batches in the training and validation loaders.
print(len(dls.train), len(dls.valid))

465 117


In [18]:
# ResNet-50 vision learner tracking error rate, with a gradient-accumulation
# callback configured with n_acc=64.
learn = vision_learner(
    dls,
    resnet50,
    metrics=error_rate,
    cbs=GradientAccumulation(n_acc=64),
)



In [19]:
# Fine-tune with freeze_epochs=3 followed by epochs=5, at base learning rate 1e-3.
learn.fine_tune(5, base_lr=1e-3, freeze_epochs=3)

epoch,train_loss,valid_loss,error_rate,time


KeyboardInterrupt: 

## Evaluate on test files

In [None]:
# Collect the image files found under the test folder and show how many there are.
test_files = get_image_files(path.joinpath('test'))
len(test_files)

In [163]:
# Build a dataloader over the test files using the learner's data pipeline.
test_dl = learn.dls.test_dl(test_files)
# get_preds returns a (predictions, targets) tuple, so unpacking it into a single
# name (`preds, = ...`) raises ValueError. The targets slot is not used here.
# A named throwaway is used instead of `_`, which IPython reserves for the last output.
preds, _targs = learn.get_preds(dl=test_dl)

In [182]:
# Second column of the prediction tensor for every row.
# NOTE(review): presumably the probability of label '1' -- confirm against learn.dls.vocab.
preds[:, 1]

TensorBase([9.9998e-01, 1.5562e-04, 9.9684e-01,  ..., 2.0520e-07,
            9.9998e-01, 9.9999e-01])

## Prepare submission file

In [184]:
# Load the sample submission as the template to fill in, and show its shape.
submission = pd.read_csv(path.joinpath('sample_submission.csv'))
submission.shape

(43442, 2)

In [186]:
# `decoded` is not defined anywhere in this notebook (it only existed as leftover
# kernel state). The values displayed for this cell match preds[:, 1], so that
# column is used directly.
# NOTE(review): column 1 presumably corresponds to label '1' -- confirm against learn.dls.vocab.
# Predictions are matched to rows by file name rather than by position, so the
# result does not depend on the order in which the test files were listed.
prob_by_id = dict(zip((f.name for f in test_files), preds[:, 1].tolist()))
submission['label'] = submission['id'].map(prob_by_id)
# Fail loudly if any submission id has no matching test file.
assert submission['label'].notna().all(), 'some submission ids have no prediction'
submission

Unnamed: 0,id,label
0,0.jpg,9.999846e-01
1,1.jpg,1.556160e-04
2,10.jpg,9.968406e-01
3,100.jpg,5.743017e-06
4,1000.jpg,9.999951e-01
...,...,...
43437,9995.jpg,9.411466e-01
43438,9996.jpg,9.996895e-01
43439,9997.jpg,2.051955e-07
43440,9998.jpg,9.999838e-01


In [187]:
# Average predicted value over all rows, as a quick sanity check.
submission['label'].mean()

0.5629704

## Submit CSV to Hugging Face

In [188]:
# write submission csv
# NOTE(review): this tag says resnet34 / 3 epochs, while the learner in this
# notebook is built with resnet50 and fine_tune(epochs=5, freeze_epochs=3) --
# confirm which run this file describes before submitting.
technique = '-resnet34 finetune 3 epochs undecoded-'
sub_filename = Path('subs')/(DATASET_NAME+'-'+technique+'-sub.csv')
# to_csv does not create missing directories, so make sure 'subs' exists first.
sub_filename.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(sub_filename, index=False)
print('done')

done
