In [1]:
from fastai import *
from fastai.vision import *
from fastai.widgets import *
from torch.nn import CrossEntropyLoss

In [2]:
# Force a garbage-collection pass; the number displayed below is gc.collect()'s return value.
# NOTE(review): `gc` is never imported explicitly — presumably it arrives via the fastai star imports.
gc.collect()

31

In [3]:
# Kaggle competition slug, interpolated into the `!kaggle competitions submit` cells further down.
competition = 'datasciencebowl'

In [None]:
# One-off download/extraction of the competition data, kept commented out so re-runs skip it.
# NOTE(review): `path` is not defined in the visible part of this notebook, and
# `plankton_path` is only assigned in the next cell — fix both before re-enabling.
#!kaggle competitions download -c datasciencebowl -p {plankton_path}  
#!unzip {path}/sampleSubmission.csv.zip
#!unzip {path}/test.zip
#!unzip {path}/train.zip

In [4]:
# Dataset lives under fastai's configured data directory, split into train/ and test/.
plankton_path = Config.data_path().joinpath('plankton')
train_path = plankton_path.joinpath('train')
test_path = plankton_path.joinpath('test')
# Checkpoints, exported learners and submission files are written here.
project_path = pathlib.Path.home().joinpath('PROJEX/competitions_1/plankton')

In [5]:
# Sanity check: every directory the notebook depends on must already exist.
tuple(folder.exists() for folder in (project_path, train_path, test_path))

(True, True, True)

In [None]:
#df_sub = pd.read_csv(plankton_path/'sampleSubmission.csv')

In [None]:
#df_sub.info()

In [None]:
#df_sub.shape

In [None]:
#df_sub.head(1).T[1:].sum()

It appears that the goal is to produce a CSV file with one row per image and one column per class. For a given image, each class column holds the predicted probability that the image belongs to that class.

## Data Dealings

The training data set is partitioned into folders named after the classes the images belong to. The table above lists 122 classes.

But first in order to play with data augmentation, I am going to alter the default transforms applied to the training set. 

In [7]:
# Custom augmentation settings: vertical flips enabled, wider rotate/zoom/lighting/warp
# ranges, and both the affine and lighting groups applied with probability 1.
tfms = get_transforms(
    flip_vert=True,
    max_rotate=25,
    max_zoom=1.3,
    max_lighting=0.4,
    max_warp=0.4,
    p_affine=1.0,
    p_lighting=1.0,
)

In [None]:
def get_data(size, bs, padding_mode='reflection'):
    """Build a normalized DataBunch of the training images at a given resolution.

    The previous body referenced `src` and `r`, neither of which is defined in
    the notebook, and ignored every argument, so any call raised NameError.
    Labels are now taken from each image's parent folder, matching the
    one-folder-per-class layout of `train_path`.

    Args:
        size: Target image size passed to the transforms.
        bs: Batch size of the resulting data loaders.
        padding_mode: Padding mode passed to the transforms.

    Returns:
        A DataBunch with a 20% validation hold-out, augmented with the
        notebook-level `tfms` and normalized with `imagenet_stats`.
    """
    return (ImageList.from_folder(train_path)
            # Fixed seed: calls with different `size`/`bs` must share the same
            # validation images, otherwise validation leaks into training.
            .split_by_rand_pct(0.2, seed=42)
            .label_from_folder()
            .transform(tfms, size=size, padding_mode=padding_mode)
            .databunch(bs=bs)
            .normalize(imagenet_stats))

In [None]:
# Seed NumPy before the random 80/20 split so the validation set is identical on
# every run; otherwise a checkpoint reloaded in a fresh kernel would be evaluated
# on images it was trained on.
np.random.seed(42)
test = ImageList.from_folder(test_path)
# NOTE(review): this baseline uses the default get_transforms(); the custom `tfms`
# defined above is not applied here.
data = (ImageDataBunch.from_folder(train_path, train='./', valid_pct=0.2,
                                   ds_tfms=get_transforms(), size=128)
        .normalize(imagenet_stats))

In [None]:
# Peek at the first five class labels picked up from the training folders.
data.classes[:5]

In [None]:
#for c in data.classes:
#    verify_images(train_path/c, )

In [None]:
# Visual sanity check of a 3-row grid of samples from the data bunch.
data.show_batch(rows=3, figsize=(9, 8))

In [None]:
# Display data.c together with the sizes of the training and validation datasets.
data.c, len(data.train_ds), len(data.valid_ds)

In [None]:
# Create a learner on a resnet34 architecture that reports error_rate as its metric.
learn = cnn_learner(data, models.resnet34, metrics=error_rate)
# Explicitly replace the learner's loss function with torch's CrossEntropyLoss.
learn.loss_func = CrossEntropyLoss()

---
## TRAINING - PART 1
---

### Train model

In [None]:
# Run the learning-rate finder; its results are plotted in the next cell.
learn.lr_find()

In [None]:
# Plot what the recorder captured during the lr_find run above.
learn.recorder.plot()

In [None]:
# Base learning rate; reused later (as lr/50) when fitting the unfrozen model.
lr = 1e-2
# Stage 1: one-cycle fit with 5 as the first argument and slice(lr) as the learning rate.
learn.fit_one_cycle(5, slice(lr))

In [None]:
# Checkpoint the stage-1 weights under project_path; the next cell stats the resulting .pth file.
learn.save(project_path / 'size_128_stage-1')

In [None]:
# File metadata of the saved stage-1 checkpoint, used below to report its size.
stat = (project_path/'size_128_stage-1.pth').stat()

In [None]:
# Checkpoint size in megabytes (bytes divided by 1e6), to two decimals.
print(f'model size: {stat.st_size/int(1e6):.2f}MB')

---
## END OF TRAINING - PART 1
---

Now I'll give it a first shot with the test set to see where that puts me on the leaderboard.

In [None]:
# Collect the test images and restore the stage-1 weights into the learner.
test = ImageList.from_folder(test_path)
# Trailing ';' suppresses the cell's output.
learn.load(project_path / 'size_128_stage-1');

In [None]:
# Export the learner to project_path/export.pkl so it can be reloaded for inference.
learn.export(project_path/'export.pkl')

In [None]:
# Reload the exported learner with the test images attached.
# NOTE(review): no file name is passed, so this relies on load_learner's default —
# presumably the 'export.pkl' written by the previous cell; confirm.
lexp = load_learner(project_path, test=test)

In [None]:
# Predict on the test set; `y` is not used again in the visible part of this notebook.
preds, y = lexp.get_preds(ds_type=DatasetType.Test)

In [None]:
# File names of the test images, in the same order as the rows of `preds`.
images = [image.name for image in lexp.data.test_ds.items]
# Build the frame from the numeric predictions and insert the names afterwards:
# stacking names and floats with np.c_ coerced every probability to a string.
df_pred = pd.DataFrame(preds.numpy(), columns=data.classes)
df_pred.insert(0, 'image', images)
df_pred.shape

In [None]:
# Inspect the first rows of the prediction table before writing it out.
df_pred.head()

In [None]:
# Write the submission with 'image' as the index column, so no extra numeric index is emitted.
df_pred.set_index('image', drop=True).to_csv(project_path/'submission3.csv')

In [None]:
# Submit submission3.csv to the competition named in `competition` via the Kaggle CLI.
!kaggle competitions submit {competition} -f {project_path/'submission3.csv'} -m "size_128_frozenmdl_submission"

The above scored 0.9730, which ranks me as 235th out of 1050 on the private leaderboard.

---
## TRAINING - PART 2
---

Now I'll reload stage 1, unfreeze the model and retrain.

In [None]:
# Leaderboard position as a percentage: rank 235 out of 1050.
235/1050*100

In [None]:
# Start stage 2 from the saved stage-1 weights; `test` is rebuilt so this cell runs on its own.
test = ImageList.from_folder(test_path)
# Trailing ';' suppresses the cell's output.
learn.load(project_path / 'size_128_stage-1');

In [None]:
# Unfreeze the model so the following fit is not restricted to the frozen stage-1 setup.
learn.unfreeze()

In [None]:
# Re-run the learning-rate finder now that the model is unfrozen.
learn.lr_find()

In [None]:
# Plot the recorder's data from the lr_find run on the unfrozen model.
learn.recorder.plot()

In [None]:
# Stage 2: one-cycle fit with a learning-rate slice from 1e-6 up to lr/50
# (lr is the stage-1 base rate defined earlier).
learn.fit_one_cycle(5, slice(1e-6, lr/50))

In [None]:
# Save next to the stage-1 checkpoint. A bare name is resolved against the
# learner's own model directory (presumably under the training-data folder),
# which is inconsistent with every other save/load/export in this notebook.
learn.save(project_path / 'size_128_stage-2')

In [None]:
# Export the unfrozen, fine-tuned learner under its own file name for make_submission below.
learn.export(project_path/'export_128_unfr.pkl')

In [None]:
def make_submission(submission_name='submissionX', exported_learner='export.pkl'):
    """Predict on the test images with an exported learner and write a submission CSV.

    Changes from the earlier version: the unused read of sampleSubmission.csv is
    gone (it made the function fail when that file had not been unzipped), the
    probabilities stay numeric instead of being coerced to strings by `np.c_`,
    and the column names come from the loaded learner rather than the global
    `data`, so they always match the model that produced the predictions.

    Args:
        submission_name: File name of the CSV, created inside `project_path`.
        exported_learner: File name of the exported learner inside `project_path`.

    Returns:
        None. The CSV is written to `project_path/submission_name`.
    """
    # `test` is the notebook-level ImageList of test images.
    lexp = load_learner(project_path, file=exported_learner, test=test)
    images = [image.name for image in lexp.data.test_ds.items]
    preds, _ = lexp.get_preds(ds_type=DatasetType.Test)
    # One row per test image, one column per class, image name as the index.
    df_pred = pd.DataFrame(preds.numpy(),
                           index=pd.Index(images, name='image'),
                           columns=lexp.data.classes)
    df_pred.to_csv(project_path/submission_name)

In [None]:
# Build submission4.csv from the learner exported after the unfrozen (stage-2) training.
make_submission(submission_name='submission4.csv', exported_learner='export_128_unfr.pkl')

In [None]:
submission_message='size_128_unfrozenmdlsubmission',
!kaggle competitions submit {competition} -f {project_path/'submission4.csv'} -m f"{submission_message}"

The above scored 0.88689, which moved me up to 185th out of 1050.

---
## TRAINING - PART 3
---

Here I load a different model, in this case

In [None]:
# Leaderboard position as a percentage: rank 185 out of 1050.
185/1050*100

In [None]:
# Build an interpretation object from the current learner for the error analysis below.
interp = ClassificationInterpretation.from_learner(learn)

In [None]:
# Keep the confusion matrix; `conf_mat` is not used again in the visible part of this notebook.
conf_mat = interp.confusion_matrix()

In [None]:
# Show the 9 top-loss samples with heatmaps enabled.
interp.plot_top_losses(9, figsize=(20, 20), heatmap=True)

In [None]:
import matplotlib.pyplot as pl
from seaborn import heatmap

In [None]:
# Plot the confusion matrix on a large figure and keep the figure so the next cell can save it.
f1= interp.plot_confusion_matrix(figsize=(40, 40), return_fig=True )

In [None]:
# Save the confusion-matrix figure at 300 dpi.
# NOTE(review): the relative path resolves against the working directory, not project_path.
f1.savefig('confmat.png', dpi=300)

## Post learning data cleaning

ImageDeleter is essentially the same as the old FileDeleter.

ImageRelabeler renders the files you pass in and lets you re-label them. This is to find mis-categorized images in your data directory. NOTE: ImageRelabeler currently only works with files where labels were created from the names of their parent directory (i.e. with .from_folder()). The widget moves mislabeled photos from the incorrect parent directory to the properly-labeled parent directory.

To relabel an image, just click the proper label in the widget dropdown.

Both widgets take a formatted dataset from DatasetFormatter. You can easily specify which dataset you’d like to render by passing in DatasetType.Valid, DatasetType.Train or DatasetType.Test to the ds_type keyword arg in DatasetFormatter.from_toplosses.

All three classes are available in fastai.widgets.

In [None]:
# Get a dataset and indices ordered by top losses for the current learner.
# No ds_type is passed, so DatasetFormatter's default dataset is used.
ds, idxs = DatasetFormatter().from_toplosses(learn)
# Open the cleaning widget on those samples, with project_path as its path argument.
ImageCleaner(ds, idxs, project_path)