## Seedlings Identification Challenge on Kaggle

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
from fastai.imports import *
from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *

In [4]:
import os
import random
import shutil

In [5]:
# Shell command: enable the widgetsnbextension notebook extension.
!jupyter nbextension enable --py widgetsnbextension

In [6]:
# Shell command: list the current working directory.
!ls -l

In [7]:
# Root data directory. Paths below are built by plain string concatenation
# (e.g. PATH+'train'), so the trailing slash is required.
PATH = "data/"
# NOTE: the commented alternative below assigns lowercase `path`, not PATH.
#path = "data/sample/"

In [8]:
# Entries found directly inside the training folder.
directories = os.listdir(PATH+ 'train')
print(directories)

In [9]:
# Top-level contents of the data directory.
os.listdir(PATH)

## Skip this part once initial directories have been populated

In [None]:
# The initially downloaded data is split into one directory per class, so that
# arrangement is kept here. Another option would have been to create a CSV file
# and load from file instead of from path; in general the path layout seems
# easier at the end, when forming the submission file.
# The destination is built from PATH (not a hard-coded 'data/') so that it
# follows the configured data root, and the loop variable no longer shadows
# the builtin `dir`.
for class_dir in directories:
    os.makedirs(os.path.join(PATH, 'valid', class_dir), exist_ok=True)

In [None]:
#import glob
#def get_1st(path): return glob(f'{path}/*.*')[0]
#get_1st(path)

In [None]:
# Take only the first os.walk() entry for the training folder, i.e. the names
# of its immediate sub-directories, and print them.
_,dirs,_ = next(os.walk(PATH+'train'))
print(dirs)

In [None]:
def create_valid_folder(_from, to, percentage=0.1, move=True, seed=None):
    """Transfer a random fraction of every class folder into a parallel tree.

    For each immediate sub-directory of ``_from`` a directory with the same
    name is created under ``to`` (if missing), and
    ``int(n_files * percentage)`` randomly chosen files are moved or copied
    into it.

    Args:
        _from: directory containing one sub-directory per class.
        to: destination root; class sub-directories are mirrored here.
        percentage: fraction in [0, 1] of each class's files to transfer.
        move: if True the files are moved out of ``_from``; if False they are
            copied with ``shutil.copyfile``.
        seed: optional seed for a reproducible selection. With the default
            ``None`` the module-level ``random`` state is used, as before.

    Raises:
        ValueError: if ``percentage`` is outside [0, 1].
        FileNotFoundError: if ``_from`` is not an existing directory.
    """
    if not 0 <= percentage <= 1:
        raise ValueError(f'percentage must be within [0, 1], got {percentage!r}')
    if not os.path.isdir(_from):
        # next(os.walk(...)) would otherwise fail with a bare StopIteration.
        raise FileNotFoundError(f'source directory not found: {_from!r}')

    rng = random if seed is None else random.Random(seed)
    transfer = shutil.move if move else shutil.copyfile

    _, folders, _ = next(os.walk(_from))
    # Directory listing order is arbitrary; sort so a given seed always
    # selects the same files.
    for folder in sorted(folders):
        src_dir = os.path.join(_from, folder)
        dst_dir = os.path.join(to, folder)
        os.makedirs(dst_dir, exist_ok=True)
        _, _, files = next(os.walk(src_dir))
        files = sorted(files)
        for filename in rng.sample(files, int(len(files) * percentage)):
            transfer(os.path.join(src_dir, filename), os.path.join(dst_dir, filename))



In [None]:
# Move 20% of the files of every class folder from train/ into valid/.
# Not idempotent: each run moves another 20% of whatever is left in train/.
create_valid_folder(PATH+'train',PATH+'valid', percentage=0.2, move=True)

In [None]:
# Build a small sample set from 10% of each class folder. The files are copied
# (move=False) rather than moved, so the training set is not shrunk by a
# further 10% for a folder that the training cells below never read.
create_valid_folder(PATH+'train',PATH+'sample', percentage = 0.1, move=False)

In [None]:
# Create the models directory under the configured data root (PATH) instead of
# a hard-coded 'data/' prefix, so it follows PATH if that is changed.
os.makedirs(os.path.join(PATH, 'models'), exist_ok=True)

In [None]:
# Shell listing of the validation folder; {…} interpolates the Python expression.
!ls {PATH+'valid'}

## Start from here when re-starting notebook

In [12]:
def get_1st(path):
    """Return the first file in ``path`` matching ``*.*``, in sorted order.

    glob() returns matches in arbitrary order, so the result is sorted to make
    the chosen file the same on every run.

    Args:
        path: directory to look in.

    Returns:
        The matching path string, formatted as ``f'{path}/<name>'``.

    Raises:
        IndexError: if no file matching ``*.*`` exists in ``path``.
    """
    matches = sorted(glob(f'{path}/*.*'))
    if not matches:
        raise IndexError(f'no files matching *.* found in {path!r}')
    return matches[0]

In [13]:
# Display the path of one image from the Cleavers training folder.
get_1st(PATH+'train/Cleavers')

In [14]:
# Plot one example image for a few classes. Titles and paths are derived from
# the same list of folder names so they cannot drift apart (the title used to
# read "Cleaver" while the class folder is "Cleavers").
sample_classes = ["Cleavers", "Charlock", "Maize"]
list_paths = [get_1st(PATH + 'train/' + cls) for cls in sample_classes]
plots_from_files(list_paths, titles=sample_classes, maintitle="Single-label classification")

In [15]:
# Read the second sample image (the Charlock one) and display its array shape.
img = plt.imread(list_paths[1])
img.shape

In [16]:
#import notebook/model specific packages and metrics here
#sklearn.metrics.f1_score
#(y_true, y_pred, labels=None, pos_label=1, average=’binary’, sample_weight=None)
from sklearn.metrics import f1_score
#from planet import f2

In [17]:
# Global configuration for the training cells below.
# NOTE(review): `metrics` is not passed to any learner in the cells of this
# notebook, so it currently has no effect on training output — confirm intent.
metrics=[f1_score] #adding appropriate metric here
sz=128 # starting off with 128 by 128 here, but could start with 64
f_model = resnet50 #specifying model here

In [18]:
def get_data(sz):
    """Build the image data object for image size ``sz``.

    Transforms are derived from the module-level ``f_model`` with the top-down
    augmentation set and ``max_zoom=1.05``. Images are read from the
    module-level ``PATH`` using the folder layout, with 'test' as the name of
    the test folder.
    """
    image_tfms = tfms_from_model(
        f_model,
        sz,
        aug_tfms=transforms_top_down,
        max_zoom=1.05,
    )
    model_data = ImageClassifierData.from_paths(PATH, tfms=image_tfms, test_name='test')
    return model_data

In [19]:
# Build the data object for the current sz, then resize to 1.3x that size.
# 'tmp' is the second argument to resize() — presumably the name of the folder
# holding the resized copies; confirm against the fastai version in use.
data = get_data(sz)
data = data.resize(int(sz*1.3), 'tmp')

In [None]:
# Transfer learning: build a learner on top of the pre-trained f_model weights.
# precompute=True is passed explicitly here.
# NOTE(review): the original note said precompute=True is the default — confirm against the installed fastai version.

learn = ConvLearner.pretrained(f_model, data, precompute=True)

In [20]:
#Find learning rate
# Run the learning-rate finder and plot its schedule; the returned value `lrf`
# is not used by later cells.
lrf=learn.lr_find()
learn.sched.plot()

In [21]:
# NOTE(review): 10e-2 is 0.1 (not 10**-2), so this evaluates to 0.2. If 0.02
# was intended, it should be written 2e-2 — confirm against the finder plot.
lr = 2*10e-2 #substituting the rate determined from the finder above

In [22]:
# First training run: 3 cycles at the chosen learning rate.
learn.fit(lr, 3) #here's a nice start

In [23]:
# Same call again, continuing from the state left by the previous cell.
learn.fit(lr, 3) #round 2

In [24]:
#first round or two of training, freeze the layers so not recomputing all your activations
learn.freeze()
# 3 cycles with cycle_len=1 and cycle_mult=2; checkpoints use the name prefix 'resnet50_128'.
learn.fit(lr, 3, cycle_len=1, cycle_mult=2, cycle_save_name='resnet50_128')
#cycle_save_name is nice, it will save intermediate parameter results
#in case of problems, or in case want to average or something

In [25]:
# Three learning rates (lr/9, lr/3, lr), passed to learn.fit() below as `lrs`.
lrs = np.array([lr/9,lr/3,lr])  # we don't want to fool too much with the pre-trained
#parameters, so we make the learning rate really small for the earlier layers

In [26]:
# Turn off precomputed activations on the learner itself. The previous
# `precompute = False` only created an unused notebook-level variable and left
# the learner (created with precompute=True) unchanged.
learn.precompute = False  #convolutional layer activations will no longer be precomputed (and therefore fixed)

In [27]:
#this will take a long time :)
# Unfreeze the learner and train with the per-group learning rates `lrs`
# and weight decay 0.001.
learn.unfreeze()
learn.fit(lrs, 3, wds=0.001, cycle_len=1, cycle_mult=2, cycle_save_name='resnet50_128') #replace name as needed
#learn.sched.plot_lr()  -- can be fun to see how learning rate changes
# Saved under the name str(sz), i.e. '128' while sz is 128.
learn.save(f'{sz}')

In [28]:
# Reload the weights saved by the previous cell.
learn.load('128')  

In [29]:
## switching to Adam instead of SGD
import torch.optim as optim
# NOTE(review): this rebinds `learn` to a newly constructed learner, so the
# learner loaded from '128' in the previous cell is no longer referenced —
# confirm that starting over is intended.
learn = ConvLearner.pretrained(f_model, data, precompute=False, opt_fn=optim.Adam)
lr = 0.01
wd = 0.025
# Per-group learning rates and weight decays, each scaled by 1/100, 1/10 and 1.
learn.fit(lrs=[lr/100, lr/10,lr], n_cycle=3, wds=[wd/100, wd/10, wd], use_wd_sched=True, cycle_len=1, cycle_mult=2)

In [30]:
# Plot the learning-rate schedule used by the last fit.
learn.sched.plot_lr()

In [31]:
sz=224  #time to up the image size

In [32]:
# Rebuild the data object at the new size, resized to 1.3x as for sz=128.
data=get_data(sz)
data=data.resize(int(sz*1.3), 'tmp')

In [33]:
# Point the existing learner at the larger images.
learn.set_data(data)

In [34]:
# NOTE(review): `learn` is re-created here, which replaces the learner that
# set_data() was just called on in the previous cell — confirm intent.
learn = ConvLearner.pretrained(f_model, data, precompute=False, opt_fn=optim.Adam)

In [35]:
# Learning-rate finder for the 224 learner; `lrf` is not used afterwards.
lrf=learn.lr_find()
learn.sched.plot()

In [36]:
lr = .005

In [37]:
# Train with layers frozen; checkpoints use the name prefix 'resnet50_224'.
learn.freeze()
learn.fit(lr, 3, cycle_len=1, cycle_mult=2, cycle_save_name='resnet50_224')

In [38]:
lrf=learn.lr_find()
learn.sched.plot()

In [39]:
lr = 0.005
wd = 0.025
# Per-group learning rates / weight decays; unfreeze() is only called in the
# next cell, so this fit runs on the learner as frozen above.
learn.fit(lrs=[lr/100, lr/10,lr], n_cycle=3, wds=[wd/100, wd/10, wd], use_wd_sched=True, cycle_len=1, cycle_mult=2,cycle_save_name='resnet50_224_a' )

In [40]:
learn.unfreeze()

In [41]:
lr = 0.003
wd = 0.025
# Same schedule after unfreezing, with a lower top learning rate.
learn.fit(lrs=[lr/100, lr/10,lr], n_cycle=3, wds=[wd/100, wd/10, wd], use_wd_sched=True, cycle_len=1, cycle_mult=2,cycle_save_name='resnet50_224_b' )

In [42]:
# Save the 224 weights under the name '224'.
learn.save('224')

In [50]:
# Restore the weights saved as '224'.
learn.load('224')

In [51]:
sz=299  #upping size one more time 
# NOTE(review): unlike the 128 and 224 steps, data.resize(...) is not called here.
data=get_data(sz)

In [52]:
# Keep the same learner and switch it to the 299 data.
learn.set_data(data)

In [53]:
lrf=learn.lr_find()
learn.sched.plot()

In [54]:
lr=0.0002

In [55]:
# Frozen training at 299; checkpoints use the name prefix 'resnet50_299_pre'.
learn.freeze()
learn.fit(lr, 3, cycle_len=1, cycle_mult=2, cycle_save_name='resnet50_299_pre')

In [56]:
learn.save('299_pre')

In [57]:
learn.load('299_pre')

In [58]:
# Unfrozen training with per-group learning rates and weight decays.
learn.unfreeze()
lr = 0.005
wd = 0.025
learn.fit(lrs=[lr/100, lr/10,lr], n_cycle=3, wds=[wd/100, wd/10, wd], use_wd_sched=True, cycle_len=1, cycle_mult=2,cycle_save_name='resnet50_299' )

In [59]:
# Save the 299 weights under the name '299'.
learn.save('299')

In [None]:
# Further experiments at 299. These cells have no execution count (In [None]).
lrf=learn.lr_find()
learn.sched.plot()

In [None]:
lr = 0.0001
wd = 0.025
learn.fit(lrs=[lr/100, lr/10,lr], n_cycle=3, wds=[wd/100, wd/10, wd], use_wd_sched=True, cycle_len=1, cycle_mult=2,cycle_save_name='resnet50_299_a' )

In [None]:
# NOTE(review): this lr is not passed to lr_find() and is reassigned in the
# next cell before any fit() call uses it.
lr = 0.00001
lrf=learn.lr_find()
learn.sched.plot()

In [None]:
lr=0.0001

In [None]:
wd = 0.03  # experimenting with different weight decay
learn.fit(lrs=[lr/100, lr/10,lr], n_cycle=3, wds=[wd/100, wd/10, wd], use_wd_sched=True, cycle_len=1, cycle_mult=2,cycle_save_name='resnet50_299_b')

In [None]:
lr=0.0001
wd = 0.025
# NOTE(review): cycle_save_name 'resnet50_299_a' is the same name used by an
# earlier fit in this section — checkpoint names may collide; confirm.
learn.fit(lrs=[lr/100, lr/10,lr], n_cycle=1, wds=[wd/100, wd/10, wd], use_wd_sched=True, cycle_len=1, cycle_mult=2,cycle_save_name='resnet50_299_a' )

In [None]:
# Saves under '299' again, the same name used after the first 299 training run.
learn.save('299')

In [None]:
learn.load('299')

In [None]:
learn.freeze()


In [None]:
lr=0.0001

In [None]:
# NOTE(review): cycle_save_name 'resnet50_299_pre' was already used by the
# earlier frozen 299 fit — checkpoint names may collide; confirm.
learn.fit(lr, 3, cycle_len=1, cycle_mult=2, cycle_save_name='resnet50_299_pre')

In [None]:
# NOTE(review): as above, this lr is not passed to lr_find() and is reassigned
# before the next fit() call.
lr = 0.00001
lrf=learn.lr_find()
learn.sched.plot()

In [None]:
learn.unfreeze()

In [None]:
lr = 0.001
wd = 0.015
learn.fit(lrs=[lr/100, lr/10,lr], n_cycle=3, wds=[wd/100, wd/10, wd], use_wd_sched=True, cycle_len=1, cycle_mult=2,cycle_save_name='resnet50_299_d' )

### Generating submission file

In [65]:
#compute test time augmentations on the validation set, to get metric
# `f2` is not defined in this notebook (its import from `planet` is commented
# out), so the score is computed with the already-imported sklearn f1_score.
tta = learn.TTA()
val_log_preds, val_y = tta
# Average the exponentiated predictions over the first axis, exactly as is
# done for the test-set predictions further down.
val_probs = np.mean(np.exp(val_log_preds), 0)
# Micro-averaging is assumed to match the competition metric — TODO confirm.
f1_score(val_y, np.argmax(val_probs, axis=1), average='micro')

In [66]:
#now get actual predictions
# Same TTA call as for validation, but on the test set (is_test=True).
log_preds, y = learn.TTA(is_test=True)

In [67]:
log_preds.shape

In [68]:
# Exponentiate the predictions and average over the first axis.
probs = np.mean(np.exp(log_preds), 0)
print(probs.shape)

In [69]:
len(probs)

In [70]:
# Index of the highest-probability column per row.
# NOTE(review): `preds` is not used by the submission cells below, which
# derive the label with idxmax on the DataFrame instead.
preds = np.argmax(probs, axis=1); preds.shape

In [72]:
# One row per test image, one column per entry of probs' second axis.
df2 = pd.DataFrame(probs)

In [74]:
# Name the probability columns after the classes reported by the data object.
df2.columns = data.classes

In [76]:
# Add the image file name as the first column. os.path.basename strips the
# folder part of each entry, replacing the previous fixed 5-character slice
# that only worked for a 'test/' prefix.
# Not idempotent: re-running this cell fails because 'file' already exists.
df2.insert(0, 'file', [os.path.basename(o) for o in data.test_ds.fnames])

#or may need to read from directory:  test_fnames = read_dir(PATH, 'test-jpg')

In [None]:
#don't need here
# Map column positions 1..12 of df2 to the column name followed by a space.
# The original referenced `ds2`, which is not defined anywhere in this
# notebook; `df2` is the frame built above.
mapp = {i: df2.columns[i] + ' ' for i in range(1, 13)}
print(mapp)

In [77]:
# For every row pick the class column holding the highest probability.
# The class columns are taken from data.classes (the same list used to name
# the columns above) instead of a hand-typed copy that could drift from it.
df2['species'] = df2[list(data.classes)].idxmax(axis=1)

In [78]:
# Preview the probability frame with the new 'file' and 'species' columns.
df2.head()

In [80]:
# Raw test file names as stored on the data object, for comparison.
data.test_ds.fnames[0:2]

In [81]:
# Keep only the two columns written to the submission file.
df4 = df2.loc[:,['file','species']]

In [82]:
df4.head()

In [83]:
# Create the submission folder at the same location the CSV is written to.
# Previously './subm' was created while the file went to f'{PATH}subm/', so
# to_csv failed unless that second directory already existed.
subm = f'{PATH}subm/'
os.makedirs(subm, exist_ok=True)
df4.to_csv(f'{subm}seedlings6.csv', index=False)  ##compression='gzip', index=False)

In [84]:
# Render a link to the submission CSV written in the previous cell.
FileLink(f'{subm}seedlings6.csv')
#seedlings6 scored .97858; best so far is .97984