In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from os import listdir, makedirs, getcwd, remove
print(os.listdir("../input"))

# Import packages
import time, copy
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from PIL import Image
from torchvision import datasets, models, transforms
from torchvision.utils import make_grid
from torchvision.models.vgg import model_urls
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.data.sampler import SubsetRandomSampler

from os.path import isfile, join, abspath, exists, isdir, expanduser

# Any results you write to the current directory are saved as output.

['pytorch-pretrained-image-models', 'iwildcam-2019-fgvc6']


## Load dataset

In [2]:
# Class names
class_names = ['empty', 'deer', 'moose', 'squirrel', 'rodent', 'small_mammal', 'elk', 'pronghorn_antelope', 
               'rabbit', 'bighorn_sheep', 'fox', 'coyote', 'black_bear', 'raccoon', 'skunk', 'wolf', 
               'bobcat', 'cat', 'dog', 'opossum', 'bison', 'mountain_goat', 'mountain_lion']

# csv file & data file path
train_csv_file = '../input/iwildcam-2019-fgvc6/train.csv'
train_data_dir = '../input/iwildcam-2019-fgvc6/train_images'
test_csv_file = '../input/iwildcam-2019-fgvc6/test.csv'
test_data_dir = '../input/iwildcam-2019-fgvc6/test_images'

### 1. Read basic information in csv files and save in pandas.DataFrame.

In [3]:
# Read csv
train_val_df = pd.read_csv(train_csv_file)
test_df = pd.read_csv(test_csv_file)

# Create a new feature 'category' (string) in train_df for better plotting/understanding
train_val_df['category'] = train_val_df['category_id'].apply(lambda id: class_names[id])

In [4]:
train_val_df.head()

Unnamed: 0,category_id,date_captured,file_name,frame_num,id,location,rights_holder,seq_id,seq_num_frames,width,height,category
0,19,2011-05-13 23:43:18,5998cfa4-23d2-11e8-a6a3-ec086b02610b.jpg,1,5998cfa4-23d2-11e8-a6a3-ec086b02610b,33,Justin Brown,6f084ccc-5567-11e8-bc84-dca9047ef277,3,1024,747,opossum
1,19,2012-03-17 03:48:44,588a679f-23d2-11e8-a6a3-ec086b02610b.jpg,2,588a679f-23d2-11e8-a6a3-ec086b02610b,115,Justin Brown,6f12067d-5567-11e8-b3c0-dca9047ef277,3,1024,747,opossum
2,0,2014-05-11 11:56:46,59279ce3-23d2-11e8-a6a3-ec086b02610b.jpg,1,59279ce3-23d2-11e8-a6a3-ec086b02610b,96,Erin Boydston,6faa92d1-5567-11e8-b1ae-dca9047ef277,1,1024,747,empty
3,0,2013-10-06 02:00:00,5a2af4ab-23d2-11e8-a6a3-ec086b02610b.jpg,1,5a2af4ab-23d2-11e8-a6a3-ec086b02610b,57,Erin Boydston,6f7d4702-5567-11e8-9e03-dca9047ef277,1,1024,747,empty
4,0,2011-07-12 13:11:16,599fbd89-23d2-11e8-a6a3-ec086b02610b.jpg,3,599fbd89-23d2-11e8-a6a3-ec086b02610b,46,Justin Brown,6f1728a1-5567-11e8-9be7-dca9047ef277,3,1024,747,empty


In [5]:
# Arrange the dataset in pytorch ImageFolder way
class IWildCamDataset(Dataset):
    
    def __init__(self, df, root_dir, transform=None):
        """
        Args:
            df (pandas.DataFrame): Contains basic information.
            root_dir (string): The path where image data is saved.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.df = df
        self.root_dir = root_dir
        self.transform = transform
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        img_path = os.path.join(self.root_dir,
                                self.df.iloc[idx].file_name)
        with open(img_path, 'rb') as f:
            image = Image.open(f)
            image = image.convert('RGB')
        
        if('category_id' in self.df.iloc[idx]):
            category = self.df.iloc[idx].category_id
        else:
            # In test set, there is no given category. Here we will not return the category,
            # return the img_id instead. (Because we need to keep track of the img id during
            # testing)
            category = self.df.iloc[idx].id
        
        if('file_name' in self.df.iloc[idx]):
            filename = self.df.iloc[idx].file_name

        # Transform
        if self.transform:
            image = self.transform(image)

        return image, category, filename

In [6]:
# Use a subset of train dataset: should be comment after tuning the model
train_val_df = train_val_df.sample(20000, random_state=199)

# Split into train_df and val_df
train_df = train_val_df.sample(frac=0.8, random_state=201)
val_df = train_val_df.drop(train_df.index)

In [7]:
train_df.to_pickle('train_df.pkl')
val_df.to_pickle('val_df.pkl')

In [8]:
# Data transforms
data_transforms = transforms.Compose([
    transforms.Resize(128),  # 1. Resize smallest side to 128.
     transforms.CenterCrop(128), # 2. Crop the center 128x128 pixels.
     transforms.ToTensor(), # 3. Convert to pytorch tensor.
     transforms.Normalize(mean = [0.485, 0.456, 0.406],  # normalize.
                          std = [0.229, 0.224, 0.225])
])

# Get dataset
train_set = IWildCamDataset(train_df, train_data_dir, transform=data_transforms)
val_set   = IWildCamDataset(val_df, train_data_dir, transform=data_transforms)
test_set  = IWildCamDataset(test_df, test_data_dir, transform=data_transforms)

### Image pre-processing

In [9]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
!pip install opencv-contrib-python
import cv2
print(os.listdir("../input"))
import matplotlib.pyplot as plt

import scipy.misc
import zipfile

%matplotlib inline 

Collecting opencv-contrib-python
[?25l  Downloading https://files.pythonhosted.org/packages/af/0a/50a827d13a75754a8500fa854517f09886027005b09ac5210dca9f0aa101/opencv_contrib_python-4.1.0.25-cp36-cp36m-manylinux1_x86_64.whl (32.6MB)
[K    100% |████████████████████████████████| 32.6MB 1.2MB/s 
Installing collected packages: opencv-contrib-python
Successfully installed opencv-contrib-python-4.1.0.25
[33mYou are using pip version 19.0.3, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
['pytorch-pretrained-image-models', 'iwildcam-2019-fgvc6']


### CLAHE

In [10]:
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(16, 16))

### Automatic White Balance

In [11]:
wb = cv2.xphoto.createSimpleWB()
wb.setP(0.4)

In [12]:
!ls /kaggle/working

__notebook__.ipynb  __output__.json  train_df.pkl  val_df.pkl


### pre-processing for train_set

In [13]:
startdir = "p_train_set"  #要压缩的文件夹路径
file_news = startdir +'.zip' # 压缩后文件夹的名字
azip = zipfile.ZipFile(file_news, 'w')
root_dir = '../input/iwildcam-2019-fgvc6/train_images/'
desired_size = 256
#fig=plt.figure(figsize=(32, 128))
#train_list = []

for idx in range(0, len(train_set)):
    train_img = root_dir + train_set[idx][2]
    temp_img = cv2.imread(train_img, cv2.IMREAD_COLOR)        
    
    img_wb = wb.balanceWhite(temp_img)

    img_lab = cv2.cvtColor(img_wb, cv2.COLOR_BGR2Lab)

    l, a, b = cv2.split(img_lab)
    res_l = clahe.apply(l)
    res = cv2.merge((res_l, a, b))

    res = cv2.cvtColor(res, cv2.COLOR_Lab2BGR)
    resized = cv2.resize(res, (desired_size,)*2).astype('uint8')
    
    imgname = train_set[idx][2]
    scipy.misc.imsave(imgname, resized)
    azip.write(imgname, compress_type=zipfile.ZIP_LZMA)
    #train_list.append(imgname)
    #for filename in os.listdir('/kaggle/working'):
        #if filename.endswith('.jpg'):
    os.remove(imgname)

#p_train_set = zip(train_list)
azip.close()


`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)


### pre-processing for val_set

In [14]:
startdir = "p_val_set"  #要压缩的文件夹路径
file_news = startdir +'.zip' # 压缩后文件夹的名字
azip = zipfile.ZipFile(file_news, 'w')
root_dir = '../input/iwildcam-2019-fgvc6/train_images/'
fig=plt.figure(figsize=(32, 128))
#train_list = []

for idx in range(0, len(val_set)):
    val_img = root_dir + val_set[idx][2]
    temp_img = cv2.imread(val_img, cv2.IMREAD_COLOR)        
    
    img_wb = wb.balanceWhite(temp_img)

    img_lab = cv2.cvtColor(img_wb, cv2.COLOR_BGR2Lab)

    l, a, b = cv2.split(img_lab)
    res_l = clahe.apply(l)
    res = cv2.merge((res_l, a, b))

    res = cv2.cvtColor(res, cv2.COLOR_Lab2BGR)
    resized = cv2.resize(res, (desired_size,)*2).astype('uint8')
    
    imgname = val_set[idx][2]
    scipy.misc.imsave(imgname, resized)
    azip.write(imgname, compress_type=zipfile.ZIP_LZMA)
    #train_list.append(imgname)
    os.remove(imgname)

#p_train_set = zip(train_list)
azip.close()
#for filename in os.listdir('/kaggle/working'):
 #   if filename.endswith('.jpg'):
        

`imsave` is deprecated in SciPy 1.0.0, and will be removed in 1.2.0.
Use ``imageio.imwrite`` instead.


<Figure size 2304x9216 with 0 Axes>

### pre-processing for test set

### 2. Get a subset of train_val_df and split into train_df & val_df
When tuning the model, we will not use the whole train dataset because the dataset is too large. After getting the best model structure, we should comment the code in this section since we should use the complete train dataset.

### 3. Explore the train set: train_df and val_df
We need to make sure that all the 14 classes is contained in train_df and val_df.

### 4. Get Dataset in pytorch
We will use pd.DataFrame (train_df, val_df, test_df) to build torch.utils.data.Dataset (train_set, val_set, test_set).

## 5. Visualize a few images

## Transfer Learning

### 1. Load a pretrained model and reset final fully connected layer
Here we used the DenseNet 201. The pretrained model weights are loaded from Kaggle dataset.

### 2. Train the model we defined

## Model Evaluation

### 1. Get the result in validation set
Confusion matrix.

### 2. Show misclassified images
We want to explore which type of images are mostly being misclassified.

## Submission: using trained model to get predicted classes