In [1]:
import os
import random
from os import walk

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import models, transforms
from tqdm import tqdm

In [2]:
# Hyperparameters and run-wide settings, kept in one place so they are easy to tune.
HP = dict(
    epochs=25,           # number of passes over the training split
    batch_size=32,       # samples per mini-batch
    learning_rate=1e-3,  # SGD step size
    momentum=0.9,        # SGD momentum
    test_size=0.05,      # fraction of rows held out from training
    seed=1,              # seed shared by the random number generators
)

In [3]:
# Seed every RNG the notebook draws from, so that "Restart Kernel -> Run All"
# reproduces the same data split and augmentations:
#   - random : torchvision's RandomChoice picks its transform with Python's `random`
#   - numpy  : scikit-learn falls back to numpy's global RNG when no random_state is given
#   - torch  : weight initialisation, dropout, DataLoader shuffling, torch-based transforms
random.seed(HP['seed'])
np.random.seed(HP['seed'])
torch.manual_seed(HP['seed'])

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    # Let cuDNN benchmark convolution algorithms and pick the fastest one.
    # NOTE(review): benchmark mode may select different kernels between runs,
    # so GPU results can differ slightly even with fixed seeds.
    torch.backends.cudnn.benchmark = True
print(f'using {device} device')

using cpu device


In [11]:
# Input locations, relative to the notebook's working directory.
dataset_file = 'data/train.csv'
submission_sample = 'data/sample_submission.csv'
dataset_dir = 'data/train_images/'
submission_dir = 'data/test_images/'

# Load the label table and shuffle its rows with the configured seed in one step.
df = shuffle(pd.read_csv(dataset_file), random_state=HP['seed'])

print(f'count: {len(df)} \n')
df.head()

count: 10407 



Unnamed: 0,image_id,label,variety,age
8845,107872.jpg,normal,ADT45,70
8148,105537.jpg,normal,ADT45,60
5701,105132.jpg,downy_mildew,AtchayaPonni,45
9601,105680.jpg,tungro,ADT45,60
3649,102240.jpg,brown_spot,ADT45,72


In [12]:
# Swap each variety value for an integer code (the first element of the
# (codes, uniques) pair returned by factorize), then summarise the numeric columns.
df['variety'] = df['variety'].factorize()[0]
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
variety,10407.0,1.440761,2.421698,0.0,0.0,0.0,2.0,9.0
age,10407.0,64.043624,8.95883,45.0,60.0,67.0,70.0,82.0


In [13]:
# Distinct labels as returned by unique(); a label's position in this array
# is used as its class index.
idx_to_label = df['label'].unique()
# Inverse lookup: label string -> class index.
label_to_idx = dict(zip(idx_to_label, range(len(idx_to_label))))
print(label_to_idx)

{'normal': 0, 'downy_mildew': 1, 'tungro': 2, 'brown_spot': 3, 'blast': 4, 'bacterial_leaf_streak': 5, 'bacterial_leaf_blight': 6, 'hispa': 7, 'dead_heart': 8, 'bacterial_panicle_blight': 9}


In [14]:
# Hold out HP['test_size'] of the rows for evaluation.
#   - random_state makes the split identical on every run.
#   - stratify keeps each label's share the same in both parts, which matters
#     because the hold-out is small and the labels are not evenly represented.
train_df, test_df = train_test_split(
    df,
    test_size=HP['test_size'],
    random_state=HP['seed'],
    stratify=df['label'],
)
print(f'train len: {len(train_df)}, test len: {len(test_df)}')

train len: 9886, test len: 521


In [15]:
# Number of training rows per label, most frequent label first.
train_df.loc[:, 'label'].value_counts()

normal                      1658
blast                       1650
hispa                       1514
dead_heart                  1371
tungro                      1030
brown_spot                   916
downy_mildew                 600
bacterial_leaf_blight        455
bacterial_leaf_streak        364
bacterial_panicle_blight     328
Name: label, dtype: int64

In [16]:
# Per-channel normalisation statistics and the spatial size fed to the network.
# NOTE(review): these look like the usual ImageNet mean/std - confirm they match
# the pretrained weights in use.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]
INPUT_SIZE = (224, 224)


def _to_model_input():
    """Return fresh resize -> tensor -> normalise steps shared by both pipelines."""
    return [
        transforms.Resize(INPUT_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(NORM_MEAN, NORM_STD),
    ]


# Exactly one of these augmentations is applied to each training image.
_one_random_augmentation = transforms.RandomChoice([
    transforms.Pad(padding=10),
    transforms.CenterCrop(480),
    transforms.RandomRotation(20),
    transforms.CenterCrop((576,432)),
    transforms.ColorJitter(
        brightness=0.1,
        contrast=0.1,
        saturation=0.1,
        hue=0.1
    ),
])

# Training pipeline: random flips, one extra augmentation, then model preprocessing.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    _one_random_augmentation,
    *_to_model_input(),
])

# Evaluation pipeline: deterministic preprocessing only.
test_transform = transforms.Compose(_to_model_input())

class PaddyDataset(Dataset):
    """Image-classification dataset backed by a label table.

    Each sample is read from ``<dataset_dir>/<label>/<image_id>``, passed through
    ``transforms`` and returned together with its integer class index.

    Args:
        dataset_dir: Root directory that holds one sub-directory per label.
        df: DataFrame with at least ``image_id`` and ``label`` columns. The frame
            is copied, so the caller's object is never modified.
        label_to_idx: Mapping from label string to integer class index.
        transforms: Callable applied to the loaded PIL image.
    """

    def __init__(self, dataset_dir, df, label_to_idx, transforms):
        self.label_to_idx = label_to_idx
        self.transforms = transforms

        # Work on a copy: adding the 'path' column to the caller's frame mutates
        # shared state and raises SettingWithCopyWarning when `df` is a slice.
        frame = df.copy()
        # os.path.join avoids doubled separators when dataset_dir already ends in one.
        frame['path'] = [
            os.path.join(dataset_dir, label, image_id)
            for label, image_id in zip(frame['label'], frame['image_id'])
        ]

        # Look columns up by name so that extra or reordered columns cannot
        # silently shift which value is read as the label or the path.
        columns = list(frame.columns)
        self._label_col = columns.index('label')
        self._path_col = columns.index('path')

        # Plain Python rows are cheaper to index per item than DataFrame rows.
        self.df = frame.values.tolist()

    def __len__(self):
        """Return the number of samples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(transformed_image, class_index)`` for the sample at ``idx``.

        Raises:
            KeyError: If the row's label is missing from ``label_to_idx``.
        """
        row = self.df[idx]
        # Close the file promptly and force three channels so that grayscale,
        # palette or RGBA files cannot break a 3-channel transform pipeline.
        with Image.open(row[self._path_col]) as handle:
            image = handle.convert('RGB')
        image = self.transforms(image)
        return image, self.label_to_idx[row[self._label_col]]


train_dataset = PaddyDataset(dataset_dir, train_df, label_to_idx, train_transform)
test_dataset = PaddyDataset(dataset_dir, test_df, label_to_idx, test_transform)

# Page-locked host memory only speeds up host-to-GPU copies; requesting it
# without a GPU just produces a warning, so enable it only when CUDA is present.
use_pinned_memory = torch.cuda.is_available()

train_dataloader = DataLoader(
    train_dataset,
    batch_size=HP['batch_size'],
    shuffle=True,
    pin_memory=use_pinned_memory,
)
# Evaluation does not benefit from shuffling; a fixed order keeps passes comparable.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=HP['batch_size'],
    shuffle=False,
    pin_memory=use_pinned_memory,
)

In [17]:
# Pretrained ResNet-34 whose final fully connected layer is replaced by
# dropout plus a linear layer with one output per label.
num_classes = len(label_to_idx)
model = models.resnet34(pretrained=True)
head_in_features = model.fc.in_features
model.fc = nn.Sequential(
    nn.Dropout(0.1),
    nn.Linear(head_in_features, num_classes),
)
model = model.to(device)

# Cross-entropy objective optimised with SGD + momentum.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=HP['learning_rate'],
    momentum=HP['momentum'],
)

Downloading: "https://download.pytorch.org/models/resnet34-b627a593.pth" to /Users/jh931201/.cache/torch/hub/checkpoints/resnet34-b627a593.pth


  0%|          | 0.00/83.3M [00:00<?, ?B/s]

In [18]:
def train(model, criterion, optimizer, train_dataloader, test_dataloader):
    """Run one training epoch followed by one evaluation pass.

    Relies on the module-level ``device`` and ``tqdm`` names.

    Args:
        model: Network to optimise; switched to train mode, then to eval mode.
        criterion: Loss callable ``criterion(output, target)``. It is assumed to
            return the mean loss over the batch (the default reduction of
            ``CrossEntropyLoss``); per-batch values are re-weighted by batch size.
        optimizer: Optimiser that updates ``model``'s parameters.
        train_dataloader: Iterable of ``(images, class_indices)`` training batches.
        test_dataloader: Iterable of ``(images, class_indices)`` evaluation batches.

    Returns:
        Tuple ``(train_loss, test_loss)`` with the mean per-sample loss of each
        pass (``0.0`` for a pass that saw no samples).
    """
    train_loss_sum, train_seen = 0.0, 0
    test_loss_sum, test_seen = 0.0, 0

    model.train()
    with tqdm(train_dataloader, unit='batch', leave=False) as pbar:
        pbar.set_description('training')
        for images, idxs in pbar:
            images = images.to(device, non_blocking=True)
            idxs = idxs.to(device, non_blocking=True)

            # Clear gradients *before* the backward pass so that leftovers from an
            # interrupted earlier call can never leak into this step.
            optimizer.zero_grad(set_to_none=True)
            output = model(images)
            loss = criterion(output, idxs)
            loss.backward()
            optimizer.step()

            # Weight the batch-mean loss by the batch size so the final figure is
            # a true per-sample mean even when the last batch is smaller.
            batch_size = idxs.size(0)
            train_loss_sum += loss.item() * batch_size
            train_seen += batch_size

    model.eval()
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        with tqdm(test_dataloader, unit='batch', leave=False) as pbar:
            pbar.set_description('testing')
            for images, idxs in pbar:
                images = images.to(device, non_blocking=True)
                idxs = idxs.to(device, non_blocking=True)

                output = model(images)
                loss = criterion(output, idxs)

                batch_size = idxs.size(0)
                test_loss_sum += loss.item() * batch_size
                test_seen += batch_size

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    train_loss = train_loss_sum / max(train_seen, 1)
    test_loss = test_loss_sum / max(test_seen, 1)
    print(f'Train loss: {train_loss:.4f} Test loss: {test_loss:.4f} ')
    return train_loss, test_loss

In [19]:
%%time
# One call to train() per configured epoch; the header shows 1-based progress.
num_epochs = HP['epochs']
for epoch in range(1, num_epochs + 1):
    print(f"Epoch {epoch}/{num_epochs}")
    train(model, criterion, optimizer, train_dataloader, test_dataloader)

Epoch 1/25


                                                              

KeyboardInterrupt: 

In [None]:
%%time
# Predict a label for every file found under submission_dir.
model.eval()
image_ids, labels = [], []
# Inference only: disable autograd so no graph is built for each image.
with torch.no_grad():
    for dirpath, _dirnames, filenames in walk(submission_dir):
        # Sorted so the row order does not depend on filesystem listing order.
        for filename in sorted(filenames):
            # os.path.join also works for nested directories, where dirpath has
            # no trailing separator and plain concatenation would build a bad path.
            with Image.open(os.path.join(dirpath, filename)) as handle:
                image = handle.convert('RGB')  # guarantee the 3 channels Normalize expects
            batch = test_transform(image).unsqueeze(0).to(device)
            predicted_idx = model(batch).argmax().item()
            image_ids.append(filename)
            labels.append(idx_to_label[predicted_idx])

In [None]:
# Collect the predictions into a two-column table, one row per predicted file.
submission = pd.DataFrame(dict(image_id=image_ids, label=labels))
# submission['label'].value_counts()

In [None]:
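# To write the submission file, first define `submission_output` (the destination
# CSV path; it is not defined anywhere in this notebook), then uncomment:
# submission.to_csv(submission_output, index=False, header=True)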