In [None]:
import os
import random
import numbers
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# train test split from sklearn
from sklearn.model_selection import train_test_split

# Import Torch 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, models
from torch.autograd import Variable
from torch import nn, optim
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset, Subset

# graphing
from matplotlib import pyplot as plt

# Image processing
from PIL import Image, ImageOps, ImageEnhance

# progress bar
from tqdm import tqdm

In [None]:
# extract files
import zipfile

# Each entry pairs the folder that should exist after extraction with the
# archive that provides it.
archives = (
    ('train', "../input/dogs-vs-cats/train.zip"),   # train data
    ('test1', "../input/dogs-vs-cats/test1.zip"),   # test data
)

for folder, archive_path in archives:
    if folder in os.listdir('.'):
        # Already present in the working directory: skip so the cell can be re-run cheaply.
        continue
    with zipfile.ZipFile(archive_path, "r") as z:
        z.extractall(".")

In [None]:
# set up transforms

# Normalisation statistics shared by the training and evaluation pipelines.
# Written out per channel (RGB) so they do not rely on broadcasting a single value.
NORM_MEAN = (0.5, 0.5, 0.5)
NORM_STD = (0.5, 0.5, 0.5)

# Training pipeline: light augmentation, then shrink to the 32x32 input the CNN expects.
train_transforms = transforms.Compose([
    transforms.Resize(256),
    # NOTE(review): ColorJitter() with default arguments applies no jitter at all;
    # pass brightness/contrast/saturation/hue if colour augmentation is intended.
    transforms.ColorJitter(),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)])

# Evaluation pipeline: no augmentation, but the SAME normalisation as training.
# Without it the model would be fed inputs in [0, 1] at test time after being
# trained on inputs in [-1, 1].
test_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)
])

In [None]:
class CatsDogsDataSet(Dataset):
    """Image-folder dataset that derives a binary label from each file name.

    Every entry returned by ``os.listdir(train_dir)`` becomes one sample, in the
    order ``os.listdir`` yields it (that order is not sorted). A sample is
    labelled 1 when the part of its file name before the first '.' contains
    'cat' and 0 otherwise, so files without a class prefix all get 0.

    NOTE(review): this encodes cat=1 / dog=0 -- confirm that it matches the
    label convention expected by whatever consumes the predictions.
    """

    def __init__(self, train_dir, transform = train_transforms):
        """Index the files in ``train_dir``.

        Args:
            train_dir: directory containing the image files.
            transform: callable applied to each PIL image in ``__getitem__``;
                pass ``None`` to get a raw float32 numpy array instead.
        """
        self.train_dir = train_dir
        self.transform = transform
        self.images = []  # file names, parallel to self.labels
        self.labels = []  # 1 = 'cat' in the file-name prefix, 0 = anything else
        for fname in os.listdir(train_dir):
            self.images.append(fname)
            self.labels.append(1 if 'cat' in fname.split('.')[0] else 0)

    def __len__(self):
        """Return the number of indexed files."""
        return len(self.images)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        The image is always converted to RGB so that grayscale, palette, CMYK
        or RGBA files still produce three channels for the network.
        """
        path = os.path.join(self.train_dir, self.images[idx])
        # convert() returns a fully loaded copy, so the file can be closed right away.
        with Image.open(path) as handle:
            img = handle.convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        else:
            img = np.array(img).astype('float32')
        return img, self.labels[idx]

    def split(self, start, end):
        """Return the labels at positions ``start`` .. ``end`` (both inclusive)."""
        return self.labels[start:end+1]

    def show_image(self, index):
        """Draw the untransformed image at ``index`` on the current matplotlib axes."""
        filename = os.path.join(self.train_dir, self.images[index])
        with Image.open(filename) as handle:
            img_array = np.array(handle)
        plt.imshow(img_array)


In [None]:
# view sample data and check the label balance of the tail of the training folder
# The labelled images live in ./train; files in ./test1 carry no class prefix, so
# every label there is 0 and the balance check below would be meaningless.
train_dir = './train'
sample_dataset = CatsDogsDataSet(train_dir=train_dir, transform = train_transforms)

# randrange excludes the upper bound; randint(0, len) could return len and raise IndexError.
rand_index = random.randrange(len(sample_dataset))
sample_image, sample_label = sample_dataset[rand_index]

print("Shape: ", sample_image.shape)

# Labels at positions 20001..25000 (inclusive); the sum is the number of samples labelled 1 (cat).
my_split = sample_dataset.split(20001,25000)

print("Split: ", sum(my_split))

print(sample_label)
# show_image draws on the current axes and returns nothing, so there is no value to keep.
sample_dataset.show_image(2)

In [None]:
batch_size = 64
n_train = 20000  # number of leading samples used for training; the remainder is held out

# build datasets from the extracted image folders
train_dataset = CatsDogsDataSet('train')
test_dataset = CatsDogsDataSet('test1', transform = test_transform)

# Contiguous, non-overlapping split. Validation starts exactly where training
# stops so no sample is skipped, and runs to the real end of the dataset
# instead of relying on a hard-coded size.
# NOTE(review): positions follow the os.listdir order stored by CatsDogsDataSet,
# which is not guaranteed to be shuffled -- check the class balance of each part.
n_total = len(train_dataset)
train_indexes = list(range(min(n_train, n_total)))
valid_indexes = list(range(len(train_indexes), n_total))

# NOTE(review): both subsets wrap train_dataset, so validation samples also go
# through the random training augmentations.
train_data = Subset(train_dataset, train_indexes)
valid_data  = Subset(train_dataset, valid_indexes)

# torch dataloaders (only the training loader needs shuffling; the test loader
# must keep dataset order so predictions can be matched back to files)
train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(dataset=valid_data, batch_size=batch_size, shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class CNN(nn.Module):
    """Four-convolution classifier that emits two raw class scores per image.

    The first fully connected layer expects 5880 features, which is what the
    convolution/pooling stack produces for a 3x32x32 input (120 channels x 7 x 7).
    """

    def __init__(self):
        super().__init__()
        # convolutional feature extractor
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=50, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=50, out_channels=150, kernel_size=3)
        self.conv3 = nn.Conv2d(in_channels=150, out_channels=300, kernel_size=3)
        self.conv4 = nn.Conv2d(in_channels=300, out_channels=120, kernel_size=3)
        # classifier head
        self.fc1 = nn.Linear(in_features=5880, out_features=1024)
        self.fc2 = nn.Linear(in_features=1024, out_features=512)
        self.fc3 = nn.Linear(in_features=512, out_features=2)

    def forward(self, x):
        """Map a batch of images to a ``(batch, 2)`` tensor of unnormalised scores."""
        # conv -> ReLU -> max-pool, three times; only the third pool uses stride 2
        feats = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=3, stride=1)
        feats = F.max_pool2d(F.relu(self.conv2(feats)), kernel_size=3, stride=1)
        feats = F.max_pool2d(F.relu(self.conv3(feats)), kernel_size=3, stride=2)
        # the last convolution is not followed by pooling
        feats = F.relu(self.conv4(feats))

        # keep the batch dimension, collapse everything else
        flat = feats.flatten(start_dim=1)

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        # no activation here: the loss function receives raw scores
        return self.fc3(hidden)


print(CNN())

In [None]:
# Instantiate the network together with its loss, optimiser and learning-rate schedule.
model = CNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
# The learning rate is multiplied by 0.1 when the scheduler's epoch counter reaches 3 and again at 7.
expr_lr_scheduler = lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=[3, 7], gamma=0.1)

In [None]:
# Per-epoch history, filled in by evaluate().
loss_list = []
accuracy_list = []
iteration_list = []

def train(epoch):
    """Run one optimisation pass over ``train_loader``, then advance the LR schedule.

    Uses the module-level ``model``, ``optimizer``, ``criterion``,
    ``expr_lr_scheduler`` and ``train_loader``.

    Args:
        epoch: index of the current epoch. Kept for the caller's convenience;
            the function itself does not read it.
    """
    # evaluate() leaves the model in eval mode, so switch back before training.
    model.train()

    for features, labels in tqdm(train_loader):
        # zero out gradients from previous iteration
        optimizer.zero_grad()

        # forward propagation
        output = model(features)

        # calculate loss
        loss = criterion(output, labels)

        # backprop
        loss.backward()

        # update params (gradient descent)
        optimizer.step()

    # Step the scheduler only after this epoch's optimizer updates; stepping it
    # first would shift the whole schedule one epoch early.
    expr_lr_scheduler.step()

In [None]:
def evaluate(data_loader, epoch=None):
    """Score the module-level ``model`` on ``data_loader`` and record the result.

    Prints a summary line and appends the average loss, the accuracy (percent)
    and the epoch number to ``loss_list``, ``accuracy_list`` and
    ``iteration_list``. All recorded values are plain Python numbers.

    Args:
        data_loader: iterable of ``(features, labels)`` batches exposing a
            ``dataset`` attribute whose length is the number of samples.
        epoch: epoch number to report. When omitted, the module-level variable
            ``epoch`` is used if it exists, which keeps ``evaluate(loader)``
            calls from inside an ``for epoch in ...`` loop working.

    Returns:
        Tuple ``(average_loss, accuracy_percent)``.
    """
    if epoch is None:
        epoch = globals().get('epoch')

    model.eval()
    total_loss = 0.0
    correct = 0

    # No gradients are needed anywhere in evaluation, so cover the whole loop.
    with torch.no_grad():
        for features, labels in tqdm(data_loader):
            output = model(features)
            # Sum (not mean) per batch so the final division by the dataset
            # size is exact even when the last batch is smaller.
            total_loss += F.cross_entropy(output, labels, reduction='sum').item()
            pred = output.argmax(dim=1)
            correct += (pred == labels).sum().item()

    n_samples = len(data_loader.dataset)
    loss = total_loss / n_samples
    accuracy = 100. * correct / n_samples
    print('Epoch: {}, Average loss: {:.4f}, Accuracy: {}/{} ({:.3f}%)'.format(
        epoch,
        loss, correct, n_samples,
        accuracy))
    loss_list.append(loss)
    accuracy_list.append(accuracy)
    iteration_list.append(epoch)
    return loss, accuracy

In [None]:
# Total number of passes over the training data.
n_epochs = 10

# Alternate one training pass with one validation pass per epoch.
# The loop variable must stay a module-level name called `epoch`: evaluate()
# is called without an epoch argument and looks that name up itself when it
# prints and records its results.
for epoch in range(n_epochs):
    train(epoch)
    evaluate(valid_loader)

In [None]:
# visualize loss and accuracy, one figure per metric
curves = (
    # (y-values, y-axis label, figure title, extra line styling)
    (loss_list, "Loss", "Loss vs Number of epochs", {}),
    (accuracy_list, "Accuracy", "Accuracy vs Number of epochs", {"color": "red"}),
)

for values, y_label, title, line_kwargs in curves:
    fig, ax = plt.subplots()
    ax.plot(iteration_list, values, **line_kwargs)
    ax.set_xlabel("Number of epochs")
    ax.set_ylabel(y_label)
    ax.set_title(title)
    plt.show()

In [None]:
# save model
# Passing the module object itself makes torch.save pickle the whole model, not
# just its weights. NOTE(review): loading such a file needs the CNN class to be
# importable under the same name; saving model.state_dict() is the more portable
# option if nothing downstream depends on the current file format -- confirm.
torch.save(model, 'cnn-model.pt')

In [None]:
# Predict a class index for every test image, in test_loader order.
model.eval()
fn_list = []
pred_list = []
with torch.no_grad():
    for batch, _ in tqdm(test_loader):
        # index of the larger of the two class scores for each image
        batch_pred = model(batch).argmax(dim=1)
        pred_list.extend(batch_pred.tolist())

##### 

In [None]:
# Predictions follow the order of test_dataset.images (the test loader does not
# shuffle), and that order comes from os.listdir, not from the numeric ids.
# Read each id from its file name instead of assuming the files arrive as 1..N.
# Assumes test files are named "<id>.<ext>" (e.g. "42.jpg"); int() raises
# ValueError on anything else rather than silently mislabelling rows.
ids = [int(os.path.splitext(fname)[0]) for fname in test_dataset.images]
if len(ids) != len(pred_list):
    raise ValueError(
        "expected one prediction per test image, got {} predictions for {} images".format(
            len(pred_list), len(ids)))

# NOTE(review): labels use the CatsDogsDataSet encoding (1 = cat, 0 = everything
# else) -- confirm this matches the encoding the submission format expects.
submission = pd.DataFrame({"id":ids, "label":pred_list}).sort_values("id")
submission.to_csv('submission.csv', index=False)