# Lesson 2 : State Farm

## Set up the directories for the data

In [1]:
# Show the kernel's current working directory; the data paths below are built from it
%pwd

u'/home/ubuntu/nbs'

In [1]:
import os

In [16]:
# Base locations used by the directory-setup cells below.
# LESSON_HOME_DIR is the directory the kernel is currently running in;
# DATA_HOME_DIR keeps its trailing slash because later cells append to it directly.
LESSON_HOME_DIR = os.getcwd()
current_dir = LESSON_HOME_DIR
DATA_HOME_DIR = ''.join([current_dir, '/data/statefarm/'])

In [20]:
# Set up the folder for the validation data.
# -p creates missing parent folders and does not fail if the folder already exists.
%mkdir -p $DATA_HOME_DIR'valid'

In [22]:
#create the classes for the validation data
# One folder is created under valid/ for every class folder found under train/.
for root, dirs, files in os.walk(DATA_HOME_DIR+'train/'):
    for name in dirs:
        valid_class_dir = os.path.join(DATA_HOME_DIR+'valid', name)
        # os.mkdir raises OSError when the folder already exists, which made
        # this cell fail on a second run; skip folders that are already there
        if not os.path.isdir(valid_class_dir):
            os.mkdir(valid_class_dir)
    # the class folders are the immediate children of train/, so only the
    # first (top-level) entry yielded by os.walk is wanted
    break

In [2]:
import numpy as np
from glob import glob

In [38]:
#move 20% of the training data to the respective validation folders
VALID_FRACTION = 0.2
# fixed seed so the same files are selected on every fresh run
split_rng = np.random.RandomState(42)
for root, dirs, files in os.walk(DATA_HOME_DIR+'train/'):
    # sorted so the classes are visited (and random numbers consumed) in a stable order
    for name in sorted(dirs):
        valid_class_dir = DATA_HOME_DIR + 'valid/' + name + '/'
        # this class was already split on an earlier run; moving again would
        # shift another 20% of whatever is left in train/
        if glob(valid_class_dir + '*.jpg'):
            continue
        # glob order is arbitrary, so sort before shuffling to let the seed
        # fully determine which files are picked
        g = sorted(glob(DATA_HOME_DIR + 'train/' + name + '/*.jpg'))
        shuffle = split_rng.permutation(g)
        for i in range(int(VALID_FRACTION*shuffle.size)):
            os.rename(shuffle[i], valid_class_dir + os.path.basename(shuffle[i]))
    # only the top level of train/ holds the class folders
    break

In [40]:
# Set up the test data: create a single 'none' sub-folder inside test/.
# NOTE(review): presumably needed because the batch loader expects one
# sub-folder per class even for unlabeled data — confirm against utils.get_batches
%mkdir -p $DATA_HOME_DIR'test/none/'

In [44]:
# Move every .jpg sitting directly in test/ into the test/none/ folder created above.
# %cd changes the kernel's working directory, so it is switched back in the next cell.
%cd $DATA_HOME_DIR'test/'
%mv *.jpg  none/

/home/ubuntu/nbs/data/statefarm/test


In [46]:
# Return to the lesson directory so the relative 'data/statefarm/' path used below resolves
%cd $LESSON_HOME_DIR

/home/ubuntu/nbs


## Start creating the linear model

In [3]:
%matplotlib inline
from __future__ import division,print_function
import json
import scipy
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt
import utils; reload(utils)
from utils import plots, get_batches, plot_confusion_matrix, get_data

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
Using Theano backend.


In [4]:
from numpy.random import random, permutation
from scipy import misc, ndimage
from scipy.ndimage.interpolation import zoom

import keras
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.models import Sequential
from keras.layers import Input
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, RMSprop
from keras.preprocessing import image

In [5]:
# Relative data root, plus the sub-folder that holds the saved arrays
path = 'data/statefarm/'
model_path = path + 'model/'
# create the arrays folder the first time this cell runs
if not os.path.exists(model_path):
    os.mkdir(model_path)

In [10]:
# Helpers to persist arrays on disk with bcolz and read them back
import bcolz


def save_array(path, arr):
    """Write ``arr`` to the bcolz directory ``path``.

    The carray is opened with mode='w' and flushed before returning.
    """
    carr = bcolz.carray(arr, rootdir=path, mode='w')
    carr.flush()


def load_array(path):
    """Open the bcolz directory ``path`` and return its full contents (``[:]`` slice)."""
    stored = bcolz.open(path)
    return stored[:]

In [7]:
# Mini-batch size shared by the batch generators, model.predict and lm.fit below
batch_size=64

In [54]:
# Batch generators over the training and validation folders
train_source = path+'train/'
valid_source = path+'valid'
train_batches = get_batches(train_source, batch_size=batch_size)
val_batches = get_batches(valid_source, batch_size=batch_size)

Found 17943 images belonging to 10 classes.
Found 4481 images belonging to 10 classes.


In [8]:
#method to one hot encode the classes
def onehot(x):
    """Return a dense one-hot encoding of the values in ``x`` as a numpy array.

    ``x`` is reshaped into a single column and a fresh OneHotEncoder is fitted
    on it, so the columns of the result are derived from the values present in
    ``x`` itself.
    """
    column = x.reshape(-1, 1)
    encoded = OneHotEncoder().fit_transform(column)
    return np.array(encoded.todense())

In [63]:
# Read the class ids off both batch generators, then one-hot encode them into labels
train_classes, val_classes = train_batches.classes, val_batches.classes
train_labels, val_labels = onehot(train_classes), onehot(val_classes)

In [64]:
# Persist both label arrays under model_path
for fname, arr in (('train_labels.bc', train_labels),
                   ('val_labels.bc', val_labels)):
    save_array(model_path+fname, arr)

In [65]:
# Load the training and validation folders through get_data
train_images_dir = path+'train/'
valid_images_dir = path+'valid/'
train_data = get_data(train_images_dir)
val_data = get_data(valid_images_dir)

Found 17943 images belonging to 10 classes.
Found 4481 images belonging to 10 classes.


In [66]:
# Persist both image arrays under model_path so later sessions can load them quickly
for fname, arr in (('train_data.bc', train_data),
                   ('val_data.bc', val_data)):
    save_array(model_path+fname, arr)

In [11]:
# Read the saved label arrays back from model_path
train_labels_file = model_path+'train_labels.bc'
val_labels_file = model_path+'val_labels.bc'
train_labels = load_array(train_labels_file)
val_labels = load_array(val_labels_file)

In [13]:
# Read the saved image arrays back from model_path
train_data_file = model_path+'train_data.bc'
val_data_file = model_path+'val_data.bc'
train_data = load_array(train_data_file)
val_data = load_array(val_data_file)

In [14]:
# Build the Vgg16 wrapper from the vgg16 module and keep a handle on its
# underlying model object, which is used below to turn images into features.
# NOTE(review): the linear model further down declares input_shape=(1000,), so
# this model presumably emits 1000 values per image — confirm against vgg16.py
from vgg16 import Vgg16
vgg = Vgg16()
model = vgg.model

In [16]:
#getting the features
# Both feature sets must be computed here: train_features is saved by the next
# cell and passed to lm.fit further down, so leaving its line commented out
# makes a fresh Restart & Run All stop with a NameError.
train_features = model.predict(train_data, batch_size=batch_size)
val_features = model.predict(val_data, batch_size=batch_size)

In [17]:
# Persist both feature arrays under model_path
for fname, arr in (('train_features.bc', train_features),
                   ('val_features.bc', val_features)):
    save_array(model_path+fname, arr)

In [18]:
# Linear model: a single softmax Dense layer taking 1000 inputs and producing 10 outputs
softmax_layer = Dense(10, activation='softmax', input_shape=(1000,))
lm = Sequential([softmax_layer])
lm.compile(
    optimizer=RMSprop(lr=0.1),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

In [21]:
# Train the linear model on the saved features for 20 epochs,
# passing the validation features/labels as validation_data
held_out = (val_features, val_labels)
lm.fit(
    train_features,
    train_labels,
    nb_epoch=20,
    batch_size=batch_size,
    validation_data=held_out
)

Train on 17943 samples, validate on 4481 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fdadce26e90>

In [22]:
# Print the layer table for the linear model: one Dense layer mapping the
# 1000 input features to 10 classes, i.e. 1000*10 weights + 10 biases = 10010 parameters
lm.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_4 (Dense)                  (None, 10)            10010       dense_input_1[0][0]              
Total params: 10010
____________________________________________________________________________________________________


## Visualisation of our model's prediction chops

### Things we will be visualising
 - Random correct predictions
 - Random incorrect predictions
 - Most correct predictions belonging to class c0
 - Most correct predictions belonging to class c1