In [1]:
from sklearn.cross_validation import cross_val_score
from sklearn.decomposition import RandomizedPCA
import numpy as np
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
from tqdm import tqdm



In [2]:
from IPython.display import HTML

In [3]:
import os, sys
import random

In [4]:
#setup a standard image size; this will distort some images but will get everything into the same shape
STANDARD_SIZE = (300, 300)

In [5]:
def img_to_matrix(filename, verbose=False):
    """
    takes a filename and turns it into a numpy array of RGB pixels
    """
    img = Image.open(filename)
    if verbose==True:
        print "changing size from %s to %s" % (str(img.size), str(STANDARD_SIZE))
    img = img.resize(STANDARD_SIZE)
    img = list(img.getdata())
    img = map(list, img)
    img = np.array(img)
    return img

In [6]:
def flatten_image(img):
    """
    takes in an (m, n) numpy array and flattens it 
    into an array of shape (1, m * n)
    """
    s = img.shape[0] * img.shape[1]
    img_wide = img.reshape(1, s)
    return img_wide[0]

In [15]:
img_dir = "/Users/minhgeneralassembly/Downloads/kaggle_catsanddogs/pca_train/"
images = [img_dir + f for f in os.listdir(img_dir) if f.endswith('.jpg')]
random.shuffle(images)
### Right now this says the target is true if the file has 'kittens' in the name.
### This means we are building a binary classifier of kitten or not.  Change this is so you can pass an argument 
### to build a classifier for any set of images.  Use sys.argv ('import sys' first)

# added and renamed classes -mt
# labels = np.array(["kitten" if "kitten" in f.split('/')[-1] else "bicycle" if "bicycle" in f.split('/')[-1] else "pumpkin" for f in images])

labels = np.array([1 if "cat" in f.split('/')[-1] else 0 for f in images])

In [16]:
data = []

In [17]:
### Apply transformation for each matrix
for image in tqdm(images):
  img = img_to_matrix(image)
  img = flatten_image(img)
  data.append(img)

100%|██████████| 500/500 [01:22<00:00,  6.08it/s]


In [18]:
data = np.array(data)

In [19]:
### This creates a simpler representation of the images other than the raw pixels
### Change the number of components to see how this effects classification accuracy
pca = RandomizedPCA(n_components=50)



In [20]:
### Transform your dataset `data` into a feature setX 
X = pca.fit_transform(data)

In [21]:
### Setup a classifier (or multiple, play around with different models) 
### How much data do you have?  Do you think the relationships are linear?
model = RandomForestClassifier(n_estimators=10)

In [22]:
## Do some cross validation
print cross_val_score(model, X, labels, scoring='roc_auc')

[ 0.57440476  0.56350704  0.56372478]


In [23]:
model.fit(X, labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### test images

In [24]:
test_img_dir = "small_testing_dir/"
test_images = [test_img_dir + f for f in os.listdir(test_img_dir) if f.endswith('.jpg')]

In [25]:
test_data = []

for test_image in tqdm(test_images):
  img = img_to_matrix(test_image)
  img = flatten_image(img)
  test_data.append(img)

100%|██████████| 100/100 [00:16<00:00,  6.02it/s]


In [26]:
test_data = np.array(test_data)

In [27]:
test_X = pca.fit_transform(test_data)

In [28]:
df = pd.DataFrame(test_images, columns=['image_name'])

In [29]:
df['image'] = df['image_name'].map(lambda x: '<img src="{}" style="max-height: 400px; max-width: 400px;" ></img>'.format(x) )

In [30]:
df['pred'] = model.predict(test_X)

In [31]:
pd.set_option('max_colwidth', 300)
HTML(df.to_html(escape=False))

Unnamed: 0,image_name,image,pred
0,small_testing_dir/63.jpg,,1
1,small_testing_dir/77.jpg,,1
2,small_testing_dir/88.jpg,,0
3,small_testing_dir/89.jpg,,1
4,small_testing_dir/76.jpg,,1
5,small_testing_dir/62.jpg,,1
6,small_testing_dir/74.jpg,,0
7,small_testing_dir/60.jpg,,0
8,small_testing_dir/48.jpg,,0
9,small_testing_dir/49.jpg,,0


In [None]:
break

In [None]:
# following along with https://elitedatascience.com/keras-tutorial-deep-learning-in-python (MNIST dataset) -mt
# attempting to fit model for cats and dogs kaggle dataset: https://www.kaggle.com/c/dogs-vs-cats -mt

### Step 3: Import libraries and modules

In [None]:
# importing numpy and setting a seed for reproductibility
import numpy as np
np.random.seed(123) # kinda irrelevant. not really using randomness. -mt

import os

In [None]:
# importing standard keras modules/layers/utilities
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils

from keras.preprocessing.image import ImageDataGenerator
from keras import backend as K

In [None]:
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline

#### Second attempt using: https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html -mt

Directory structure looks like:
```
small_training_dir/
    train/
        dogs/
            dog001.jpg
            dog002.jpg
            ...
        cats/
            cat001.jpg
            cat002.jpg
            ...
    validation/
        dogs/
            dog001.jpg
            dog002.jpg
            ...
        cats/
            cat001.jpg
            cat002.jpg
            ...
```            

### Step 4: Point to images for dataset creation (later)

In [None]:
# dimensions of our images.
img_width, img_height = 150, 150

train_data_dir = '/Users/minhgeneralassembly/Downloads/kaggle_catsanddogs/small_training_dir/train'
validation_data_dir = '/Users/minhgeneralassembly/Downloads/kaggle_catsanddogs/small_training_dir/validation'
nb_train_samples = 1000
nb_validation_samples = 200
epochs = 30
batch_size = 16

In [None]:
# reshape array shapes as needed (tensorflow vs theano)
if K.image_data_format() == 'channels_first':
    input_shape = (3, img_width, img_height)
else:
    input_shape = (img_width, img_height, 3)

### Step 5: Define model

In [None]:
model = Sequential()
model.add(Convolution2D(32, (3, 3), input_shape=input_shape))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Convolution2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Convolution2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

In [None]:
model.summary()

In [None]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

### Step 6: Data augmentation

In [None]:
# this is the augmentation configuration we will use for training
train_datagen = ImageDataGenerator(
    rotation_range=40,
    rescale=1. / 255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True)

In [None]:
# this is the augmentation configuration we will use for testing:
# only rescaling
test_datagen = ImageDataGenerator(rescale=1. / 255)

In [None]:
train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='binary')

In [None]:
validation_generator = test_datagen.flow_from_directory(
    validation_data_dir,
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='binary')

#### Test data augmentation

In [None]:
# img = load_img('/Users/minhgeneralassembly/Downloads/kaggle_catsanddogs/small_training_dir/train/cats/cat.0.jpg')  # this is a PIL image
# x = img_to_array(img)  # this is a Numpy array with shape (3, 150, 150)
# x = x.reshape((1,) + x.shape)  # this is a Numpy array with shape (1, 3, 150, 150)

In [None]:
# i = 0
# for batch in train_datagen.flow(x, batch_size=1,
#                           save_to_dir='/Users/minhgeneralassembly/Downloads/kaggle_catsanddogs/preview', save_prefix='cat', save_format='jpeg'):
#     i += 1
#     if i > 20:
#         break  # otherwise the generator would loop indefinitely

### Step 7: Fit model

In [None]:
history = model.fit_generator(
    train_generator,
    steps_per_epoch=nb_train_samples // batch_size,
    epochs=epochs,
    validation_data=validation_generator,
    validation_steps=nb_validation_samples // batch_size)

### Step 7.5: Save model

In [None]:
model.save('catsndogs.h5')

### Step 7.9: Reload model (if needed)

In [None]:
# model = load_model('catsndogs.h5')

### Step 8: Evaluate model

In [None]:
print(history.history.keys())

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model acc')
plt.ylabel('acc')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

### Step 10: Test things

In [None]:
# used to center-crop an image before feeding it through for testing. -mt

def cropResizeImages(img):
    width, height = img.size

    if width > height:
        left = (width - height)/2
        top = (height - height)/2
        right = (width + height)/2
        bottom = (height + height)/2
        
        img = img.crop((left, top, right, bottom))
        img = img.resize([150,150])
    
    elif height > width:
        left = (width - width)/2
        top = (height - width)/2
        right = (width + width)/2
        bottom = (height + width)/2
        
        img = img.crop((left, top, right, bottom))
        img = img.resize([150,150])
        
    return img

In [None]:
# test_img = "/Users/minhgeneralassembly/Downloads/kaggle_catsanddogs/small_testing_dir/11.jpg" # cat
# test_img = "/Users/minhgeneralassembly/Downloads/kaggle_catsanddogs/small_testing_dir/27.jpg" # dog
# test_img = "/Users/minhgeneralassembly/Downloads/kaggle_catsanddogs/small_testing_dir/82.jpg" # cat
test_img = "/Users/minhgeneralassembly/Downloads/kaggle_catsanddogs/small_training_dir/train/cats/cat.100.jpg" #cat, training
# test_img = "/Users/minhgeneralassembly/Downloads/kaggle_catsanddogs/small_training_dir/train/dogs/dog.100.jpg" #dog

In [None]:
test_img = load_img(test_img)

In [None]:
test_img_cropped = cropResizeImages(test_img)
test_img_in = np.array(test_img_cropped)
plt.imshow(test_img_in);

In [None]:
preds = model.predict_classes(test_img_in[np.newaxis, :, :, :])
probs = model.predict_proba(test_img_in[np.newaxis, :, :, :])
print(preds, probs)

In [None]:
break
# less data science / machine learning below. more file manipulation. -mt

### Step 8: Run images through model and sort according to predictions

In [None]:
cleaning_dir = "/Users/minhgeneralassembly/Downloads/kaggle_catsanddogs/small_testing_dir"

In [None]:
for filename in os.listdir(cleaning_dir):
    if filename.endswith('.jpg'):
        with open(os.path.join(cleaning_dir, filename)) as f:
            content = f.read()
            print con