In [1]:
%autosave 1000000

Autosaving every 1000000 seconds


In [2]:
import numpy as np
np.random.seed(2016)

import os
import glob
import cv2
import math
import pickle
import datetime
import pandas as pd
#import statistics
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.applications import VGG16,VGG19
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.models import model_from_json
from sklearn.metrics import log_loss
from scipy.misc import imread, imresize

Using TensorFlow backend.


In [3]:
def get_im_cv2(path, img_rows, img_cols, color_type=3):
    # Load as grayscale
    if color_type == 1:
        img = cv2.imread(path, 0)
    elif color_type == 3:
        img = cv2.imread(path)
    # Reduce size
    resized = cv2.resize(img, (img_cols, img_rows))
    return resized

In [4]:
def load_train(img_rows, img_cols, color_type=3,num_files_to_read=-1):
    X_train = []
    y_train = []
    
    print('Read train images')
    for j,target_type in enumerate(['cat','dog']):
        counter = 0
        print('Load folder Type_{}'.format(j))
        path = os.path.join('..', 'input', 'train',target_type+'*.jpg')
        files = glob.glob(path)
        for fl in files:
            flbase = os.path.basename(fl)
            img = get_im_cv2(fl, img_rows, img_cols, color_type)
            X_train.append(np.asarray(img))
            y_train.append(j)
            counter+=1
            if (counter>=num_files_to_read)&(num_files_to_read>0):
                break
    

    return np.array(X_train), np.array(y_train)

In [5]:
def cache_data(data, path):
    if os.path.isdir(os.path.dirname(path)):
        file = open(path, 'wb')
        pickle.dump(data, file)
        file.close()
    else:
        print('Directory doesnt exists')

In [6]:
def restore_data(path):
    data = dict()
    if os.path.isfile(path):
        file = open(path, 'rb')
        data = pickle.load(file)
    return data

In [13]:
img_rows = 224
img_cols = 224
num_samples = 100
read_from_cache = True
color_type_global = 3

In [14]:
%%time
if not read_from_cache:
    X_train, y_train = load_train(img_rows,img_cols,3,num_samples)
    cache_data(X_train,'../processed_input/X_train_{}X{}X{}_{}_max_samples'.format(img_rows,img_cols,
                                                                                   color_type_global,num_samples))
    cache_data(y_train,'../processed_input/y_train_{}_max_samples'.format(num_samples))
else:    
    X_train = restore_data('../processed_input/X_train_{}X{}X{}_{}_max_samples'.format(img_rows,img_cols,
                                                                               color_type_global,num_samples))
    y_train = restore_data('../processed_input/y_train_{}_max_samples'.format(num_samples))

CPU times: user 0 ns, sys: 30 ms, total: 30 ms
Wall time: 97.4 ms


In [15]:
print (X_train.shape)
print (y_train.shape)

(200, 224, 224, 3)
(200,)


In [17]:
from keras.applications.vgg16 import VGG16
from keras.preprocessing import image
from keras.applications.vgg16 import preprocess_input
from keras.layers import Input, Flatten, Dense
from keras.models import Model

#model = create_model(img_rows, img_cols, color_type_global)
# Generate a model with all layers (with top)
vgg16 = VGG16(weights='imagenet', include_top=True)
vgg16.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [18]:
for l in vgg16.layers:
    l.trainable = False

vgg16.summary()   
    
#vgg16.layers['predictions'].trainable = True

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [28]:
x = (Dense(2,activation='softmax'))(vgg16.output)
model = Model(inputs = vgg16.input,outputs = x)
#sgd = SGD(lr=0.001, decay=1e-5, momentum=0.95, nesterov=True)
from keras.optimizers import adadelta
model.compile(optimizer='adadelta',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [23]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [30]:
from keras.utils.np_utils import to_categorical
OHE_y_train = to_categorical(y_train)
model.fit(X_train,OHE_y_train,validation_split=0.2, epochs=5)

Train on 160 samples, validate on 40 samples
Epoch 1/5


KeyboardInterrupt: 

In [31]:
#Add a layer where input is the output of the  second last layer 
x = (vgg16.layers[-4].output)

#Then create the corresponding model 
model = Model(inputs=vgg16.input, outputs=x)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [32]:
%%time
from keras.utils import to_categorical
from sklearn.metrics import log_loss

sgd = SGD(lr=0.001, decay=1e-5, momentum=0.95, nesterov=True)

# for l in vgg16.layers:
#     l.trainable = False

#vgg16.layers['predictions'].trainable = True

model.compile(optimizer=sgd,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# we are not going to train the model so we will not run the next line 
# it is only here for context of regular training / fine tuning of pretrained network
#model.fit(X_train, to_categorical(y_train-1,3),validation_split=0.2,shuffle=True, batch_size=batch_size, epochs=epochs, verbose=1)


CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 23.7 ms


In [33]:
%%time
pred_train = model.predict(X_train)


CPU times: user 4min 47s, sys: 13.8 s, total: 5min 1s
Wall time: 1min 17s


In [35]:
np.array(pred_train).shape

(200, 25088)

In [None]:
cache_data(pred_train,'../processed_input/processed_{}_first_imgs.pkl'.format(num_samples))

In [10]:
pred_train = restore_data('../processed_input/processed_{}_first_imgs.pkl'.format(num_samples))

In [12]:
np.array(pred_train).shape
train_header = ['f_'+str(x) for x in range(25088)]
train_header
tr_data = pd.DataFrame(pred_train)
tr_data.columns = train_header
tr_data

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_25078,f_25079,f_25080,f_25081,f_25082,f_25083,f_25084,f_25085,f_25086,f_25087
0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,3.515611,6.830776,0.000000,0.000000,0.000000,0.000000,0.000000,9.167677,6.669906,0.000000
1,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,3.782408,0.000000,0.000000,...,0.000000,0.000000,22.248518,8.865723,6.081223,0.000000,0.000000,0.000000,0.000000,0.000000
2,13.179482,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,6.857530,0.000000,0.000000,0.000000,0.000000,0.000000,7.911310,0.000000,0.000000,0.000000
3,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,29.819248,0.000000
4,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,3.748768,3.834193,0.000000,0.000000,0.000000,0.000000
5,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,17.548622,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8.200901,16.240976,0.000000
6,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,15.616196,0.000000,5.594573,22.143215,25.966610,0.000000,2.405039,0.000000,4.072012,0.000000
8,1.141834,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,6.788096,0.000000,0.000000,0.000000,2.386732,0.000000,0.000000,17.022707,0.000000,0.000000
9,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,29.381153,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,14.436209,0.000000


In [18]:
from xgboost import XGBClassifier

params = {'objective':"binary:logistic",
          'learning_rate':0.1,
          'subsample':0.7,
          'colsample_bytree':0.9,
          'colsample_bylevel':0.7,
          'max_depth':6,
          'nthread':4,
          'eval_metric':'auc',
          'seed':1234}

bst_cv = xgb.cv(params=params,dtrain=xgb.DMatrix(tr_data,label=y_train),verbose_eval=1,
                nfold=5,early_stopping_rounds=20,num_boost_round=300)

XGBoostError: b'[01:54:59] src/metric/metric.cc:21: Unknown metric function auc\n\nStack trace returned 10 entries:\n[bt] (0) /opt/conda/lib/python3.6/site-packages/xgboost-0.6-py3.6.egg/xgboost/libxgboost.so(_ZN7xgboost6Metric6CreateERKSs+0x59d) [0x7fc857e7b8fd]\n[bt] (1) /opt/conda/lib/python3.6/site-packages/xgboost-0.6-py3.6.egg/xgboost/libxgboost.so(_ZN7xgboost11LearnerImpl9ConfigureERKSt6vectorISt4pairISsSsESaIS3_EE+0x7dd) [0x7fc857e62f2d]\n[bt] (2) /opt/conda/lib/python3.6/site-packages/xgboost-0.6-py3.6.egg/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x5e) [0x7fc857fe3eee]\n[bt] (3) /opt/conda/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(ffi_call_unix64+0x4c) [0x7fc8b0f1c550]\n[bt] (4) /opt/conda/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(ffi_call+0x1f5) [0x7fc8b0f1bcf5]\n[bt] (5) /opt/conda/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x3dc) [0x7fc8b0f1383c]\n[bt] (6) /opt/conda/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x9da3) [0x7fc8b0f0bda3]\n[bt] (7) /opt/conda/bin/../lib/libpython3.6m.so.1.0(_PyObject_FastCallDict+0x9e) [0x7fc8b798c92e]\n[bt] (8) /opt/conda/bin/../lib/libpython3.6m.so.1.0(+0x147d1b) [0x7fc8b7a68d1b]\n[bt] (9) /opt/conda/bin/../lib/libpython3.6m.so.1.0(_PyEval_EvalFrameDefault+0x26fd) [0x7fc8b7a6bbbd]\n'

In [None]:
plt.plot(bst_cv.loc[:,['train-mlogloss-mean','test-mlogloss-mean']])

In [33]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

knn4 = KNeighborsClassifier(n_neighbors= 4,n_jobs=4,)
X_train, X_val, y_tr, y_val = train_test_split(tr_data,y_train,test_size = 0.2,random_state = 2017)
knn4.fit(X_train,y_tr)

from sklearn.metrics import roc_auc_score
print('auc score: {}'.format(roc_auc_score(y_val,knn4.predict(X_val))))


auc score: 0.8634113411341134
CPU times: user 30.2 s, sys: 160 ms, total: 30.4 s
Wall time: 9.24 s


In [43]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

knn16 = KNeighborsClassifier(n_neighbors= 8,n_jobs=4)
X_train, X_val, y_tr, y_val = train_test_split(tr_data,y_train,test_size = 0.2,random_state = 2017)
knn16.fit(X_train,y_tr)

from sklearn.metrics import roc_auc_score
print('auc score: {}'.format(roc_auc_score(y_val,knn16.predict(X_val))))


RuntimeError: module compiled against API version 0xb but this version of numpy is 0xa

ImportError: numpy.core.multiarray failed to import