<a href="https://colab.research.google.com/github/monicasjsu/deep_learning/blob/master/keras_amazon_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import zipfile
import cv2
from tqdm import tqdm

import tensorflow as tf
from tensorflow import keras

import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
import os
import datetime

from tensorboard.plugins.hparams import api_pb2
from keras.callbacks import TensorBoard
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext tensorboard
from tensorboard.plugins.hparams import summary as hparams_summary
from google.protobuf import struct_pb2

Using TensorFlow backend.


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
df_train = pd.read_csv("/content/drive/My Drive/Datasets/amazon_dataset/train_v2.csv")
df_train.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [4]:
df_test = pd.read_csv("/content/drive/My Drive/Datasets/amazon_dataset/sample_submission_v2.csv")
df_test.head()

Unnamed: 0,image_name,tags
0,test_0,primary clear agriculture road water
1,test_1,primary clear agriculture road water
2,test_2,primary clear agriculture road water
3,test_3,primary clear agriculture road water
4,test_4,primary clear agriculture road water


In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
link = 'https://drive.google.com/open?id=1SCTIHfKATPoStPtXXs7QLLZysMIzHIFs'
fluff, id = link.split('=')

downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('train-jpg.zip')

with zipfile.ZipFile('train-jpg.zip', 'r') as zip_ref:
    zip_ref.extractall()


In [0]:
x_train = []
y_train = []

labels = df_train['tags'].str.get_dummies(sep=' ').columns

In [7]:
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}
inv_label_map

{0: 'agriculture',
 1: 'artisinal_mine',
 2: 'bare_ground',
 3: 'blooming',
 4: 'blow_down',
 5: 'clear',
 6: 'cloudy',
 7: 'conventional_mine',
 8: 'cultivation',
 9: 'habitation',
 10: 'haze',
 11: 'partly_cloudy',
 12: 'primary',
 13: 'road',
 14: 'selective_logging',
 15: 'slash_burn',
 16: 'water'}

In [8]:
for f, tags in tqdm(df_train.values, miniters=1000):
    img = cv2.imread('train-jpg/{}.jpg'.format(f))
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    x_train.append(cv2.resize(img, (32, 32)))
    y_train.append(targets)

x_train = np.array(x_train, np.float16) / 255.
y_train = np.array(y_train, np.uint8)

100%|██████████| 40479/40479 [00:48<00:00, 843.23it/s]


In [0]:
link = 'https://drive.google.com/open?id=1Mho1wcWEPSb32_uevJfmrUYQFSD9921G'
fluff, id = link.split('=')

downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('test-jpg.zip')

with zipfile.ZipFile('test-jpg.zip', 'r') as zip_ref:
    zip_ref.extractall()

In [0]:
x_test = []
y_test = []

In [11]:
for f, tags in tqdm(df_train.values, miniters=1000):
  try:
    img = cv2.imread('test-jpg/{}.jpg'.format(f))
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    x_test.append(cv2.resize(img, (32, 32)))
    y_test.append(targets)
  except:
    continue
    
x_test = np.array(x_test, np.float16) / 255.
y_test = np.array(y_test, np.uint8)

100%|██████████| 40479/40479 [00:02<00:00, 15689.86it/s]


In [12]:
dataSplit = 35000
x_train, x_valid, y_train, y_valid = x_train[:dataSplit], x_train[dataSplit:], y_train[:dataSplit], y_train[dataSplit:]
print(x_train.shape, x_valid.shape, y_train.shape, y_valid.shape)

(35000, 32, 32, 3) (5479, 32, 32, 3) (35000, 17) (5479, 17)


In [0]:
def amazon_model():
  model_1 = Sequential()
  model_1.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(32, 32, 3)))
  model_1.add(Conv2D(64, (3, 3), activation='relu'))
  model_1.add(MaxPooling2D(pool_size=(2, 2)))
  model_1.add(Dropout(0.25))
  model_1.add(Flatten())
  model_1.add(Dense(128, activation='relu'))
  model_1.add(Dropout(0.5))
  model_1.add(Dense(17, activation='sigmoid'))
  return model_1


In [14]:
model = amazon_model()
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 30, 30, 32)        896       
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 28, 28, 64)        18496     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 14, 14, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 14, 14, 64)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 12544)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1605760   
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)              

In [0]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
model.fit(x_train, y_train, batch_size=128, epochs=3, verbose=1, validation_data=(x_valid, y_valid))

Train on 35000 samples, validate on 5479 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7f01d04c8f60>

In [0]:
from keras.callbacks import TensorBoard

import tensorflow as tf
import datetime

!rm -rf ./logs/ 

In [0]:
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(log_dir=logdir, histogram_freq=1)

In [0]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
model.fit(x_train, y_train, batch_size=128, epochs=3, verbose=1, validation_data=(x_valid, y_valid))

Train on 35000 samples, validate on 5479 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7f01d03e9da0>

Trying Hyperparameter Tuning by adding extra layers

In [0]:
def amazon_model_2():
  model_2 = Sequential()
  model_2.add(Conv2D(64, (3, 3), input_shape=(32, 32, 3),activation='relu'))
  model_2.add(MaxPooling2D(pool_size=(2, 2)))
  model_2.add(Conv2D(32, (3, 3), activation='relu'))
  model_2.add(MaxPooling2D(pool_size=(2, 2)))
  model_2.add(Flatten())
  model_2.add(Dense(units=128, activation='relu'))
  model_2.add(Dropout(0.5))
  model_2.add(Dense(units=128, activation='relu'))
  model_2.add(Dropout(0.3))
  model_2.add(Dense(units=17, activation='sigmoid'))
  return model_2

In [22]:
model_2 = amazon_model_2()
model_2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 30, 30, 64)        1792      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 15, 15, 64)        0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 13, 13, 32)        18464     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 6, 6, 32)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 1152)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               147584    
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)              

In [23]:
model_2.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
model_2.fit(x_train, y_train,batch_size=128,epochs=5,verbose=1,validation_data=(x_valid, y_valid),callbacks=[tensorboard_callback])

Train on 35000 samples, validate on 5479 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.callbacks.History at 0x7f01d01a36d8>

In [24]:
model_3 = amazon_model_2()
model_3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_3.fit(x_train, y_train,batch_size=128,epochs=10,verbose=1,validation_data=(x_valid, y_valid),callbacks=[tensorboard_callback])

Train on 35000 samples, validate on 5479 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f0180248e80>

In [0]:
def amazon_model_3():
  model_4 = Sequential()
  model_4.add(Conv2D(64, (3, 3), input_shape=(32, 32, 3),activation='relu'))
  model_4.add(MaxPooling2D(pool_size=(2, 2)))
  model_4.add(Conv2D(32, (3, 3), activation='relu'))
  model_4.add(MaxPooling2D(pool_size=(2, 2)))
  model_4.add(Flatten())
  model_4.add(Dense(units=128, activation='selu'))
  model_4.add(Dropout(0.5))
  model_4.add(Dense(units=128, activation='relu'))
  model_4.add(Dropout(0.3))
  model_4.add(Dense(units=17, activation='sigmoid'))
  return model_4


In [26]:
model_4 = amazon_model_3()
model_4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_4.fit(x_train, y_train,batch_size=128,epochs=10,verbose=1,validation_data=(x_valid, y_valid),callbacks=[tensorboard_callback])

Train on 35000 samples, validate on 5479 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x7f01698483c8>

In [29]:
!tensorboard dev upload --logdir logs

2020-05-16 07:51:01.952831: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
Data for the "graphs" plugin is now uploaded to TensorBoard.dev! Note that uploaded data is public. If you do not want to upload data for this plugin, use the "--plugins" command line argument.
Upload started and will continue reading any new data as it's added
to the logdir. To stop uploading, press Ctrl-C.
View your TensorBoard live at: https://tensorboard.dev/experiment/BKgpZs2vT1CjBgC1t5V4iw/
E0516 07:51:04.951310 139860142589824 uploader.py:770] Attempted to re-upload existing blob.  Skipping.
E0516 07:51:05.951343 139860142589824 uploader.py:770] Attempted to re-upload existing blob.  Skipping.


Upload stopped. View your TensorBoard at https://tensorboard.dev/experiment/BKgpZs2vT1CjBgC1t5V4iw/
Traceback (most recent call last):
  File "/usr/local/bin/tensorboard", line 8, in <module>
    sys.exit(run_main())
  File "/usr/local/lib/pyth

In [30]:
!tensorboard dev update-metadata --experiment_id 'BKgpZs2vT1CjBgC1t5V4iw' --name "Keras_amazon_model" --description "Model tuning by trying different hyperparameters"

2020-05-16 07:58:06.534850: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
Data for the "graphs" plugin is now uploaded to TensorBoard.dev! Note that uploaded data is public. If you do not want to upload data for this plugin, use the "--plugins" command line argument.
